# Setup

On Google Colab, you have to restart the runtime after running the following cell.

In [9]:
!pip install omegaconf



In [10]:
import sys

from google.colab import drive

# Make Google Drive visible under /content/drive/.
drive.mount("/content/drive/")

# Root of the NN-kNN project on the mounted drive.
# Alternative location: "/content/drive/My Drive/NN-kNN/"
folder_name = "/content/drive/Othercomputers/My MacBook Pro/GitHub/NN-kNN/"

# Put the project root first on the import path so the `dataset` and `model`
# packages imported further down resolve to this checkout.
sys.path.insert(0, folder_name)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [12]:
# Added because Rdata relies on Cdata for the covid data set: it loads the
# data through Cdata and then converts it to a regression problem.
import os
import sys

# Derive the dataset directory from folder_name instead of repeating the
# absolute path, so the project root only has to be changed in one place.
sys.path.append(os.path.join(folder_name, 'dataset'))


In [None]:
# folder_name = os.getcwd()

In [13]:
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from omegaconf import DictConfig, OmegaConf

from dataset import cls_small_data as Cdata
import model.cls_model as Cmodel
from dataset import cls_medium_data

from dataset import reg_data as Rdata
import model.reg_model as Rmodel

In [14]:
# Load the project configuration from the project root; the per-dataset
# settings used below are read from its 'dataset' section.
conf_file = OmegaConf.load(os.path.join(folder_name, 'config.yaml'))

In [15]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NCA and LMNN setup

In [16]:
pip install metric-learn

Collecting metric-learn
  Downloading metric_learn-0.7.0-py2.py3-none-any.whl (67 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: metric-learn
Successfully installed metric-learn-0.7.0


In [17]:
import metric_learn
from metric_learn import LMNN,NCA

# Data Sets

Supported small datasets for classification:  
'zebra',
'zebra_special',
'bal',
'digits',
'iris',
'wine',
'breast_cancer',

For regression:
'califonia_housing',
'abalone',
'diabets',
'body_fat',
'ziweifaces'


Newly added data sets for mental health (psychology):

Classification:
'psych_depression_physical_symptons',
'covid_anxious',
'covid_depressed'


In [25]:
dataset_name = 'covid_anxious'
cfg = conf_file['dataset'][dataset_name]

# Names handled by the small-classification loader.
small_cls_names = (
    'covid_depressed', 'covid_anxious', 'psych_depression_physical_symptons',
    'zebra', 'zebra_special', 'bal', 'digits', 'iris', 'wine', 'breast_cancer',
)
# No names are currently routed to the medium-classification loader.
medium_cls_names = ()

# Pick the loss and the loader together; any name not listed above falls
# through to the regression loader with an MSE loss.
if dataset_name in small_cls_names:
    criterion, load_dataset = torch.nn.CrossEntropyLoss(), Cdata.Cls_small_data
elif dataset_name in medium_cls_names:
    criterion, load_dataset = torch.nn.CrossEntropyLoss(), cls_medium_data.Cls_medium_data
else:
    criterion, load_dataset = torch.nn.MSELoss(), Rdata.Reg_data

Xs, ys = load_dataset(dataset_name)

Columns in the dataset: Index(['SU_ID', 'P_PANEL', 'NATIONAL_WEIGHT', 'REGION_WEIGHT',
       'NATIONAL_WEIGHT_POP', 'REGION_WEIGHT_POP', 'NAT_WGT_COMB_POP',
       'REG_WGT_COMB_POP', 'P_GEO', 'SOC1',
       ...
       'REGION9', 'P_DENSE', 'MODE', 'LANGUAGE', 'MAIL50', 'RACE1_BANNER',
       'RACE2_BANNER', 'INC_BANNER', 'AGE_BANNER', 'HH_BANNER'],
      dtype='object', length=177)


In [24]:
# Reload an already-imported project module so that edits to its source take
# effect without restarting the runtime. For example, after changing
# model.cls_model, run importlib.reload(Cmodel) (given that it was imported
# as `import model.cls_model as Cmodel`).
import importlib
importlib.reload(Cdata)

<module 'dataset.cls_small_data' from '/content/drive/Othercomputers/My MacBook Pro/GitHub/NN-kNN/dataset/cls_small_data.py'>

# Classification with NNKNN

In [26]:
# Sanity check of the loaded data: the distinct target values with how often
# each occurs, and the size of the feature matrix.

unique_values, counts = np.unique(ys, return_counts=True)
print(f"Unique values: {unique_values}")
print(f"Counts: {counts}")
print(f"Xs.size(): {Xs.size()}")


Unique values: [0 1 2 3]
Counts: [1651 1651 1651 1651]
Xs.size(): torch.Size([6604, 161])


In [27]:
def train_cls(X_train,y_train, X_test, y_test, cfg:DictConfig):
  """Train an NN-kNN classifier on one split and return it at its best checkpoint.

  After every epoch the model is scored on (X_test, y_test). Whenever the
  accuracy improves, the weights are written to cfg.PATH. Training stops early
  once the accuracy has failed to improve for more than cfg.patience
  consecutive checks. Before returning, the best saved weights are loaded back
  into the model so that the returned model matches the returned accuracy.

  NOTE(review): checkpoint selection and early stopping are driven by the test
  split, so the reported accuracy is optimistically biased; a separate
  validation split would avoid this.

  Args:
    X_train, y_train: training features / labels (moved to `device`).
    X_test, y_test: evaluation features / labels.
    cfg: settings read here: batch_size, ca_weight_sharing, top_case_enabled,
      top_k, discount, learning_rate, training_epochs, patience, PATH.

  Returns:
    (best_accuracy, model). best_accuracy is None when cfg.training_epochs is 0.
  """
  X_train = X_train.to(device)
  y_train = y_train.to(device)
  X_test = X_test.to(device)
  # accuracy_score needs host-side labels; convert once instead of every epoch.
  y_test_np = y_test.cpu().numpy()

  train_loader = torch.utils.data.DataLoader(
      torch.utils.data.TensorDataset(X_train, y_train),
      batch_size=cfg.batch_size,
      shuffle=True)

  model = Cmodel.NN_k_NN(X_train,
                         y_train,
                         cfg.ca_weight_sharing,
                         cfg.top_case_enabled,
                         cfg.top_k,
                         cfg.discount,
                         device=device)

  optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate) #, weight_decay=1e-5)

  best_accuracy = None
  patience_counter = 0
  for epoch in range(cfg.training_epochs):
    # Only the first batch of every second epoch is logged.
    epoch_msg = (epoch + 1) % 2 == 0

    model.train()
    for X_train_batch, y_train_batch in train_loader:
      _, _, output, _ = model(X_train_batch)
      loss = criterion(output, y_train_batch)

      # Backward and optimize
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if epoch_msg:
        print(f'Epoch [{epoch + 1}/{cfg.training_epochs}], Loss: {loss.item():.4f}')
        epoch_msg = False

    model.eval()
    with torch.no_grad():
      _, _, _, predicted_class = model(X_test)
    accuracy_temp = accuracy_score(y_test_np, predicted_class.cpu().numpy())

    if best_accuracy is None or accuracy_temp > best_accuracy:
      # Memorize the best model seen so far.
      torch.save(model.state_dict(), cfg.PATH)
      best_accuracy = accuracy_temp
      patience_counter = 0
    elif patience_counter > cfg.patience:
      print("patience exceeded, loading best model")
      break
    else:
      patience_counter += 1

  # Restore the best checkpoint; otherwise the returned model would carry the
  # weights of the last epoch rather than those that produced best_accuracy.
  if best_accuracy is not None:
    model.load_state_dict(torch.load(cfg.PATH, map_location=device))
  model.eval()

  return best_accuracy, model

In [None]:
def load_model(X_train,y_train,cfg):
  """Rebuild an NN-kNN classifier and load the checkpoint stored at cfg.PATH.

  The model is constructed with the same arguments train_cls uses, so
  X_train / y_train must be the data the checkpoint was trained with for the
  saved state to fit.

  Args:
    X_train, y_train: training features / labels passed to the model constructor.
    cfg: settings read here: ca_weight_sharing, top_case_enabled, top_k,
      discount, PATH.

  Returns:
    The model on `device`, in eval mode, with the saved weights loaded.
  """
  # Define the model architecture
  model = Cmodel.NN_k_NN(
      X_train,
      y_train,
      cfg.ca_weight_sharing,
      cfg.top_case_enabled,
      cfg.top_k,
      cfg.discount,
      device=device
  )
  # Load the state dictionary. The checkpoint location is stored under
  # cfg.PATH (the key the training cells write to), not cfg.path.
  # map_location lets a checkpoint saved on a GPU be loaded on a CPU runtime.
  model.load_state_dict(torch.load(cfg.PATH, map_location=device))
  model.to(device)
  model.eval()
  return model

In [None]:
# 10-fold cross-validation comparing NN-kNN with plain kNN and, optionally,
# kNN on a learned metric (LMNN by default).
accuracies = []
knn_accuracies = []
lmnn_accuracies = []
PATH = os.path.join(folder_name, f'checkpoints/classifier_{dataset_name}.h5')
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
cfg.PATH = PATH
# random_state=None: the folds differ from run to run.
k_fold = KFold(n_splits=10, shuffle=True, random_state = None)
enable_lmnn = True

for train_index, test_index in k_fold.split(Xs):
  # Get training and testing data
  X_train, X_test = Xs[train_index], Xs[test_index]
  y_train, y_test = ys[train_index], ys[test_index]
  if enable_lmnn:
    # https://contrib.scikit-learn.org/metric-learn/supervised.html#lmnn
    lmnn = LMNN(n_neighbors=5, learn_rate=1e-6)
    ##TODO, change here if you need to use a different one
    # lmnn = metric_learn.MLKR()
    # lmnn = metric_learn.NCA(max_iter=1000)
    lmnn.fit(X_train,y_train)
    # Kept under its own name so that `knn` always refers to the plain
    # baseline that later cells inspect.
    lmnn_knn = KNeighborsClassifier(n_neighbors=5,metric=lmnn.get_metric())
    lmnn_knn.fit(X_train,y_train)
    lmnn_acc = accuracy_score(y_test, lmnn_knn.predict(X_test))
    lmnn_accuracies.append(lmnn_acc)

  knn =  KNeighborsClassifier(n_neighbors=cfg.top_k)
  knn.fit(X_train, y_train)
  knn_acc  = accuracy_score(y_test, knn.predict(X_test))
  knn_accuracies.append(knn_acc)

  best_accuracy, model = train_cls(X_train,y_train, X_test, y_test, cfg)
  accuracies.append(best_accuracy)

print(f"Average accuracy:{np.mean(accuracies):.3f}")
print(f"KNN accuracy:{np.mean(knn_accuracies):.3f}")
# Skip the line when the metric-learning baseline was disabled: the mean of
# an empty list is nan and triggers a RuntimeWarning.
if lmnn_accuracies:
  print(f"LMNN/NCA accuracy:{np.mean(lmnn_accuracies):.3f}")


Epoch [2/1000], Loss: 1.3931
Epoch [4/1000], Loss: 1.3770
Epoch [6/1000], Loss: 1.3782
Epoch [8/1000], Loss: 1.3675
Epoch [10/1000], Loss: 1.3548
Epoch [12/1000], Loss: 1.3700
Epoch [14/1000], Loss: 1.3753
Epoch [16/1000], Loss: 1.3764
Epoch [18/1000], Loss: 1.3417
Epoch [20/1000], Loss: 1.3732
Epoch [22/1000], Loss: 1.3644
Epoch [24/1000], Loss: 1.3571
Epoch [26/1000], Loss: 1.3610
Epoch [28/1000], Loss: 1.3416
Epoch [30/1000], Loss: 1.3497
Epoch [32/1000], Loss: 1.3425
Epoch [34/1000], Loss: 1.3684
Epoch [36/1000], Loss: 1.3964
Epoch [38/1000], Loss: 1.3569
Epoch [40/1000], Loss: 1.3352
Epoch [42/1000], Loss: 1.3492
Epoch [44/1000], Loss: 1.3216
Epoch [46/1000], Loss: 1.3089
Epoch [48/1000], Loss: 1.3468
Epoch [50/1000], Loss: 1.3180
Epoch [52/1000], Loss: 1.3358
Epoch [54/1000], Loss: 1.3742
Epoch [56/1000], Loss: 1.3566
Epoch [58/1000], Loss: 1.3592
Epoch [60/1000], Loss: 1.3237
Epoch [62/1000], Loss: 1.3161
Epoch [64/1000], Loss: 1.3317
Epoch [66/1000], Loss: 1.2911


# Regression with NNKNN

In [None]:
# Same sanity check as in the classification section: distinct target values
# with their counts, plus the size of the feature matrix.
# NOTE(review): the recorded output below (float targets, 162 features) implies
# that Xs / ys were reloaded through the regression loader before this cell
# ran, whereas the dataset cell as written routes 'covid_anxious' to the
# classification loader. Confirm which cell has to be re-run first.
unique_values, counts = np.unique(ys, return_counts=True)
print(f"Unique values: {unique_values}")
print(f"Counts: {counts}")
print(f"Xs.size(): {Xs.size()}")

Unique values: [-1.3415393  -0.44717973  0.44717973  1.3415393 ]
Counts: [1651 1651 1651 1651]
Xs.size(): torch.Size([6604, 162])


In [None]:
def train_reg(X_train,y_train, X_test, y_test, cfg:DictConfig):
  """Train an NN-kNN regressor on one split and evaluate its best checkpoint.

  After every epoch the model is scored on (X_test, y_test) with the global
  `criterion`; lower is better. Whenever the score improves, the weights are
  written to cfg.PATH. Training stops early once the score has failed to
  improve for more than cfg.patience consecutive checks. The best saved
  weights are then loaded back before the final evaluation.

  NOTE(review): checkpoint selection and early stopping are driven by the test
  split, so the reported errors are optimistically biased.

  Args:
    X_train, y_train: training features / targets (moved to `device`).
    X_test, y_test: evaluation features / targets.
      Assumes one scalar target per row (1-D y) -- TODO confirm.
    cfg: settings read here: batch_size, ca_weight_sharing, top_case_enabled,
      top_k, discount, class_weight_sharing, learning_rate, training_epochs,
      patience, PATH.

  Returns:
    (best_accuracy, accuracy, top_k_average_accuracy, model) where
    best_accuracy is the lowest per-epoch test loss (None for zero epochs),
    accuracy is the test loss of the returned model, and
    top_k_average_accuracy is the MSE obtained by predicting the mean target
    of the 5 most activated cases.
  """
  X_train = X_train.to(device)
  y_train = y_train.to(device)
  X_test = X_test.to(device)
  # Predictions are gathered on the CPU, so compare against CPU targets.
  y_test = y_test.cpu()

  train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_train, y_train), batch_size=cfg.batch_size, shuffle=True)
  # Only the features are needed for evaluation; targets are compared in bulk.
  test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test), batch_size=cfg.batch_size, shuffle=False)

  # Train model
  model = Rmodel.NN_k_NN_regression(X_train,
                                    y_train,
                                    cfg.ca_weight_sharing,
                                    cfg.top_case_enabled,
                                    cfg.top_k,
                                    cfg.discount,
                                    cfg.class_weight_sharing,
                                    device=device)

  optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate) #, weight_decay=1e-5)

  best_accuracy = None
  patience_counter = 0
  for epoch in range(cfg.training_epochs):
    # Only the first batch of every second epoch is logged.
    epoch_msg = (epoch + 1) % 2 == 0

    model.train()
    for X_train_batch, y_train_batch in train_loader:
      _, _, _, predicted_number = model(X_train_batch)
      # reshape(-1) rather than squeeze(): squeeze() collapses a batch of one
      # sample to a 0-d tensor.
      loss = criterion(predicted_number.reshape(-1), y_train_batch)
      # Backward and optimize
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if epoch_msg:
        epoch_msg = False
        print(f'Epoch [{epoch + 1}/{cfg.training_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    with torch.no_grad():
      batch_predictions = []
      for (X_test_batch,) in test_loader:
        _, _, _, predicted_number = model(X_test_batch)
        # A 0-d tensor (final batch of size 1 after squeeze) cannot be
        # iterated or concatenated, so flatten explicitly.
        batch_predictions.append(predicted_number.reshape(-1).cpu())

      predicted_numbers = torch.cat(batch_predictions)
      accuracy_temp = criterion(predicted_numbers, y_test)

    if best_accuracy is None or accuracy_temp < best_accuracy:
      torch.save(model.state_dict(), cfg.PATH)
      best_accuracy = accuracy_temp
      patience_counter = 0
    elif patience_counter > cfg.patience:
      print("patience exceeded, loading best model")
      break
    else:
      patience_counter += 1

  # Restore the best checkpoint; otherwise the final evaluation and the
  # returned model would use the weights of the last epoch.
  if best_accuracy is not None:
    model.load_state_dict(torch.load(cfg.PATH, map_location=device))
  model.eval()
  # Pure evaluation: no autograd graph is needed for the final forward pass.
  with torch.no_grad():
    _, case_activations, _, predicted_number = model(X_test)

  # NOTE(review): the neighbourhood size is fixed at 5 here, independent of
  # cfg.top_k -- confirm this is intended.
  top_case_indices = torch.topk(case_activations, 5, dim=1)[1].cpu()

  accuracy = criterion(predicted_number.reshape(-1).cpu(), y_test)
  y_train = y_train.cpu()
  # Presumably column i of case_activations corresponds to training row i,
  # since the indices are used to look up y_train -- verify against the model.
  top_k_average_accuracy = mean_squared_error(y_test, torch.mean(y_train[top_case_indices], dim=1))

  return best_accuracy, accuracy, top_k_average_accuracy, model

In [None]:
# 10-fold cross-validation comparing the NN-kNN regressor with a plain kNN
# regressor. Every "accuracy" collected here is an error value (MSE from
# sklearn, or the loss returned by train_reg), so lower is better.
best_accuracies = []
accuracies = []
top_k_average_accuracies = []
knn_accuracies = []
PATH = os.path.join(folder_name, f'checkpoints/regression_{dataset_name}.h5')
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
cfg.PATH = PATH
# random_state=None: the folds differ from run to run.
k_fold = KFold(n_splits=10, shuffle = True,random_state = None)


for train_index, test_index in k_fold.split(Xs):
  # Get training and testing data
  X_train, X_test = Xs[train_index], Xs[test_index]
  y_train, y_test = ys[train_index], ys[test_index]

  knn = KNeighborsRegressor(n_neighbors=cfg.top_k)
  knn.fit(X_train, y_train)
  knn_accuracies.append(mean_squared_error(y_test, knn.predict(X_test)))

  best_accuracy, accuracy, top_k_average_accuracy, model= train_reg(X_train, y_train, X_test, y_test, cfg)
  best_accuracies.append(best_accuracy)
  accuracies.append(accuracy)
  top_k_average_accuracies.append(top_k_average_accuracy)

# train_reg returns its loss values as tensors; detach before handing to numpy.
print("Average accuracy:", np.mean([acc.detach().numpy() for acc in accuracies]))
print("Average top_k_average_accuracies", np.mean(top_k_average_accuracies))
print("KNN accuracy:", np.mean(knn_accuracies))

Epoch [2/1000], Loss: 1.1363
Epoch [4/1000], Loss: 1.0000
Epoch [6/1000], Loss: 0.9063
Epoch [8/1000], Loss: 0.9325
Epoch [10/1000], Loss: 1.0979
Epoch [12/1000], Loss: 1.0777
Epoch [14/1000], Loss: 1.0533
Epoch [16/1000], Loss: 1.0521
Epoch [18/1000], Loss: 0.9599
Epoch [20/1000], Loss: 1.2118
Epoch [22/1000], Loss: 1.0878
Epoch [24/1000], Loss: 1.0351
Epoch [26/1000], Loss: 0.9030
Epoch [28/1000], Loss: 0.9901
Epoch [30/1000], Loss: 1.0564
Epoch [32/1000], Loss: 1.0790
Epoch [34/1000], Loss: 1.1429
Epoch [36/1000], Loss: 1.1163
Epoch [38/1000], Loss: 1.1742
Epoch [40/1000], Loss: 0.9303
Epoch [42/1000], Loss: 0.8624
Epoch [44/1000], Loss: 0.9859
Epoch [46/1000], Loss: 0.9060
Epoch [48/1000], Loss: 0.9122
Epoch [50/1000], Loss: 0.9942
Epoch [52/1000], Loss: 0.8828
Epoch [54/1000], Loss: 0.8170
Epoch [56/1000], Loss: 0.8536
Epoch [58/1000], Loss: 0.9334
Epoch [60/1000], Loss: 1.0324
Epoch [62/1000], Loss: 0.8176
Epoch [64/1000], Loss: 0.9927
Epoch [66/1000], Loss: 0.9490
Epoch [68/1000

# Results Interpretation

In [None]:
def print_model_features(input_model):
  """Print the name and current values of every parameter of `input_model`.

  Args:
    input_model: any object exposing named_parameters() (e.g. a torch.nn.Module).
  """
  # Iterate over the argument, not the notebook-global `model`, so the
  # function reports on whichever model it is given.
  for n, p in input_model.named_parameters():
    print(n)
    print(p.data)

In [None]:
print_model_features(model)

In [None]:
# Forward pass of the most recently trained model on the last fold's test split.
# Both models return a 4-tuple; the names below follow the classifier. For the
# regression model the last element is the predicted number instead
# (see train_reg).
model.eval()
# Inference only: skip autograd, and move the inputs to the model's device as
# the training functions do before calling the model.
with torch.no_grad():
  feature_activations, case_activations, output, predicted_class = model(X_test.to(device))

In [None]:
predicted_class

In [None]:
y_test

In [None]:
# Inspecting the case activations: indices of the 5 most activated cases for
# every test row. Moved to the CPU so they can index the CPU-resident
# X_train / y_train in the cells below.
top_case_indices = torch.topk(case_activations, 5, dim=1)[1].cpu()

In [None]:
X_test[0]

In [None]:
X_train[top_case_indices[0][0]]

By comparing the following two blocks' outputs, you can see we are retrieving a good neighbor.

In [None]:
#sum abs of X_test[0] and the top activated case
sum(abs(X_test[0] - X_train[top_case_indices[0][0]]))

In [None]:
# Average L1 distance between X_test[0] and every training row, computed in a
# single vectorised pass instead of a Python loop over rows and features.
print((X_train - X_test[0]).abs().sum(dim=1).mean().item())

TODO: A better way is to show the distribution of `X_test[0] - X_train[i]` over all training rows `i`.

In [None]:
y_train[top_case_indices[0]]

In [None]:
knn.predict(X_test)[0]

In [None]:
indices = knn.kneighbors(X_test)[1][0]

In [None]:
y_train[indices]