In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import torch
import numpy as np

https://machinelearningknowledge.ai/knn-classifier-in-sklearn-using-gridsearchcv-with-example/ <br>
https://towardsdatascience.com/building-a-k-nearest-neighbors-k-nn-model-with-scikit-learn-51209555453a

## Get the data and labels

### Real data

In [2]:
num_testsub = 32
num_classes = 4
num_channel = 40
num_datapoints = 8064
num_trials = 40
sampling_rate = 128 # 128Hz as given in the data

def loadfiles_normalized():
    data_dict = {}
    print("Loading files into data_dict .................")
    for i in range(num_testsub):
                    if i < 10:
                        name = '%0*d' % (2,i+1)
                    else:
                        name = i+1
                    fname = 'data/data_prepared/data_norm_bhat/noramlized_datasub'+str(name) +'.npy'
                    data_dict["sub%s" %name] = np.load(fname)    
    print("Loaded!!!!!") 
    return data_dict

In [3]:
data_dict = loadfiles_normalized()

Loading files into data_dict .................
Loaded!!!!!


In [4]:
valence = np.load('data/data_prepared/labels/label_class_0.npy')
arousal = np.load('data/data_prepared/labels/label_class_1.npy')
dominance = np.load('data/data_prepared/labels/label_class_2.npy')
liking = np.load('data/data_prepared/labels/label_class_3.npy')

In [5]:
valence.shape

(1280,)

In [6]:
data_dict['sub01'].shape

(40, 40, 99)

### Create train test split

### Real Data

In [7]:
# training with 22 participants each has 40 videos = 880
train_R = np.zeros((0,40,99))
train_R[:] = np.nan
#train1 = 
test_R = np.zeros((0,40,99))
test_R[:] = np.nan

participation_counter = 0
for sub in data_dict.keys():
    if participation_counter < 22:
        train_R = np.concatenate((train_R, data_dict[sub]), axis=0)
    else:
        test_R = np.concatenate((test_R, data_dict[sub]), axis=0)
    participation_counter = participation_counter + 1

In [8]:
# training with 22 participants each has 40 videos = 880
data = np.zeros((0,40,99))
data[:] = np.nan

for sub in data_dict.keys():
    data = np.concatenate((data, data_dict[sub]), axis=0)

In [9]:
data.shape

(1280, 40, 99)

In [10]:
train_R.shape

(880, 40, 99)

In [11]:
test_R.shape

(400, 40, 99)

### Synthetic Data

In [12]:
arrays = []
for i in range(10):
    arrays.append(torch.load("data/sampled/0x/1_sample/sampled_01_128_"+str(i),map_location=torch.device('cpu')))
generated_data = np.concatenate(arrays, axis=0)

In [13]:
data_S = np.zeros((1280,40,99))
data_S[:] = np.nan

for i in range(1280):
    data_S[i] = generated_data[i][0][:,:99] 

In [14]:
data_S.shape

(1280, 40, 99)

In [15]:
train_S = data_S[:880]
test_S = data_S[880:]

In [16]:
train_S.shape

(880, 40, 99)

In [17]:
test_S.shape

(400, 40, 99)

### Labels

In [18]:
val32 = valence[:]
aro32 = arousal[:]
dom32 = dominance[:]
lik32 = liking[:]

val22 = valence[: 880]
aro22 = arousal[: 880]
dom22 = dominance[: 880]
lik22 = liking[: 880]

val10 = valence[880: ]
aro10 = arousal[880: ]
dom10 = dominance[880: ]
lik10 = liking[880: ]

In [19]:
val32.shape

(1280,)

In [20]:
val22.shape

(880,)

In [21]:
val10.shape

(400,)

----------

The cell below is deprecated and can be ignored

In [56]:
trainlabel_list = [val22, aro22, dom22, lik22]
testlabel_list = [val10, aro10, dom10, lik10]
names = ["Valence","Arousal","Dominance","Liking"]

for traain,tesst,name in zip(trainlabel_list,testlabel_list,names):
    print("#########################################",name,"#########################################")
    #sc = knnmodel2(train.reshape(-1, 40*99),traain,test.reshape(-1, 40*99),tesst,kval = k)

    #define cross-validation method to use
    cv = KFold(n_splits=32, random_state=1, shuffle=True)

    #build multiple linear regression model
    model = KNeighborsClassifier(n_neighbors=20)

    #use k-fold CV to evaluate model
    scores = cross_val_score(model, train_R.reshape(-1, 40*99), traain, cv=5)

    #print each cv score (accuracy) and average them
    #print(cv_scores)
    print("cv_scores mean:{}".format(np.mean(scores)))

    #print("test score:{}".format(model.score(test.reshape(-1, 40*99),tesst)))

######################################### Valence #########################################
cv_scores mean:0.5170454545454545
######################################### Arousal #########################################
cv_scores mean:0.36363636363636365
######################################### Dominance #########################################
cv_scores mean:0.43863636363636366
######################################### Liking #########################################
cv_scores mean:0.5579545454545455


-------

## 1) Train with real data and test with real data

In [22]:
trainlabel_list = [val22, aro22, dom22, lik22]
testlabel_list = [val10, aro10, dom10, lik10]
names = ["Valence","Arousal","Dominance","Liking"]

knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(1, 50)}

for traain,name in zip(trainlabel_list,names):
    print("#########################################",name,"#########################################")
    
    # defining parameter range
    grid = GridSearchCV(knn, param_grid, cv=32, scoring='accuracy', return_train_score=False,verbose=1)

    # fitting the model for grid search
    grid_search=grid.fit(train_R.reshape(-1, 40*99), traain)
    print("Average ",grid_search.best_score_)
    print("best param ",grid_search.best_params_)

######################################### Valence #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.4967757936507936
best param  {'n_neighbors': 11}
######################################### Arousal #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.5082671957671958
best param  {'n_neighbors': 49}
######################################### Dominance #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.4833829365079365
best param  {'n_neighbors': 49}
######################################### Liking #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.6567046957671958
best param  {'n_neighbors': 49}


In [23]:
n = [11,49,49,49]
for traain,tesst,name,kn in zip(trainlabel_list,testlabel_list,names,n):
    print("#########################################",name,"#########################################")
    
    knn = KNeighborsClassifier(n_neighbors=kn)
    knn.fit(train_R.reshape(-1, 40*99), traain)
    y_test_hat=knn.predict(test_R.reshape(-1, 40*99)) 
    test_accuracy=accuracy_score(tesst,y_test_hat)*100
    
    print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

######################################### Valence #########################################
Accuracy for our testing dataset with tuning is : 49.00%
######################################### Arousal #########################################
Accuracy for our testing dataset with tuning is : 51.75%
######################################### Dominance #########################################
Accuracy for our testing dataset with tuning is : 57.25%
######################################### Liking #########################################
Accuracy for our testing dataset with tuning is : 63.00%


## 2) Train with real data and test with synthetic data

In [24]:
n = [11,49,49,49]
for traain,tesst,name,kn in zip(trainlabel_list,testlabel_list,names,n):
    print("#########################################",name,"#########################################")
    
    knn = KNeighborsClassifier(n_neighbors=kn)
    knn.fit(train_R.reshape(-1, 40*99), traain)
    y_test_hat=knn.predict(test_S.reshape(-1, 40*99)) 
    test_accuracy=accuracy_score(tesst,y_test_hat)*100
    
    print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

######################################### Valence #########################################
Accuracy for our testing dataset with tuning is : 51.00%
######################################### Arousal #########################################
Accuracy for our testing dataset with tuning is : 54.00%
######################################### Dominance #########################################
Accuracy for our testing dataset with tuning is : 44.75%
######################################### Liking #########################################
Accuracy for our testing dataset with tuning is : 62.25%


## 3) Train with synthetic data and test with real data

In [25]:
trainlabel_list = [val22, aro22, dom22, lik22]
testlabel_list = [val10, aro10, dom10, lik10]
names = ["Valence","Arousal","Dominance","Liking"]

knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(1, 50)}

for traain,name in zip(trainlabel_list,names):
    print("#########################################",name,"#########################################")
    
    # defining parameter range
    grid = GridSearchCV(knn, param_grid, cv=32, scoring='accuracy', return_train_score=False,verbose=1)

    # fitting the model for grid search
    grid_search=grid.fit(train_S.reshape(-1, 40*99), traain)
    print("Average ",grid_search.best_score_)
    print("best param ",grid_search.best_params_)

######################################### Valence #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.5329861111111112
best param  {'n_neighbors': 49}
######################################### Arousal #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.5590691137566137
best param  {'n_neighbors': 49}
######################################### Dominance #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.5204199735449735
best param  {'n_neighbors': 7}
######################################### Liking #########################################
Fitting 32 folds for each of 49 candidates, totalling 1568 fits
Average  0.6830357142857142
best param  {'n_neighbors': 43}


In [26]:
n = [49,49,7,43]
for traain,tesst,name,kn in zip(trainlabel_list,testlabel_list,names,n):
    print("#########################################",name,"#########################################")
    
    knn = KNeighborsClassifier(n_neighbors=kn)
    knn.fit(train_R.reshape(-1, 40*99), traain)
    y_test_hat=knn.predict(test_R.reshape(-1, 40*99)) 
    test_accuracy=accuracy_score(tesst,y_test_hat)*100
    
    print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

######################################### Valence #########################################
Accuracy for our testing dataset with tuning is : 53.25%
######################################### Arousal #########################################
Accuracy for our testing dataset with tuning is : 51.75%
######################################### Dominance #########################################
Accuracy for our testing dataset with tuning is : 58.75%
######################################### Liking #########################################
Accuracy for our testing dataset with tuning is : 61.50%


## 4) Train with synthetic data and test with synthetic data

In [27]:
n = [11,49,49,49]
for traain,tesst,name,kn in zip(trainlabel_list,testlabel_list,names,n):
    print("#########################################",name,"#########################################")
    
    knn = KNeighborsClassifier(n_neighbors=kn)
    knn.fit(train_R.reshape(-1, 40*99), traain)
    y_test_hat=knn.predict(test_S.reshape(-1, 40*99)) 
    test_accuracy=accuracy_score(tesst,y_test_hat)*100
    
    print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

######################################### Valence #########################################
Accuracy for our testing dataset with tuning is : 51.00%
######################################### Arousal #########################################
Accuracy for our testing dataset with tuning is : 54.00%
######################################### Dominance #########################################
Accuracy for our testing dataset with tuning is : 44.75%
######################################### Liking #########################################
Accuracy for our testing dataset with tuning is : 62.25%
