In [10]:
#Imports
import h5py
import sys
import numpy as np
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib.axes import Axes
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [3]:
#Reading data
dir_dataset_sg = 'data/SG24_dataset.h5'
# Open H5 file to read
file = h5py.File(dir_dataset_sg,'r')

X = file['Predictors']
T = file['Target']
U = file['User']

X = np.array(X).transpose() #features
T = np.array(T).transpose() #target
U = np.array(U).transpose() #user
U = U[:,0]
T = T[:,0]

In [4]:
#Make user 8 appear only on the test data
user = 8
ind_all = np.arange(X.shape[0])
ind_all_u = ind_all[U[ind_all]==user]
ind_all = ind_all[U[ind_all]!=user]

ind_train, ind_test = train_test_split(ind_all,
                                       shuffle=True,
                                       stratify=T[ind_all],
                                       test_size=600,
                                       random_state=42)
ind_val, ind_test = train_test_split(ind_test,
                                     shuffle=True,
                                     stratify=T[ind_test],
                                     test_size=240,
                                     random_state=41)

ind_test = np.concatenate((ind_test, ind_all_u))
X_train = X[ind_train,:]
X_val   = X[ind_val,:]
X_test  = X[ind_test,:]
t_train = T[ind_train]
t_val   = T[ind_val]
t_test  = T[ind_test]
u_train = U[ind_train]
u_val   = U[ind_val]
u_test  = U[ind_test]

In [5]:
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X_train, t_train)
t_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(t_test, t_pred))
print(classification_report(t_test,t_pred))
#print(confusion_matrix(t_test,t_pred))

Accuracy: 0.9194444444444444
              precision    recall  f1-score   support

           1       0.87      0.87      0.87        15
           2       0.87      0.87      0.87        15
           3       1.00      0.73      0.85        15
           4       0.88      0.93      0.90        15
           5       1.00      1.00      1.00        15
           6       1.00      1.00      1.00        15
           7       1.00      1.00      1.00        15
           8       1.00      0.27      0.42        15
           9       0.88      0.93      0.90        15
          10       0.88      0.93      0.90        15
          11       1.00      0.87      0.93        15
          12       0.94      1.00      0.97        15
          13       0.93      0.93      0.93        15
          14       1.00      1.00      1.00        15
          15       1.00      1.00      1.00        15
          16       0.93      0.87      0.90        15
          17       1.00      0.93      0.97        1

In [6]:
#--------------------CROSS VALIDATION-------------------------------

tuned_parameters = [{'kernel': ['rbf'], 'K': list(range(10))}]

scores = ['precision', 'recall']

ind_all = np.arange(X.shape[0])

ind_train, ind_test = train_test_split(ind_all,
                                       shuffle=True,
                                       stratify=T[ind_all],
                                       test_size=0.3,
                                       random_state=42)

X_train = X[ind_train,:]
X_test  = X[ind_test,:]
t_train = T[ind_train]
t_test  = T[ind_test]
u_train = U[ind_train]
u_test  = U[ind_test]

In [13]:
from sklearn.model_selection import GridSearchCV#create new a knn model
RF = RandomForestClassifier()#create a dictionary of all values we want to test for n_neighbors
param_grid = [{'max_depth' : (np.arange(2,15)),'n_estimators' : (np.arange(99,110))}]

RF_gscv = GridSearchCV(RF, param_grid, cv=5)#fit model to data
RF_gscv.fit(X, T)
#check top performing n_neighbors value
RF_gscv.best_params_


{'max_depth': 13, 'n_estimators': 104}

In [15]:
clf = RandomForestClassifier(max_depth=RF_gscv.best_params_['max_depth'], random_state=0,n_estimators = RF_gscv.best_params_['n_estimators'])
clf.fit(X_train, t_train)
t_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(t_test, t_pred))
print(classification_report(t_test,t_pred))
#print(confusion_matrix(t_test,t_pred))

Accuracy: 0.9430555555555555
              precision    recall  f1-score   support

           1       1.00      0.87      0.93        30
           2       0.93      0.87      0.90        30
           3       1.00      0.93      0.97        30
           4       0.83      0.97      0.89        30
           5       0.96      0.90      0.93        30
           6       1.00      0.97      0.98        30
           7       1.00      0.97      0.98        30
           8       0.84      0.70      0.76        30
           9       0.93      0.93      0.93        30
          10       0.91      1.00      0.95        30
          11       1.00      0.97      0.98        30
          12       1.00      0.93      0.97        30
          13       0.88      1.00      0.94        30
          14       0.97      0.97      0.97        30
          15       0.94      1.00      0.97        30
          16       0.96      0.90      0.93        30
          17       0.97      0.97      0.97        3