In [93]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd

# Data Preparation

## Data cleaning

In [52]:
# Load dataset
df = pd.read_excel('TrainingSet.xlsx')
df_test = pd.read_excel('TestSet.xlsx')

# Convert Sex attribute in UpperCase
df['Sesso'] = df['Sesso'].str.upper()

# Remove CAP attribute
df = df.drop(columns =['CAP'])
# df = df.drop(columns =['Sesso'])
# df = df.drop(columns =['ID'])

# Remove unlabeled instances
df = df.loc[df["Patologia"] == df["Patologia"]]

dim = df.shape[0]
df.head()

Unnamed: 0,ID,Scolarita,Eta,Sesso,MMSE_PG,MMSE_PC,MMSE_PE,MMSE_ESITO,CLOCKTEST_PG,CLOCKTEST_PE,...,FAB_ESITO,FLUENZAVERBFON_PG,FLUENZAVERBFON_PC,FLUENZAVERBFON_PE,FLUENZAVERBFON_ESITO,TESTMATRICIATTENTIVE_PG,TESTMATRICIATTENTIVE_PC,TESTMATRICIATTENTIVE_PE,TESTMATRICIATTENTIVE_ESITO,Patologia
0,Row0,13.0,65.0,F,29.0,28.49,238.0,NORMA,5.0,7.0,...,,24.0,20.9,1.0,AI LIMITI INFERIORI DELLA NORMA,47.0,45.75,3.0,NORMA,Malattia di Alzheimer
1,Row1,8.0,70.0,M,27.0,28.2,238.0,NORMA,6.0,5.0,...,,31.0,34.9,4.0,NORMA,,,,,Assenza di patologia
2,Row2,12.0,70.0,F,27.0,26.86,238.0,NORMA,,7.0,...,,24.0,27.9,3.0,NORMA,45.0,47.0,3.0,NORMA,Malattia di Alzheimer
3,Row3,8.0,66.0,M,17.0,17.53,238.0,DEFICIT,2.0,5.0,...,DEFICIT,4.0,7.2,0.0,DEFICIT,7.0,5.25,0.0,DEFICIT,Malattia di Alzheimer
4,Row4,7.0,82.0,M,25.0,25.0,238.0,NORMA,1.0,3.0,...,DEFICIT,11.0,21.4,2.0,NORMA,37.0,41.75,2.0,NORMA,Malattia di Alzheimer


In [53]:
def new_column(df, df_clean, feature):
    col_pc = feature + '_PC'
    col_pg = feature + '_PG'
    col_es = feature + '_ESITO'
    col_new = feature
    tresh = const.const_dict["TRESHOLD"]

    for i in range(dim_df):

        if col_pc in df.columns:

            if (df.loc[i, col_pc] == df.loc[i, col_pc]) and (df.loc[i,col_pg] == df.loc[i,col_pg]):
                if abs(df.loc[i, col_pc] - df.loc[i,col_pg]) < tresh:
                    df_clean.loc[i,col_new] = df.loc[i, col_pc]
                else:
                    df_clean.loc[i,col_new] = df.loc[i, col_pg]

            elif (df.loc[i, col_pc] != df.loc[i, col_pc]) and (df.loc[i,col_pg] == df.loc[i,col_pg]):
                df_clean.loc[i,col_new] = df.loc[i, col_pg] 
            elif (df.loc[i, col_pc] == df.loc[i, col_pc]) and (df.loc[i,col_pg] != df.loc[i,col_pg]):
                df_clean.loc[i,col_new] = df.loc[i, col_pc]

            elif df.loc[i,col_es] == df.loc[i,col_es]:
                if df.loc[i,col_es] == "NORMA":
                    df_clean.loc[i,col_new] = (const.const_dict[col_new + "_MAX"] + const.const_dict[col_new + "_CUTOFF"])/2
                elif df.loc[i,col_es] == "DEFICIT":
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]/2
                else:
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]

            # else:
                #Null

        else:
            if df.loc[i,col_pg] == df.loc[i,col_pg]:
                df_clean.loc[i,col_new] = df.loc[i, col_pg]
                
            elif df.loc[i,col_es] == df.loc[i,col_es]:
                if df.loc[i,col_es] == "NORMA":
                    df_clean.loc[i,col_new] = (const.const_dict[col_new + "_MAX"] + const.const_dict[col_new + "_CUTOFF"])/2
                elif df.loc[i,col_es] == "DEFICIT":
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]/2
                else:
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]
            # else:
                #Null

# plt.hist(df_clean['MMSE'], bins = 20)

## Imputation

In [54]:
# Imputation
num_col = [c for c in df.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("patologia" in c.lower()) and not("sesso" in c.lower()) and not("id" in c.lower())]
enum_col = [c for c in df.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_imp_num = df.loc[:,num_col]
df_imp_enum = df.loc[:,enum_col]

columns = df_imp_num.columns.append(df_imp_enum.columns).insert(0, "ID")

num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_imputer.fit(df_imp_num.values)
df_imp_num = pd.DataFrame(num_imputer.transform(df_imp_num.values))

enum_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
enum_imputer.fit(df_imp_enum.values)
df_imp_enum = pd.DataFrame(enum_imputer.transform(df_imp_enum.values))

df_imp_num.insert(0, "ID", df["ID"])

df_imp_enum.insert(0, "ID", df["ID"])
df_imp_num

# df_imp_num.append(df_imp_enum, axis = 1)

result = pd.merge(df_imp_num, df_imp_enum, on = "ID")
result.columns = columns
result["Patologia"] = df["Patologia"]
result

Unnamed: 0,ID,Scolarita,Eta,MMSE_PG,MMSE_PC,CLOCKTEST_PG,COPIAFIGURAREY_PG,COPIAFIGURAREY_PC,PAROLEREYIMM_PG,PAROLEREYIMM_PC,...,MMSE_ESITO,CLOCKTEST_ESITO,COPIAFIGURAREY_ESITO,PAROLEREYIMM_ESITO,PAROLEREYDIFF_ESITO,MEMORIAFIGURAREY_ESITO,FAB_ESITO,FLUENZAVERBFON_ESITO,TESTMATRICIATTENTIVE_ESITO,Patologia
0,Row0,13.0,65.0,29.0,28.49,5.000000,34.00000,34.75,28.0,29.3,...,NORMA,DEFICIT,NORMA,DEFICIT,NORMA,DEFICIT,NORMA,AI LIMITI INFERIORI DELLA NORMA,NORMA,Malattia di Alzheimer
1,Row1,8.0,70.0,27.0,28.20,6.000000,34.00000,36.00,28.0,33.9,...,NORMA,NORMA,NORMA,NORMA,NORMA,AI LIMITI INFERIORI DELLA NORMA,NORMA,NORMA,NORMA,Assenza di patologia
2,Row2,12.0,70.0,27.0,26.86,6.927686,28.19788,18.50,21.0,26.9,...,NORMA,DEFICIT,DEFICIT,DEFICIT,NORMA,AI LIMITI INFERIORI DELLA NORMA,NORMA,NORMA,NORMA,Malattia di Alzheimer
3,Row3,8.0,66.0,17.0,17.53,2.000000,2.00000,3.50,12.0,16.0,...,DEFICIT,DEFICIT,DEFICIT,DEFICIT,DEFICIT,DEFICIT,DEFICIT,DEFICIT,DEFICIT,Malattia di Alzheimer
4,Row4,7.0,82.0,25.0,25.00,1.000000,15.00000,17.75,18.0,30.2,...,NORMA,DEFICIT,DEFICIT,AI LIMITI INFERIORI DELLA NORMA,NORMA,DEFICIT,DEFICIT,NORMA,NORMA,Malattia di Alzheimer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,Row500,5.0,76.0,29.0,31.03,5.000000,28.00000,30.75,25.0,35.0,...,NORMA,NORMA,NORMA,NORMA,DEFICIT,NORMA,NORMA,NORMA,NORMA,Assenza di patologia
327,Row501,14.0,54.0,28.0,26.31,10.000000,35.00000,33.75,19.0,15.5,...,NORMA,NORMA,NORMA,DEFICIT,DEFICIT,NORMA,NORMA,NORMA,NORMA,Assenza di patologia
328,Row503,13.0,53.0,30.0,28.99,10.000000,35.00000,34.25,35.0,31.5,...,NORMA,NORMA,NORMA,AI LIMITI INFERIORI DELLA NORMA,AI LIMITI INFERIORI DELLA NORMA,NORMA,NORMA,NORMA,NORMA,Assenza di patologia
329,Row504,5.0,67.0,28.0,29.27,6.927686,31.00000,33.00,40.0,46.1,...,NORMA,NORMA,NORMA,NORMA,NORMA,NORMA,NORMA,NORMA,NORMA,Assenza di patologia


# Model Selection

In [111]:
# Splitting data and prediction
num_col = [c for c in df.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("patologia" in c.lower()) and not("sesso" in c.lower()) and not("id" in c.lower())]
nom_col = [c for c in df.columns if("_esito" in c.lower()) or ("sesso" in c.lower()) or ("patologia" in c.lower())]

df_data_num = result[num_col]
df_data_nom = result[nom_col]
df_data_num_norm = result.drop(["ID", "Patologia"], axis = 1)
df_target = result['Patologia']

## KNN numerical

In [116]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"metric": ["manhattan", "euclidean"],
          "n_neighbors": np.arange(1,20,1),
          "weights": ["distance", "uniform"]}

# KNN Classifier
# model = NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2)


# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7098883338230974


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7251322362621218


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7069497502203939


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.691962973846606
0.7069497502203936
Mean score:  0.7081766088745225


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.7s finished


## SVC with rbf kernel

In [42]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(1,4,0.5),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

# We will use a Support Vector Classifier
model = SVC(kernel="rbf")

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6978768733470466


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6857919482809286


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7009256538348516


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6859756097560976
0.700852189244784
Mean score:  0.6942844548927416


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.4s finished


## SVC with polinomial kernel

In [44]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(1,4,0.5),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

# We will use a Support Vector Classifier
model = SVC(kernel="poly")

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv) #verbose = 1)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7038275051425213


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6798780487804879


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6978034087569792


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.713157508081105
0.6888039964736997
Mean score:  0.6966940934469587


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.1s finished


# Hyperparameter Selection

In [77]:
# Split dataset (2/3 training , 1/3 test)
data_train, data_test, target_train, target_test = train_test_split(df_data.values, df_target.values, test_size=0.33, stratify = df_target)

# Normalize data
data_train_norm = minmax_scale(data_train, feature_range=(0, 1), axis=0, copy=True)
data_test_norm = minmax_scale(data_test, feature_range=(0, 1), axis=0, copy=True)

# Select best hyerparameter
p_grid = {"C": np.arange(1,4,0.5),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

model = SVC(kernel="poly")
hyp_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv= hyp_cv, verbose = 1, scoring = 'accuracy')

clf.fit(data_train_norm,target_train)

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)
# print(clf.best_index_)
# print(clf.scorer_)

df_score = pd.DataFrame(clf.cv_results_)
df_score = df_score.sort_values(by=["rank_test_score"])
df_score = df_score.loc[:,"params":"rank_test_score"]
df_score.head()

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best score:  0.7104743083003953
Best params:  {'C': 2.0, 'degree': 2, 'gamma': 0.31000000000000005}


[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    7.0s finished


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
73,"{'C': 2.0, 'degree': 2, 'gamma': 0.31000000000...",0.695652,0.772727,0.681818,0.681818,0.727273,0.727273,0.772727,0.727273,0.636364,0.681818,0.710474,0.040955,1
103,"{'C': 2.5, 'degree': 2, 'gamma': 0.31000000000...",0.695652,0.772727,0.681818,0.681818,0.727273,0.681818,0.772727,0.727273,0.636364,0.681818,0.705929,0.041359,2
141,"{'C': 3.0, 'degree': 3, 'gamma': 0.11}",0.652174,0.727273,0.681818,0.727273,0.681818,0.727273,0.681818,0.727273,0.681818,0.727273,0.701581,0.027025,3
171,"{'C': 3.5, 'degree': 3, 'gamma': 0.11}",0.652174,0.727273,0.681818,0.727273,0.681818,0.727273,0.681818,0.727273,0.681818,0.727273,0.701581,0.027025,3
158,"{'C': 3.5, 'degree': 1, 'gamma': 0.81}",0.652174,0.727273,0.681818,0.727273,0.727273,0.727273,0.772727,0.681818,0.636364,0.681818,0.701581,0.039456,5


# Model Evaluation

## Test set evaluation

In [82]:
# Evaluating model on 1/3 of dataset
model = SVC(kernel="poly", C = 1.5,  degree = 3, gamma = 0.2)

model.fit(data_train_norm, target_train)
prediction = model.predict(data_test_norm)
# print("Accuracy: ", svm.score(data_test, target_test))
print("Accuracy: ", accuracy_score(target_test, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(target_test, prediction, labels= labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm

Accuracy:  0.6818181818181818


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,61,5,1,0
1,Malattia di Alzheimer,8,13,0,0
2,Disturbo cognitivo lieve,4,9,1,0
3,Disturbo depressivo,4,4,0,0


## Cross validation on entire dataset

In [88]:
# Normalize data
data_norm = minmax_scale(df_data, feature_range=(0, 1), axis=0, copy=True)

model = SVC(kernel="poly", C = 1.5,  degree = 3, gamma = 0.2)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(model, data_norm, y=df_target.values, scoring= 'accuracy', cv=cv, verbose=0)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.69 (+/- 0.10)


# New instances prediction