In [2]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
import Constants as const

# Data Preparation

## Data cleaning

In [3]:
# Load dataset
df = pd.read_excel('TrainingSet.xlsx')
df_test = pd.read_excel('TestSet.xlsx')

# Convert Sex attribute in UpperCase
df['Sesso'] = df['Sesso'].str.upper()

# Remove CAP attribute
df = df.drop(columns =['CAP'])
df_test = df_test.drop(columns =['CAP'])
# df = df.drop(columns =['Sesso'])
# df = df.drop(columns =['ID'])

# Remove unlabeled instances
df = df.loc[df["Patologia"] == df["Patologia"]]

dim_df = df.shape[0]
dim_df_test = df_test.shape[0]
df.head()

Unnamed: 0,ID,Scolarita,Eta,Sesso,MMSE_PG,MMSE_PC,MMSE_PE,MMSE_ESITO,CLOCKTEST_PG,CLOCKTEST_PE,...,FAB_ESITO,FLUENZAVERBFON_PG,FLUENZAVERBFON_PC,FLUENZAVERBFON_PE,FLUENZAVERBFON_ESITO,TESTMATRICIATTENTIVE_PG,TESTMATRICIATTENTIVE_PC,TESTMATRICIATTENTIVE_PE,TESTMATRICIATTENTIVE_ESITO,Patologia
0,Row0,13.0,65.0,F,29.0,28.49,238.0,NORMA,5.0,7.0,...,,24.0,20.9,1.0,AI LIMITI INFERIORI DELLA NORMA,47.0,45.75,3.0,NORMA,Malattia di Alzheimer
1,Row1,8.0,70.0,M,27.0,28.2,238.0,NORMA,6.0,5.0,...,,31.0,34.9,4.0,NORMA,,,,,Assenza di patologia
2,Row2,12.0,70.0,F,27.0,26.86,238.0,NORMA,,7.0,...,,24.0,27.9,3.0,NORMA,45.0,47.0,3.0,NORMA,Malattia di Alzheimer
3,Row3,8.0,66.0,M,17.0,17.53,238.0,DEFICIT,2.0,5.0,...,DEFICIT,4.0,7.2,0.0,DEFICIT,7.0,5.25,0.0,DEFICIT,Malattia di Alzheimer
4,Row4,7.0,82.0,M,25.0,25.0,238.0,NORMA,1.0,3.0,...,DEFICIT,11.0,21.4,2.0,NORMA,37.0,41.75,2.0,NORMA,Malattia di Alzheimer


In [5]:
def new_column(df, df_clean, feature, dim):
    col_pc = feature + '_PC'
    col_pg = feature + '_PG'
    col_es = feature + '_ESITO'
    col_new = feature
    tresh = const.const_dict["TRESHOLD"]

    for i in range(dim):

        if col_pc in df.columns:

            if (df.loc[i, col_pc] == df.loc[i, col_pc]) and (df.loc[i,col_pg] == df.loc[i,col_pg]):
                if abs(df.loc[i, col_pc] - df.loc[i,col_pg]) < tresh:
                    df_clean.loc[i,col_new] = df.loc[i, col_pc]
                else:
                    df_clean.loc[i,col_new] = df.loc[i, col_pg]

            elif (df.loc[i, col_pc] != df.loc[i, col_pc]) and (df.loc[i,col_pg] == df.loc[i,col_pg]):
                df_clean.loc[i,col_new] = df.loc[i, col_pg] 
            elif (df.loc[i, col_pc] == df.loc[i, col_pc]) and (df.loc[i,col_pg] != df.loc[i,col_pg]):
                df_clean.loc[i,col_new] = df.loc[i, col_pc]

            elif df.loc[i,col_es] == df.loc[i,col_es]:
                if df.loc[i,col_es] == "NORMA":
                    df_clean.loc[i,col_new] = (const.const_dict[col_new + "_MAX"] + const.const_dict[col_new + "_CUTOFF"])/2
                elif df.loc[i,col_es] == "DEFICIT":
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]/2
                else:
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]

            # else:
                #Null

        else:
            if df.loc[i,col_pg] == df.loc[i,col_pg]:
                df_clean.loc[i,col_new] = df.loc[i, col_pg]
                
            elif df.loc[i,col_es] == df.loc[i,col_es]:
                if df.loc[i,col_es] == "NORMA":
                    df_clean.loc[i,col_new] = (const.const_dict[col_new + "_MAX"] + const.const_dict[col_new + "_CUTOFF"])/2
                elif df.loc[i,col_es] == "DEFICIT":
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]/2
                else:
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]
            # else:
                #Null

# plt.hist(df_clean['MMSE'], bins = 20)

## Imputation

In [6]:
df_clean = pd.DataFrame(columns = df.columns[0:4], data = df.iloc[:,0:4])
df_clean
# "Feature selection"
new_column(df, df_clean, "MMSE", dim_df)
new_column(df, df_clean, "CLOCKTEST", dim_df)
new_column(df, df_clean, "COPIAFIGURAREY", dim_df)
new_column(df, df_clean, "PAROLEREYIMM", dim_df)
new_column(df, df_clean, "PAROLEREYDIFF", dim_df)
new_column(df, df_clean, "MEMORIAFIGURAREY", dim_df)
new_column(df, df_clean, "FAB", dim_df)
new_column(df, df_clean, "FLUENZAVERBFON", dim_df)
new_column(df, df_clean, "TESTMATRICIATTENTIVE", dim_df)

df_clean = df_clean.drop(["Scolarita", "Eta", "Sesso", "FAB"], axis = 1)
df_clean

df_imp_num = df_clean.drop("ID", axis = 1)

num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_imputer.fit(df_imp_num.values)
df_imp_num = pd.DataFrame(num_imputer.transform(df_imp_num.values))
df_imp_num.columns = df_clean.columns.drop("ID")
df_imp_num.isna().any()

Scolarita               False
Eta                     False
MMSE                    False
CLOCKTEST               False
COPIAFIGURAREY          False
PAROLEREYIMM            False
PAROLEREYDIFF           False
MEMORIAFIGURAREY        False
FLUENZAVERBFON          False
TESTMATRICIATTENTIVE    False
dtype: bool

In [7]:
df_clean_test = pd.DataFrame(columns = df_test.columns[0:4], data = df_test.iloc[:,0:4])
df_clean_test
# "Feature selection"
new_column(df_test, df_clean_test, "MMSE", dim_df_test)
new_column(df_test, df_clean_test, "CLOCKTEST", dim_df_test)
new_column(df_test, df_clean_test, "COPIAFIGURAREY", dim_df_test)
new_column(df_test, df_clean_test, "PAROLEREYIMM", dim_df_test)
new_column(df_test, df_clean_test, "PAROLEREYDIFF", dim_df_test)
new_column(df_test, df_clean_test, "MEMORIAFIGURAREY", dim_df_test)
new_column(df_test, df_clean_test, "FAB", dim_df_test)
new_column(df_test, df_clean_test, "FLUENZAVERBFON", dim_df_test)
new_column(df_test, df_clean_test, "TESTMATRICIATTENTIVE", dim_df_test)

df_clean_test = df_clean_test.drop(["Scolarita", "Eta", "Sesso", "FAB"], axis = 1)
df_clean_test

df_imp_num_test = df_clean_test.drop("ID", axis = 1)

df_imp_num_test = pd.DataFrame(num_imputer.transform(df_imp_num_test.values))
df_imp_num_test.columns = df_clean_test.columns.drop("ID")
df_imp_num_test.isna().any()

Scolarita               False
Eta                     False
MMSE                    False
CLOCKTEST               False
COPIAFIGURAREY          False
PAROLEREYIMM            False
PAROLEREYDIFF           False
MEMORIAFIGURAREY        False
FLUENZAVERBFON          False
TESTMATRICIATTENTIVE    False
dtype: bool

# Model Selection

In [8]:
# Splitting Training data and prediction
df_data_num = df_imp_num
df_target = df['Patologia']

# Splitting Test Set data
df_data_test_num = df_imp_num_test

## KNN numerical

In [9]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"metric": ["manhattan", "euclidean"],
          "n_neighbors": np.arange(1,20,1),
          "weights": ["distance", "uniform"]}

# KNN Classifier
# model = NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2)


# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.66169556273876


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6706949750220395


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7010725830149866


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6647810755215986
0.6645606817513959
Mean score:  0.6725609756097561


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.4s finished


## SVC with rbf kernel

In [10]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(1,4,0.5),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

# We will use a Support Vector Classifier
model = SVC(kernel="rbf")

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6767558037026153


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6616588304437261


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6798045841904201


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6738906846899795
0.6888407287687335
Mean score:  0.6761901263590948


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.7s finished


## SVC with polinomial kernel

In [11]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(1,4,0.5),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

# We will use a Support Vector Classifier
model = SVC(kernel="poly")

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv) #verbose = 1)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6797678518953865


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6768292682926829


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6797678518953865


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6678298560094035
0.6677196591243021
Mean score:  0.6743828974434323


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.5s finished


# Hyperparameter Selection

In [7]:
# Split dataset (2/3 training , 1/3 test)
data_train, data_test, target_train, target_test = train_test_split(df_data_num.values, df_target.values, test_size=0.33, stratify = df_target)

# Normalize data
data_train_norm = minmax_scale(data_train, feature_range=(0, 1), axis=0, copy=True)
data_test_norm = minmax_scale(data_test, feature_range=(0, 1), axis=0, copy=True)

# Select best hyerparameter
p_grid = {"C": np.arange(1,4,0.5),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

model = SVC(kernel="poly")
hyp_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv= hyp_cv, verbose = 1, scoring = 'accuracy')

clf.fit(data_train_norm,target_train)

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)
# print(clf.best_index_)
# print(clf.scorer_)

df_score = pd.DataFrame(clf.cv_results_)
df_score = df_score.sort_values(by=["rank_test_score"])
df_score = df_score.loc[:,"params":"rank_test_score"]
df_score.head()

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best score:  0.7284584980237154
Best params:  {'C': 2.5, 'degree': 3, 'gamma': 0.31000000000000005}


[Parallel(n_jobs=1)]: Done 1800 out of 1800 | elapsed:    7.4s finished


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
143,"{'C': 3.0, 'degree': 3, 'gamma': 0.31000000000...",0.73913,0.772727,0.681818,0.818182,0.681818,0.772727,0.772727,0.545455,0.818182,0.681818,0.728458,0.07881,1
113,"{'C': 2.5, 'degree': 3, 'gamma': 0.31000000000...",0.73913,0.772727,0.681818,0.818182,0.681818,0.772727,0.772727,0.545455,0.818182,0.681818,0.728458,0.07881,1
23,"{'C': 1.0, 'degree': 3, 'gamma': 0.31000000000...",0.782609,0.772727,0.727273,0.818182,0.681818,0.727273,0.772727,0.545455,0.772727,0.681818,0.728261,0.073963,3
172,"{'C': 3.5, 'degree': 3, 'gamma': 0.21000000000...",0.782609,0.772727,0.727273,0.818182,0.681818,0.727273,0.772727,0.545455,0.772727,0.681818,0.728261,0.073963,3
103,"{'C': 2.5, 'degree': 2, 'gamma': 0.31000000000...",0.73913,0.772727,0.727273,0.772727,0.681818,0.727273,0.818182,0.545455,0.818182,0.636364,0.723913,0.080048,5


# Model Evaluation

## Test set evaluation

In [82]:
# Evaluating model on 1/3 of dataset
model = SVC(kernel="poly", C = 1.5,  degree = 3, gamma = 0.2)

model.fit(data_train_norm, target_train)
prediction = model.predict(data_test_norm)
# print("Accuracy: ", svm.score(data_test, target_test))
print("Accuracy: ", accuracy_score(target_test, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(target_test, prediction, labels= labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm

Accuracy:  0.6818181818181818


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,61,5,1,0
1,Malattia di Alzheimer,8,13,0,0
2,Disturbo cognitivo lieve,4,9,1,0
3,Disturbo depressivo,4,4,0,0


## Cross validation on entire dataset

In [26]:
# Normalize data
data_norm = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True)

model = SVC(kernel="poly", C = 1.5,  degree = 3, gamma = 0.2)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
prediction = cross_val_predict(model, data_norm, df_target.values, cv=cv, verbose=5)


# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Accuracy: ", accuracy_score(df_target.values, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(df_target.values, prediction, labels = labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm


Accuracy:  0.6918429003021148


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,188,13,2,0
1,Malattia di Alzheimer,26,33,5,0
2,Disturbo cognitivo lieve,14,19,8,0
3,Disturbo depressivo,14,9,0,0


# New instances prediction

In [81]:
# Normalize data
data_norm = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True)
data_test_num_norm = minmax_scale(df_data_test_num, feature_range=(0, 1), axis=0, copy=True)

model = SVC(kernel="poly", C = 1.5,  degree = 3, gamma = 0.2)
model.fit(data_norm,df_target.values)
prediction = model.predict(data_test_num_norm)

df_sub = result_test[["ID"]]
df_sub.insert(len(df_sub.columns), "Patologia", prediction)

df_sub.groupby("Patologia").count()

Unnamed: 0_level_0,ID
Patologia,Unnamed: 1_level_1
Assenza di patologia,105
Disturbo cognitivo lieve,22
Malattia di Alzheimer,37
