In [1]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np
import pandas as pd

# Data Preparation

## Data cleaning

In [2]:
# Load dataset
df = pd.read_excel('TrainingSet.xlsx')
df_test = pd.read_excel('TestSet.xlsx')

# Convert Sex attribute in UpperCase
df['Sesso'] = df['Sesso'].str.upper()

# Remove CAP attribute
df = df.drop(columns =['CAP'])
df_test = df_test.drop(columns =['CAP'])
# df = df.drop(columns =['Sesso'])
# df = df.drop(columns =['ID'])

# Remove unlabeled instances
df = df.loc[df["Patologia"] == df["Patologia"]]

dim = df.shape[0]
df.head()

Unnamed: 0,ID,Scolarita,Eta,Sesso,MMSE_PG,MMSE_PC,MMSE_PE,MMSE_ESITO,CLOCKTEST_PG,CLOCKTEST_PE,...,FAB_ESITO,FLUENZAVERBFON_PG,FLUENZAVERBFON_PC,FLUENZAVERBFON_PE,FLUENZAVERBFON_ESITO,TESTMATRICIATTENTIVE_PG,TESTMATRICIATTENTIVE_PC,TESTMATRICIATTENTIVE_PE,TESTMATRICIATTENTIVE_ESITO,Patologia
0,Row0,13.0,65.0,F,29.0,28.49,238.0,NORMA,5.0,7.0,...,,24.0,20.9,1.0,AI LIMITI INFERIORI DELLA NORMA,47.0,45.75,3.0,NORMA,Malattia di Alzheimer
1,Row1,8.0,70.0,M,27.0,28.2,238.0,NORMA,6.0,5.0,...,,31.0,34.9,4.0,NORMA,,,,,Assenza di patologia
2,Row2,12.0,70.0,F,27.0,26.86,238.0,NORMA,,7.0,...,,24.0,27.9,3.0,NORMA,45.0,47.0,3.0,NORMA,Malattia di Alzheimer
3,Row3,8.0,66.0,M,17.0,17.53,238.0,DEFICIT,2.0,5.0,...,DEFICIT,4.0,7.2,0.0,DEFICIT,7.0,5.25,0.0,DEFICIT,Malattia di Alzheimer
4,Row4,7.0,82.0,M,25.0,25.0,238.0,NORMA,1.0,3.0,...,DEFICIT,11.0,21.4,2.0,NORMA,37.0,41.75,2.0,NORMA,Malattia di Alzheimer


In [3]:
def new_column(df, df_clean, feature):
    col_pc = feature + '_PC'
    col_pg = feature + '_PG'
    col_es = feature + '_ESITO'
    col_new = feature
    tresh = const.const_dict["TRESHOLD"]

    for i in range(dim_df):

        if col_pc in df.columns:

            if (df.loc[i, col_pc] == df.loc[i, col_pc]) and (df.loc[i,col_pg] == df.loc[i,col_pg]):
                if abs(df.loc[i, col_pc] - df.loc[i,col_pg]) < tresh:
                    df_clean.loc[i,col_new] = df.loc[i, col_pc]
                else:
                    df_clean.loc[i,col_new] = df.loc[i, col_pg]

            elif (df.loc[i, col_pc] != df.loc[i, col_pc]) and (df.loc[i,col_pg] == df.loc[i,col_pg]):
                df_clean.loc[i,col_new] = df.loc[i, col_pg] 
            elif (df.loc[i, col_pc] == df.loc[i, col_pc]) and (df.loc[i,col_pg] != df.loc[i,col_pg]):
                df_clean.loc[i,col_new] = df.loc[i, col_pc]

            elif df.loc[i,col_es] == df.loc[i,col_es]:
                if df.loc[i,col_es] == "NORMA":
                    df_clean.loc[i,col_new] = (const.const_dict[col_new + "_MAX"] + const.const_dict[col_new + "_CUTOFF"])/2
                elif df.loc[i,col_es] == "DEFICIT":
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]/2
                else:
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]

            # else:
                #Null

        else:
            if df.loc[i,col_pg] == df.loc[i,col_pg]:
                df_clean.loc[i,col_new] = df.loc[i, col_pg]
                
            elif df.loc[i,col_es] == df.loc[i,col_es]:
                if df.loc[i,col_es] == "NORMA":
                    df_clean.loc[i,col_new] = (const.const_dict[col_new + "_MAX"] + const.const_dict[col_new + "_CUTOFF"])/2
                elif df.loc[i,col_es] == "DEFICIT":
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]/2
                else:
                    df_clean.loc[i,col_new] = const.const_dict[col_new + "_CUTOFF"]
            # else:
                #Null

# plt.hist(df_clean['MMSE'], bins = 20)

## Imputation

In [3]:
# Imputation
num_col = [c for c in df.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("patologia" in c.lower()) and not("sesso" in c.lower()) and not("id" in c.lower())]
enum_col = [c for c in df.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_imp_num = df.loc[:,num_col]
df_imp_enum = df.loc[:,enum_col]

columns = df_imp_num.columns.append(df_imp_enum.columns).insert(0, "ID")

num_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
num_imputer.fit(df_imp_num.values)
df_imp_num = pd.DataFrame(num_imputer.transform(df_imp_num.values))

enum_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
enum_imputer.fit(df_imp_enum.values)
df_imp_enum = pd.DataFrame(enum_imputer.transform(df_imp_enum.values))

df_imp_num.insert(0, "ID", df["ID"])

df_imp_enum.insert(0, "ID", df["ID"])
# df_imp_num

# df_imp_num.append(df_imp_enum, axis = 1)

result = pd.merge(df_imp_num, df_imp_enum, on = "ID")
result.columns = columns
result["Patologia"] = df["Patologia"]
result

# Test set imputation
num_col = [c for c in df_test.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("sesso" in c.lower()) and not("id" in c.lower())]
enum_col = [c for c in df_test.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_imp_num = df_test.loc[:,num_col]
df_imp_enum = df_test.loc[:,enum_col]

df_imp_num = pd.DataFrame(num_imputer.transform(df_imp_num.values))
df_imp_enum = pd.DataFrame(enum_imputer.transform(df_imp_enum.values))

df_imp_num.insert(0, "ID", df_test["ID"])
df_imp_enum.insert(0, "ID", df_test["ID"])

result_test = pd.merge(df_imp_num, df_imp_enum, on = "ID")
result_test.columns = columns


# Model Selection

In [4]:
# Splitting Training data and prediction
num_col = [c for c in df.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("patologia" in c.lower()) and not("sesso" in c.lower()) and not("id" in c.lower())]
nom_col = [c for c in df.columns if("_esito" in c.lower()) or ("sesso" in c.lower()) or ("patologia" in c.lower())]

df_data_num = result[num_col]
df_data_nom = result[nom_col]
df_data_num_nom = result.drop(["ID", "Patologia"], axis = 1)
df_target = result['Patologia']


# Splitting Test Set data
num_col = [c for c in df_test.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("sesso" in c.lower()) and not("id" in c.lower())]
nom_col = [c for c in df_test.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_data_test_num = result_test[num_col]
df_data_test_nom = result_test[nom_col]
df_data_test_num_nom = result_test.drop(["ID"], axis = 1)
df_data_test_num

Unnamed: 0,Scolarita,Eta,MMSE_PG,MMSE_PC,CLOCKTEST_PG,COPIAFIGURAREY_PG,COPIAFIGURAREY_PC,PAROLEREYIMM_PG,PAROLEREYIMM_PC,PAROLEREYDIFF_PG,PAROLEREYDIFF_PC,MEMORIAFIGURAREY_PG,MEMORIAFIGURAREY_PC,FAB_PG,FAB_PC,FLUENZAVERBFON_PG,FLUENZAVERBFON_PC,TESTMATRICIATTENTIVE_PG,TESTMATRICIATTENTIVE_PC
0,13.0,81.0,25.0,25.00,7.0,34.0,36.00,24.0,31.4,0.0,2.6,3.0,6.75,14.0,13.245,14.0,13.5,52.0,52.50
1,5.0,76.0,24.0,26.03,4.0,31.0,35.25,20.0,30.0,3.0,6.1,18.0,25.50,9.0,13.245,14.0,23.4,45.0,44.75
2,8.0,66.0,28.0,28.53,7.0,14.5,16.00,29.0,33.0,4.0,5.3,10.0,11.25,14.0,13.245,20.0,23.2,38.0,44.75
3,10.0,62.0,27.0,26.49,7.0,35.0,35.25,25.0,27.3,5.0,5.7,10.0,10.00,14.0,13.245,41.0,43.5,40.0,39.25
4,14.0,59.0,25.0,23.31,0.0,25.0,24.00,39.0,37.0,8.0,7.5,7.0,7.50,15.0,13.245,32.0,27.8,38.0,34.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,5.0,80.0,8.0,27.97,8.0,31.0,31.00,29.0,32.9,5.0,6.0,10.0,10.00,14.0,13.245,29.0,29.5,45.0,44.75
160,8.0,72.0,25.0,26.20,9.0,30.0,32.00,21.0,26.9,0.0,1.9,10.0,9.25,14.0,13.245,23.0,26.9,37.0,36.50
161,13.0,77.0,29.0,28.86,7.0,32.0,33.50,28.0,33.2,5.0,6.9,10.0,19.25,18.0,13.245,27.0,25.5,52.0,53.75
162,17.0,62.0,29.0,27.46,10.0,34.0,33.50,38.0,35.8,3.0,2.6,10.0,10.75,18.0,13.245,48.0,40.2,55.0,46.00


## Multi Layer Perceptron

In [None]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"alpha": np.arange(0.01,1,0.1),
          "hidden_layer_sizes": [(19, ), (23, ), (11, )],
          "momentum": np.arange(0.1,1,0.1),
          "activation": ["logistic", "identity", "tanh", "relu"]}

# Multi Layer Perceptron Classifier
model = MLPClassifier(solver='sgd', learning_rate='adaptive', learning_rate_init=0.1, max_iter=1500, shuffle=True,
                      tol=0.001, verbose=0, nesterovs_momentum=False, early_stopping=True)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


## Logistic Regression

In [14]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(0.1,2,0.3),
          "solver": ["lbfgs", "newton-cg",],
          "multi_class": ["ovr", "multinomial"]}

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 2,
           "Disturbo depressivo" : 3}

# Logistic Regression Classifier
model = LogisticRegression(penalty='l2', tol=0.0001, fit_intercept=True, intercept_scaling=1, 
                           class_weight= weights, random_state=None, max_iter=1000, verbose=0, l1_ratio=None)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6767190714075816


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6767190714075816


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6797678518953865


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.679841316485454
0.6765721422274464
Mean score:  0.6779238906846901


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.6s finished


## KNN numerical

In [6]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"metric": ["manhattan", "euclidean"],
          "n_neighbors": np.arange(1,20,1),
          "weights": ["distance", "uniform"]}

# KNN Classifier
# model = NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2)


# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7159124302086395


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7008154569497502


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7219732588892154


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6795841904202174
0.7008154569497502
Mean score:  0.7038201586835147


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.6s finished


## SVC with rbf kernel

In [7]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(0.1,2,0.3),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

# We will use a Support Vector Classifier
model = SVC(kernel="rbf")

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6948648251542757


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6737437555098443


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6858286805759624


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.691962973846606
0.700852189244784
Mean score:  0.6894504848662945


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.0s finished


## SVC with polinomial kernel

In [77]:
# Normalize data
df_data_num_norm = pd.DataFrame(data = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True), columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(0.1,2,0.3),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

# We will use a Support Vector Classifier
model = SVC(kernel="poly", class_weight = None)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6948648251542757


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6949382897443432


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7039377020276227


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.691962973846606
0.6737070232148106
Mean score:  0.6918821627975318


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.4s finished


# Hyperparameter Selection

In [79]:
# Split dataset (2/3 training , 1/3 test)
data_train, data_test, target_train, target_test = train_test_split(df_data_num.values, df_target.values, test_size=0.33, stratify = df_target)

# Normalize data
data_train_norm = minmax_scale(data_train, feature_range=(0, 1), axis=0, copy=True)
data_test_norm = minmax_scale(data_test, feature_range=(0, 1), axis=0, copy=True)

# Select best hyerparameter
p_grid = {"C": np.arange(0.1,2,0.3),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}


weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

model = SVC(kernel="poly", class_weight = weights)
hyp_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv= hyp_cv, verbose = 1, scoring = 'accuracy')

clf.fit(data_train_norm,target_train)

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)


df_score = pd.DataFrame(clf.cv_results_)
df_score = df_score.sort_values(by=["rank_test_score"])
df_score = df_score.loc[:,"params":"rank_test_score"]
df_score.head()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 210 candidates, totalling 2100 fits
Best score:  0.7148221343873519
Best params:  {'C': 1.0000000000000002, 'degree': 3, 'gamma': 0.21000000000000002}


[Parallel(n_jobs=1)]: Done 2100 out of 2100 | elapsed:    7.3s finished


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
112,"{'C': 1.0000000000000002, 'degree': 3, 'gamma'...",0.73913,0.818182,0.590909,0.727273,0.772727,0.681818,0.727273,0.681818,0.681818,0.727273,0.714822,0.058064,1
53,"{'C': 0.4, 'degree': 3, 'gamma': 0.31000000000...",0.73913,0.818182,0.590909,0.727273,0.772727,0.681818,0.727273,0.636364,0.681818,0.727273,0.710277,0.062108,2
142,"{'C': 1.3000000000000003, 'degree': 3, 'gamma'...",0.73913,0.818182,0.590909,0.727273,0.772727,0.681818,0.727273,0.636364,0.681818,0.727273,0.710277,0.062108,2
192,"{'C': 1.9000000000000004, 'degree': 2, 'gamma'...",0.782609,0.772727,0.590909,0.727273,0.772727,0.681818,0.727273,0.636364,0.681818,0.727273,0.710079,0.059355,4
54,"{'C': 0.4, 'degree': 3, 'gamma': 0.41000000000...",0.73913,0.727273,0.636364,0.727273,0.772727,0.636364,0.727273,0.636364,0.727273,0.727273,0.705731,0.047286,5


# Model Evaluation

## Test set evaluation

In [80]:
# Evaluating model on 1/3 of dataset
weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1.5,
           "Disturbo depressivo" : 2}

model = SVC(kernel="poly", C = 1,  degree = 3, gamma = 0.2, class_weight = weights)

model.fit(data_train_norm, target_train)
prediction = model.predict(data_test_norm)
# print("Accuracy: ", svm.score(data_test, target_test))
print("Accuracy: ", accuracy_score(target_test, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(target_test, prediction, labels= labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm

Accuracy:  0.7272727272727273


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,64,2,1,0
1,Malattia di Alzheimer,7,12,2,0
2,Disturbo cognitivo lieve,5,6,3,0
3,Disturbo depressivo,4,3,0,1


## Cross validation on entire dataset

In [81]:
# Normalize data
data_norm = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True)

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

model = SVC(kernel="poly", C = 1,  degree = 3, gamma = 0.2, class_weight = weights)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
prediction = cross_val_predict(model, data_norm, df_target.values, cv=cv, verbose=5)


# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Accuracy: ", accuracy_score(df_target.values, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(df_target.values, prediction, labels = labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm


Accuracy:  0.6918429003021148


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,190,11,2,0
1,Malattia di Alzheimer,27,33,4,0
2,Disturbo cognitivo lieve,16,19,6,0
3,Disturbo depressivo,15,8,0,0


# New instances prediction

In [82]:
# Normalize data
data_norm = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True)
data_test_num_norm = minmax_scale(df_data_test_num, feature_range=(0, 1), axis=0, copy=True)

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

model = SVC(kernel="poly", C = 1,  degree = 3, gamma = 0.2, class_weight = weights)

model.fit(data_norm,df_target.values)
prediction = model.predict(data_test_num_norm)

df_sub = result_test[["ID"]]
df_sub.insert(len(df_sub.columns), "Patologia", prediction)

df_sub.groupby("Patologia").count()

Unnamed: 0_level_0,ID
Patologia,Unnamed: 1_level_1
Assenza di patologia,106
Disturbo cognitivo lieve,16
Malattia di Alzheimer,42


In [83]:
df_sub

Unnamed: 0,ID,Patologia
0,Row6,Disturbo cognitivo lieve
1,Row8,Malattia di Alzheimer
2,Row9,Disturbo cognitivo lieve
3,Row10,Assenza di patologia
4,Row13,Malattia di Alzheimer
...,...,...
159,Row505,Assenza di patologia
160,Row506,Malattia di Alzheimer
161,Row507,Assenza di patologia
162,Row508,Assenza di patologia


In [13]:
df_sub.to_csv("SVC_poly_1p3_3_0p5.csv", index = False)