In [1]:
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import minmax_scale, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import Constants as const
import numpy as np
import pandas as pd

# Data Preparation

## Data Cleaning

In [2]:
def corr_esito(df):
    perc = 10/100
    for i in range(df.shape[0]):
        if df.loc[i,"MMSE_PC"] == df.loc[i,"MMSE_PC"]:
            if df.loc[i,"MMSE_PC"] < const.const_dict["MMSE_CUTOFF"] + const.const_dict["MMSE_CUTOFF"]*perc and df.loc[i,"MMSE_PC"] > const.const_dict["MMSE_CUTOFF"] - const.const_dict["MMSE_CUTOFF"]*perc:
                df.loc[i,"MMSE_ESITO"] = "LIMITI"
            elif df.loc[i,"MMSE_PC"] < const.const_dict["MMSE_CUTOFF"] - const.const_dict["MMSE_CUTOFF"]*perc:
                df.loc[i,"MMSE_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"MMSE_ESITO"] = "NORMA"
        if df.loc[i,"MMSE_PG"] == df.loc[i,"MMSE_PG"]:
            if df.loc[i,"MMSE_PG"] < const.const_dict["MMSE_CUTOFF"] + const.const_dict["MMSE_CUTOFF"]*perc and df.loc[i,"MMSE_PG"] > const.const_dict["MMSE_CUTOFF"] - const.const_dict["MMSE_CUTOFF"]*perc:
                df.loc[i,"MMSE_ESITO"] = "LIMITI"
            elif df.loc[i,"MMSE_PG"] < const.const_dict["MMSE_CUTOFF"] - const.const_dict["MMSE_CUTOFF"]*perc:
                df.loc[i,"MMSE_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"MMSE_ESITO"] = "NORMA"
        
        
        if df.loc[i,"COPIAFIGURAREY_PC"] == df.loc[i,"COPIAFIGURAREY_PC"]:
            if df.loc[i,"COPIAFIGURAREY_PC"] < const.const_dict["COPIAFIGURAREY_CUTOFF"] + const.const_dict["COPIAFIGURAREY_CUTOFF"]*perc and df.loc[i,"COPIAFIGURAREY_PC"] > const.const_dict["COPIAFIGURAREY_CUTOFF"] - const.const_dict["COPIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"COPIAFIGURAREY_ESITO"] = "LIMITI"
            elif df.loc[i,"COPIAFIGURAREY_PC"] < const.const_dict["COPIAFIGURAREY_CUTOFF"] - const.const_dict["COPIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"COPIAFIGURAREY_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"COPIAFIGURAREY_ESITO"] = "NORMA"
        if df.loc[i,"COPIAFIGURAREY_PG"] == df.loc[i,"COPIAFIGURAREY_PG"]:
            if df.loc[i,"COPIAFIGURAREY_PG"] < const.const_dict["COPIAFIGURAREY_CUTOFF"] + const.const_dict["COPIAFIGURAREY_CUTOFF"]*perc and df.loc[i,"COPIAFIGURAREY_PG"] > const.const_dict["COPIAFIGURAREY_CUTOFF"] - const.const_dict["COPIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"COPIAFIGURAREY_ESITO"] = "LIMITI"
            elif df.loc[i,"COPIAFIGURAREY_PG"] < const.const_dict["COPIAFIGURAREY_CUTOFF"] - const.const_dict["COPIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"COPIAFIGURAREY_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"COPIAFIGURAREY_ESITO"] = "NORMA"
                
        
        if df.loc[i,"PAROLEREYIMM_PC"] == df.loc[i,"PAROLEREYIMM_PC"]:
            if df.loc[i,"PAROLEREYIMM_PC"] < const.const_dict["PAROLEREYIMM_CUTOFF"] + const.const_dict["PAROLEREYIMM_CUTOFF"]*perc and df.loc[i,"PAROLEREYIMM_PC"] > const.const_dict["PAROLEREYIMM_CUTOFF"] - const.const_dict["PAROLEREYIMM_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYIMM_ESITO"] = "LIMITI"
            elif df.loc[i,"PAROLEREYIMM_PC"] < const.const_dict["PAROLEREYIMM_CUTOFF"] - const.const_dict["PAROLEREYIMM_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYIMM_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"PAROLEREYIMM_ESITO"] = "NORMA"
        if df.loc[i,"PAROLEREYIMM_PG"] == df.loc[i,"PAROLEREYIMM_PG"]:
            if df.loc[i,"PAROLEREYIMM_PG"] < const.const_dict["PAROLEREYIMM_CUTOFF"] + const.const_dict["PAROLEREYIMM_CUTOFF"]*perc and df.loc[i,"PAROLEREYIMM_PG"] > const.const_dict["PAROLEREYIMM_CUTOFF"] - const.const_dict["PAROLEREYIMM_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYIMM_ESITO"] = "LIMITI"
            elif df.loc[i,"PAROLEREYIMM_PG"] < const.const_dict["PAROLEREYIMM_CUTOFF"] - const.const_dict["PAROLEREYIMM_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYIMM_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"PAROLEREYIMM_ESITO"] = "NORMA"
        
        
        if df.loc[i,"PAROLEREYDIFF_PC"] == df.loc[i,"PAROLEREYDIFF_PC"]:
            if df.loc[i,"PAROLEREYDIFF_PC"] < const.const_dict["PAROLEREYDIFF_CUTOFF"] + const.const_dict["PAROLEREYDIFF_CUTOFF"]*perc and df.loc[i,"PAROLEREYDIFF_PC"] > const.const_dict["PAROLEREYDIFF_CUTOFF"] - const.const_dict["PAROLEREYDIFF_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYDIFF_ESITO"] = "LIMITI"
            elif df.loc[i,"PAROLEREYDIFF_PC"] < const.const_dict["PAROLEREYDIFF_CUTOFF"] - const.const_dict["PAROLEREYDIFF_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYDIFF_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"PAROLEREYDIFF_ESITO"] = "NORMA"
        if df.loc[i,"PAROLEREYDIFF_PG"] == df.loc[i,"PAROLEREYDIFF_PG"]:
            if df.loc[i,"PAROLEREYDIFF_PG"] < const.const_dict["PAROLEREYDIFF_CUTOFF"] + const.const_dict["PAROLEREYDIFF_CUTOFF"]*perc and df.loc[i,"PAROLEREYDIFF_PG"] > const.const_dict["PAROLEREYDIFF_CUTOFF"] - const.const_dict["PAROLEREYDIFF_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYDIFF_ESITO"] = "LIMITI"
            elif df.loc[i,"PAROLEREYDIFF_PG"] < const.const_dict["PAROLEREYDIFF_CUTOFF"] - const.const_dict["PAROLEREYDIFF_CUTOFF"]*perc:
                df.loc[i,"PAROLEREYDIFF_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"PAROLEREYDIFF_ESITO"] = "NORMA"
        
        
        if df.loc[i,"MEMORIAFIGURAREY_PC"] == df.loc[i,"MEMORIAFIGURAREY_PC"]:
            if df.loc[i,"MEMORIAFIGURAREY_PC"] < const.const_dict["MEMORIAFIGURAREY_CUTOFF"] + const.const_dict["MEMORIAFIGURAREY_CUTOFF"]*perc and df.loc[i,"MEMORIAFIGURAREY_PC"] > const.const_dict["MEMORIAFIGURAREY_CUTOFF"] - const.const_dict["MEMORIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"MEMORIAFIGURAREY_ESITO"] = "LIMITI"
            elif df.loc[i,"MEMORIAFIGURAREY_PC"] < const.const_dict["MEMORIAFIGURAREY_CUTOFF"] - const.const_dict["MEMORIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"MEMORIAFIGURAREY_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"MEMORIAFIGURAREY_ESITO"] = "NORMA"
        if df.loc[i,"MEMORIAFIGURAREY_PG"] == df.loc[i,"MEMORIAFIGURAREY_PG"]:
            if df.loc[i,"MEMORIAFIGURAREY_PG"] < const.const_dict["MEMORIAFIGURAREY_CUTOFF"] + const.const_dict["MEMORIAFIGURAREY_CUTOFF"]*perc and df.loc[i,"MEMORIAFIGURAREY_PG"] > const.const_dict["MEMORIAFIGURAREY_CUTOFF"] - const.const_dict["MEMORIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"MEMORIAFIGURAREY_ESITO"] = "LIMITI"
            elif df.loc[i,"MEMORIAFIGURAREY_PG"] < const.const_dict["MEMORIAFIGURAREY_CUTOFF"] - const.const_dict["MEMORIAFIGURAREY_CUTOFF"]*perc:
                df.loc[i,"MEMORIAFIGURAREY_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"MEMORIAFIGURAREY_ESITO"] = "NORMA"
        
        
        if df.loc[i,"FAB_PC"] == df.loc[i,"FAB_PC"]:
            if df.loc[i,"FAB_PC"] < const.const_dict["FAB_CUTOFF"] + const.const_dict["FAB_CUTOFF"]*perc and df.loc[i,"FAB_PC"] > const.const_dict["FAB_CUTOFF"] - const.const_dict["FAB_CUTOFF"]*perc:
                df.loc[i,"FAB_ESITO"] = "LIMITI"
            elif df.loc[i,"FAB_PC"] < const.const_dict["FAB_CUTOFF"] - const.const_dict["FAB_CUTOFF"]*perc:
                df.loc[i,"FAB_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"FAB_ESITO"] = "NORMA"
        if df.loc[i,"FAB_PG"] == df.loc[i,"FAB_PG"]:
            if df.loc[i,"FAB_PG"] < const.const_dict["FAB_CUTOFF"] + const.const_dict["FAB_CUTOFF"]*perc and df.loc[i,"FAB_PG"] > const.const_dict["FAB_CUTOFF"] - const.const_dict["FAB_CUTOFF"]*perc:
                df.loc[i,"FAB_ESITO"] = "LIMITI"
            elif df.loc[i,"FAB_PG"] < const.const_dict["FAB_CUTOFF"] - const.const_dict["FAB_CUTOFF"]*perc:
                df.loc[i,"FAB_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"FAB_ESITO"] = "NORMA"
        
        
        if df.loc[i,"FLUENZAVERBFON_PC"] == df.loc[i,"FLUENZAVERBFON_PC"]:
            if df.loc[i,"FLUENZAVERBFON_PC"] < const.const_dict["FLUENZAVERBFON_CUTOFF"] + const.const_dict["FLUENZAVERBFON_CUTOFF"]*perc and df.loc[i,"FLUENZAVERBFON_PC"] > const.const_dict["FLUENZAVERBFON_CUTOFF"] - const.const_dict["FLUENZAVERBFON_CUTOFF"]*perc:
                df.loc[i,"FLUENZAVERBFON_ESITO"] = "LIMITI"
            elif df.loc[i,"FLUENZAVERBFON_PC"] < const.const_dict["FLUENZAVERBFON_CUTOFF"] - const.const_dict["FLUENZAVERBFON_CUTOFF"]*perc:
                df.loc[i,"FLUENZAVERBFON_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"FLUENZAVERBFON_ESITO"] = "NORMA"
        if df.loc[i,"FLUENZAVERBFON_PG"] == df.loc[i,"FLUENZAVERBFON_PG"]:
            if df.loc[i,"FLUENZAVERBFON_PG"] < const.const_dict["FLUENZAVERBFON_CUTOFF"] + const.const_dict["FLUENZAVERBFON_CUTOFF"]*perc and df.loc[i,"FLUENZAVERBFON_PG"] > const.const_dict["FLUENZAVERBFON_CUTOFF"] - const.const_dict["FLUENZAVERBFON_CUTOFF"]*perc:
                df.loc[i,"FLUENZAVERBFON_ESITO"] = "LIMITI"
            elif df.loc[i,"FLUENZAVERBFON_PG"] < const.const_dict["FLUENZAVERBFON_CUTOFF"] - const.const_dict["FLUENZAVERBFON_CUTOFF"]*perc:
                df.loc[i,"FLUENZAVERBFON_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"FLUENZAVERBFON_ESITO"] = "NORMA"
        
        
        if df.loc[i,"TESTMATRICIATTENTIVE_PC"] == df.loc[i,"TESTMATRICIATTENTIVE_PC"]:
            if df.loc[i,"TESTMATRICIATTENTIVE_PC"] < const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"] + const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"]*perc and df.loc[i,"TESTMATRICIATTENTIVE_PC"] > const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"] - const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"]*perc:
                df.loc[i,"TESTMATRICIATTENTIVE_ESITO"] = "LIMITI"
            elif df.loc[i,"TESTMATRICIATTENTIVE_PC"] < const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"] - const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"]*perc:
                df.loc[i,"TESTMATRICIATTENTIVE_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"TESTMATRICIATTENTIVE_ESITO"] = "NORMA"
        if df.loc[i,"TESTMATRICIATTENTIVE_PG"] == df.loc[i,"TESTMATRICIATTENTIVE_PG"]:
            if df.loc[i,"TESTMATRICIATTENTIVE_PG"] < const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"] + const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"]*perc and df.loc[i,"TESTMATRICIATTENTIVE_PG"] > const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"] - const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"]*perc:
                df.loc[i,"TESTMATRICIATTENTIVE_ESITO"] = "LIMITI"
            elif df.loc[i,"TESTMATRICIATTENTIVE_PG"] < const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"] - const.const_dict["TESTMATRICIATTENTIVE_CUTOFF"]*perc:
                df.loc[i,"TESTMATRICIATTENTIVE_ESITO"] = "DEFICIT"
            else:
                df.loc[i,"TESTMATRICIATTENTIVE_ESITO"] = "NORMA"
        

In [3]:
# Load dataset
df = pd.read_excel('TrainingSet.xlsx')
df_test = pd.read_excel('TestSet.xlsx')

# Convert Sex attribute in UpperCase
df['Sesso'] = df['Sesso'].str.upper()

# Remove CAP attribute
df = df.drop(columns =['CAP'])
df_test = df_test.drop(columns =['CAP'])
# df = df.drop(columns =['Sesso'])
# df = df.drop(columns =['ID'])

# Remove unlabeled instances
df = df.loc[df["Patologia"] == df["Patologia"]]

#Correct Esito columns
corr_esito(df)

dim = df.shape[0]
df.head()

Unnamed: 0,ID,Scolarita,Eta,Sesso,MMSE_PG,MMSE_PC,MMSE_PE,MMSE_ESITO,CLOCKTEST_PG,CLOCKTEST_PE,...,FAB_ESITO,FLUENZAVERBFON_PG,FLUENZAVERBFON_PC,FLUENZAVERBFON_PE,FLUENZAVERBFON_ESITO,TESTMATRICIATTENTIVE_PG,TESTMATRICIATTENTIVE_PC,TESTMATRICIATTENTIVE_PE,TESTMATRICIATTENTIVE_ESITO,Patologia
0,Row0,13.0,65.0,F,29.0,28.49,238.0,NORMA,5.0,7.0,...,,24.0,20.9,1.0,NORMA,47.0,45.75,3.0,NORMA,Malattia di Alzheimer
1,Row1,8.0,70.0,M,27.0,28.2,238.0,NORMA,6.0,5.0,...,,31.0,34.9,4.0,NORMA,,,,,Assenza di patologia
2,Row2,12.0,70.0,F,27.0,26.86,238.0,NORMA,,7.0,...,,24.0,27.9,3.0,NORMA,45.0,47.0,3.0,NORMA,Malattia di Alzheimer
3,Row3,8.0,66.0,M,17.0,17.53,238.0,DEFICIT,2.0,5.0,...,DEFICIT,4.0,7.2,0.0,DEFICIT,7.0,5.25,0.0,DEFICIT,Malattia di Alzheimer
4,Row4,7.0,82.0,M,25.0,25.0,238.0,LIMITI,1.0,3.0,...,DEFICIT,11.0,21.4,2.0,DEFICIT,37.0,41.75,2.0,LIMITI,Malattia di Alzheimer


## Imputation

In [4]:
# Imputation
num_col = [c for c in df.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("patologia" in c.lower()) and not("sesso" in c.lower()) and not("id" in c.lower())]
enum_col = [c for c in df.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_imp_num = df.loc[:,num_col]
df_imp_enum = df.loc[:,enum_col]

columns = df_imp_num.columns.append(df_imp_enum.columns).insert(0, "ID")

num_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# num_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0)

num_imputer.fit(df_imp_num.values)
df_imp_num = pd.DataFrame(num_imputer.transform(df_imp_num.values))

enum_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
enum_imputer.fit(df_imp_enum.values)
df_imp_enum = pd.DataFrame(enum_imputer.transform(df_imp_enum.values))

df_imp_num.insert(0, "ID", df["ID"])

df_imp_enum.insert(0, "ID", df["ID"])
# df_imp_num

# df_imp_num.append(df_imp_enum, axis = 1)

result = pd.merge(df_imp_num, df_imp_enum, on = "ID")
result.columns = columns
result["Patologia"] = df["Patologia"]
result

# Test set imputation
num_col = [c for c in df_test.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("sesso" in c.lower()) and not("id" in c.lower())]
enum_col = [c for c in df_test.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_imp_num = df_test.loc[:,num_col]
df_imp_enum = df_test.loc[:,enum_col]

df_imp_num = pd.DataFrame(num_imputer.transform(df_imp_num.values))
df_imp_enum = pd.DataFrame(enum_imputer.transform(df_imp_enum.values))

df_imp_num.insert(0, "ID", df_test["ID"])
df_imp_enum.insert(0, "ID", df_test["ID"])

result_test = pd.merge(df_imp_num, df_imp_enum, on = "ID")
result_test.columns = columns


# Model Selection

In [5]:
# Splitting Training data and prediction
num_col = [c for c in df.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("patologia" in c.lower()) and not("sesso" in c.lower()) and not("id" in c.lower())]
nom_col = [c for c in df.columns if("_esito" in c.lower() or "sesso" in c.lower())]

df_data_num = result[num_col]
df_data_nom = result[nom_col]
df_data_num_nom = result.drop(["ID", "Patologia"], axis = 1)
df_target = result['Patologia']

# Encode nominal columns
df_data_nom = result[nom_col]

label_enc = LabelEncoder()

arr_enc = np.zeros((df_data_nom.shape[1], df_data_nom.shape[0]))
i = 0
for col in df_data_nom.columns:
    label_enc.fit(df_data_nom[col])
    arr_enc[i] = label_enc.transform(df_data_nom[col])
    i = i+1
arr_enc = arr_enc.T

df_data_nom_enc = pd.DataFrame(data = arr_enc, columns = df_data_nom.columns)
n_col = [c for c in result.columns if "esito" in c.lower()]
df_data_num_nom.drop(n_col, axis=1)

for c in df_data_nom_enc.columns:
    df_data_num_nom[c] = df_data_nom_enc[c]

df_data_nom = df_data_nom_enc


# Splitting Test Set data
num_col = [c for c in df_test.columns if not("_esito" in c.lower()) and not("_pe" in c.lower()) 
           and not("sesso" in c.lower()) and not("id" in c.lower())]
nom_col = [c for c in df_test.columns if("_esito" in c.lower()) or ("sesso" in c.lower())]

df_data_test_num = result_test[num_col]
df_data_test_nom = result_test[nom_col]
df_data_test_num_nom = result_test.drop(["ID"], axis = 1)


df_data_num_nom

Unnamed: 0,Scolarita,Eta,MMSE_PG,MMSE_PC,CLOCKTEST_PG,COPIAFIGURAREY_PG,COPIAFIGURAREY_PC,PAROLEREYIMM_PG,PAROLEREYIMM_PC,PAROLEREYDIFF_PG,...,Sesso,MMSE_ESITO,CLOCKTEST_ESITO,COPIAFIGURAREY_ESITO,PAROLEREYIMM_ESITO,PAROLEREYDIFF_ESITO,MEMORIAFIGURAREY_ESITO,FAB_ESITO,FLUENZAVERBFON_ESITO,TESTMATRICIATTENTIVE_ESITO
0,13.0,65.0,29.0,28.49,5.0,34.0,34.75,28.0,29.3,8.0,...,0.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0
1,8.0,70.0,27.0,28.20,6.0,34.0,36.00,28.0,33.9,6.0,...,1.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,2.0,2.0
2,12.0,70.0,27.0,26.86,8.0,31.0,18.50,21.0,26.9,6.0,...,0.0,2.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0
3,8.0,66.0,17.0,17.53,2.0,2.0,3.50,12.0,16.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,7.0,82.0,25.0,25.00,1.0,15.0,17.75,18.0,30.2,3.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,5.0,76.0,29.0,31.03,5.0,28.0,30.75,25.0,35.0,1.0,...,1.0,2.0,2.0,1.0,0.0,0.0,3.0,0.0,0.0,2.0
327,14.0,54.0,28.0,26.31,10.0,35.0,33.75,19.0,15.5,4.0,...,0.0,2.0,2.0,2.0,0.0,0.0,3.0,2.0,2.0,2.0
328,13.0,53.0,30.0,28.99,10.0,35.0,34.25,35.0,31.5,6.0,...,1.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0
329,5.0,67.0,28.0,29.27,8.0,31.0,33.00,40.0,46.1,7.0,...,0.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0


## Decision Tree



In [6]:
# Set up possible values of parameters to optimize over
p_grid = {"criterion": ["gini","entropy"],
          "min_samples_split": np.arange(2,20,1),
          "min_samples_leaf": np.arange(2,20,1)}

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1}

# Logistic Regression Classifier
model = DecisionTreeClassifier(splitter='best', min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None,
                               min_impurity_decrease=0.0, min_impurity_split=None, class_weight=weights, ccp_alpha=0.0)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   24.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.613062004114017


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   22.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6375991771965912


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   21.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6405377607992948


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   23.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.643696738172201
0.6496841022627093
Mean score:  0.6369159565089626


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   21.8s finished


## Logistic Regression

In [28]:
# Normalize data
scaler = MinMaxScaler()
scaler.fit(df_data_num)
df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(0.1,2,0.3),
          "solver": ["lbfgs", "newton-cg",],
          "multi_class": ["ovr", "multinomial"]}

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 2,
           "Disturbo depressivo" : 3}

# Logistic Regression Classifier
model = LogisticRegression(penalty='l2', tol=0.0001, fit_intercept=True, intercept_scaling=1, 
                           class_weight= weights, random_state=None, max_iter=1000, verbose=0, l1_ratio=None)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6767190714075816


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6767190714075816


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6797678518953865


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   12.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.679841316485454
0.6765721422274464
Mean score:  0.6779238906846901


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.5s finished


## Multi Layer Perceptron

In [27]:
# Normalize data
scaler = MinMaxScaler()
scaler.fit(df_data_num)
df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"alpha": np.arange(0.01,1,0.1),
          "hidden_layer_sizes": [(19, ), (23, ), (11, )],
          # "momentum": np.arange(0.1,1,0.1),
          "activation": ["logistic", "identity", "tanh", "relu"]}

# Multi Layer Perceptron Classifier
model = MLPClassifier(solver='adam', learning_rate='adaptive', learning_rate_init=0.1, max_iter=1500, shuffle=True,
                      tol=0.001, verbose=0, nesterovs_momentum=False, early_stopping=True)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   38.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6827431677931237


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   34.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6194166911548634


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   35.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6313179547458125


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   42.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6375257126065237
0.66169556273876
Mean score:  0.6465398178078167


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   40.0s finished


## KNN numerical

In [7]:
# Normalize data
scaler = MinMaxScaler()
scaler.fit(df_data_num)
df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"metric": ["manhattan", "euclidean"],
          "n_neighbors": np.arange(1,20,1),
          "weights": ["distance", "uniform"]}

# KNN Classifier
# model = NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2)


# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7159124302086395


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7008154569497502


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7219732588892154


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6795841904202174
0.7008154569497502
Mean score:  0.7038201586835147


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.3s finished


## KNN nominal

In [13]:
# Set up possible values of parameters to optimize over
p_grid = {"metric": ["manhattan", "euclidean"],
          "n_neighbors": np.arange(1,20,1),
          "weights": ["distance", "uniform"]}

# KNN Classifier
# model = NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2)


# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# # PROVE
# k = [c for c in df_data_num_nom.columns if "sesso" not in c.lower() and "scolarita" not in c.lower() 
#                                         and "eta" not in c.lower() and "_pg" not in c.lower()]
# df_data_num_nom_PROVA = df_data_num_nom[k]

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, scoring = 'accuracy', verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_nom.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6766088745224802


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.664707610931531


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6706582427270056


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6497942991478107
0.6766823391125477
Mean score:  0.6676902732882751


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.1s finished


## SVC with rbf kernel

In [29]:
# Normalize data
scaler = MinMaxScaler()
scaler.fit(df_data_num)
df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)

# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(0.1,2,0.3),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

# We will use a Support Vector Classifier
model = SVC(kernel="rbf")

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6978768733470466


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6767558037026155


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6919262415515721


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6949750220393769
0.6948280928592419
Mean score:  0.6912724066999706


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.2s finished


## SVC with polinomial kernel

In [10]:
# Normalize data
scaler = MinMaxScaler()
scaler.fit(df_data_num)
df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)


# Set up possible values of parameters to optimize over
p_grid = {"C": np.arange(0.1,2,0.3),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

# We will use a Support Vector Classifier
model = SVC(kernel="poly", class_weight = None)

# Number of random trials
NUM_TRIALS = 5

# Arrays to store scores
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# PROVE
# k = [c for c in df_data_num_nom_norm.columns if "sesso" not in c.lower() ]
# df_data_num_nom_norm_PROVA = df_data_num_nom_norm[k]

# Model selection
for i in range(NUM_TRIALS):
    
    # Choose cross-validation techniques for the inner and outer loops
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv, verbose = 0)

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=df_data_num_norm.values, y=df_target.values, cv=outer_cv, verbose = 1)
    print(nested_score.mean())
    nested_scores[i] = nested_score.mean()
    
print("Mean score: ", nested_scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6948648251542757


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.6949382897443432


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.7039377020276227


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.691962973846606
0.6737070232148106
Mean score:  0.6918821627975318


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   10.9s finished


# Selected models

## SVC Classifier


### Hyperparameter Selection SVC



In [7]:
# Split dataset (2/3 training , 1/3 test)
data_train, data_test, target_train, target_test = train_test_split(df_data_num.values, df_target.values, test_size=0.33, stratify = df_target)

# Normalize data
scaler = MinMaxScaler()
scaler.fit(data_train)

data_train_norm = scaler.transform(data_train)
data_train_norm = pd.DataFrame(data = data_train_norm, columns = df_data_num.columns)

data_test_norm = scaler.transform(data_test)
data_test_norm = pd.DataFrame(data = data_test_norm, columns = df_data_num.columns)

# data_train_norm = minmax_scale(data_train, feature_range=(0, 1), axis=0, copy=True)
# data_test_norm = minmax_scale(data_test, feature_range=(0, 1), axis=0, copy=True)

# Select best hyerparameter
p_grid = {"C": np.arange(0.1,2,0.3),
          "gamma": np.arange(0.01,1,0.1),
          "degree":[1,2,3]}


weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

model = SVC(kernel="poly", class_weight = weights)
hyp_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv= hyp_cv, verbose = 1, scoring = 'accuracy')

clf.fit(data_train_norm,target_train)

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)


df_score = pd.DataFrame(clf.cv_results_)
df_score = df_score.sort_values(by=["rank_test_score"])
df_score = df_score.loc[:,"params":"rank_test_score"]
df_score.head()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 210 candidates, totalling 2100 fits
Best score:  0.733596837944664
Best params:  {'C': 1.9000000000000004, 'degree': 3, 'gamma': 0.21000000000000002}


[Parallel(n_jobs=1)]: Done 2100 out of 2100 | elapsed:   13.5s finished


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
202,"{'C': 1.9000000000000004, 'degree': 3, 'gamma'...",0.608696,0.727273,0.772727,0.772727,0.863636,0.681818,0.681818,0.681818,0.818182,0.727273,0.733597,0.07131,1
195,"{'C': 1.9000000000000004, 'degree': 2, 'gamma'...",0.652174,0.818182,0.818182,0.772727,0.727273,0.681818,0.636364,0.681818,0.818182,0.727273,0.733399,0.066724,2
136,"{'C': 1.3000000000000003, 'degree': 2, 'gamma'...",0.652174,0.818182,0.818182,0.772727,0.727273,0.681818,0.636364,0.681818,0.818182,0.727273,0.733399,0.066724,2
78,"{'C': 0.7000000000000001, 'degree': 2, 'gamma'...",0.652174,0.818182,0.818182,0.772727,0.727273,0.681818,0.636364,0.681818,0.818182,0.727273,0.733399,0.066724,2
172,"{'C': 1.6000000000000003, 'degree': 3, 'gamma'...",0.565217,0.727273,0.772727,0.772727,0.863636,0.727273,0.681818,0.681818,0.772727,0.727273,0.729249,0.074262,5


### Model Evaluation

In [8]:
# Evaluating model on 1/3 of dataset
weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

model = SVC(kernel="poly", C = 1,  degree = 3, gamma = 0.2, class_weight = weights)

model.fit(data_train_norm, target_train)
prediction = model.predict(data_test_norm)
# print("Accuracy: ", svm.score(data_test, target_test))
print("Accuracy: ", accuracy_score(target_test, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(target_test, prediction, labels= labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm

Accuracy:  0.6818181818181818


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,65,0,2,0
1,Malattia di Alzheimer,10,8,3,0
2,Disturbo cognitivo lieve,3,9,2,0
3,Disturbo depressivo,5,3,0,0


### New instances prediction


In [9]:
# Normalize data
scaler = MinMaxScaler()
scaler.fit(df_data_num)

df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)

df_data_test_num_norm = scaler.transform(df_data_test_num)
df_data_test_num_norm = pd.DataFrame(data = df_data_test_num_norm, columns = df_data_test_num.columns)
# data_norm = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True)
# data_test_num_norm = minmax_scale(df_data_test_num, feature_range=(0, 1), axis=0, copy=True)

weights = {"Assenza di patologia" : 1,
           "Malattia di Alzheimer" : 1,
           "Disturbo cognitivo lieve" : 1,
           "Disturbo depressivo" : 1.5}

model = SVC(kernel="poly", C = 1,  degree = 3, gamma = 0.2, class_weight = weights)

model.fit(df_data_num_norm.values,df_target.values)
prediction = model.predict(df_data_test_num_norm.values)

df_sub = result_test[["ID"]]
df_sub.insert(len(df_sub.columns), "Patologia", prediction)

df_sub.groupby("Patologia").count().sort_values(by="Patologia")

Unnamed: 0_level_0,ID
Patologia,Unnamed: 1_level_1
Assenza di patologia,124
Disturbo cognitivo lieve,4
Disturbo depressivo,1
Malattia di Alzheimer,35


In [25]:
df_sub

Unnamed: 0,ID,Patologia
0,Row6,Disturbo cognitivo lieve
1,Row8,Malattia di Alzheimer
2,Row9,Assenza di patologia
3,Row10,Assenza di patologia
4,Row13,Malattia di Alzheimer
...,...,...
159,Row505,Assenza di patologia
160,Row506,Malattia di Alzheimer
161,Row507,Assenza di patologia
162,Row508,Assenza di patologia


In [26]:
df_sub.to_csv("SVC_poly_1_3_0p2.csv", index = False)

## KNN Classifier




### Hyperparameter Selection



In [10]:
# Split dataset (2/3 training , 1/3 test)
data_train, data_test, target_train, target_test = train_test_split(df_data_num.values, df_target.values, test_size=0.33, stratify = df_target)

# Normalize data
# data_train_norm = minmax_scale(data_train, feature_range=(0, 1), axis=0, copy=True)
# data_test_norm = minmax_scale(data_test, feature_range=(0, 1), axis=0, copy=True)

scaler = MinMaxScaler()
scaler.fit(data_train)

data_train_norm = scaler.transform(data_train)
data_train_norm = pd.DataFrame(data = data_train_norm, columns = df_data_num.columns)

data_test_norm = scaler.transform(data_test)
data_test_norm = pd.DataFrame(data = data_test_norm, columns = df_data_num.columns)

# Select best hyerparameter
p_grid = {"metric": ["manhattan", "euclidean"],
          "n_neighbors": np.arange(1,20,1),
          "weights": ["distance", "uniform"]}

# KNN Classifier
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2)
hyp_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv= hyp_cv, verbose = 1, scoring = 'accuracy')

clf.fit(data_train_norm.values,target_train)

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)


df_score = pd.DataFrame(clf.cv_results_)
df_score = df_score.sort_values(by=["rank_test_score"])
df_score = df_score.loc[:,"params":"rank_test_score"]
df_score.head()

Fitting 10 folds for each of 76 candidates, totalling 760 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best score:  0.7373517786561266
Best params:  {'metric': 'manhattan', 'n_neighbors': 6, 'weights': 'distance'}


[Parallel(n_jobs=1)]: Done 760 out of 760 | elapsed:    5.8s finished


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,"{'metric': 'manhattan', 'n_neighbors': 6, 'wei...",0.782609,0.681818,0.818182,0.818182,0.772727,0.818182,0.727273,0.772727,0.636364,0.545455,0.737352,0.086232,1
12,"{'metric': 'manhattan', 'n_neighbors': 7, 'wei...",0.73913,0.772727,0.772727,0.818182,0.772727,0.772727,0.681818,0.772727,0.636364,0.545455,0.728458,0.07881,2
46,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",0.782609,0.772727,0.681818,0.818182,0.727273,0.863636,0.636364,0.818182,0.590909,0.590909,0.728261,0.093682,3
8,"{'metric': 'manhattan', 'n_neighbors': 5, 'wei...",0.782609,0.636364,0.818182,0.818182,0.727273,0.818182,0.681818,0.818182,0.636364,0.545455,0.728261,0.093682,3
26,"{'metric': 'manhattan', 'n_neighbors': 14, 'we...",0.695652,0.772727,0.818182,0.818182,0.681818,0.772727,0.772727,0.636364,0.681818,0.590909,0.724111,0.073905,5


### Model Evaluation

In [12]:
# Evaluating model on 1/3 of dataset
model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2, metric = "manhattan", n_neighbors=8, weights="distance")

model.fit(data_train_norm, target_train)
prediction = model.predict(data_test_norm)
# print("Accuracy: ", svm.score(data_test, target_test))
print("Accuracy: ", accuracy_score(target_test, prediction))
labels = ["Assenza di patologia", "Malattia di Alzheimer",  "Disturbo cognitivo lieve", "Disturbo depressivo"]
df_cm = pd.DataFrame(confusion_matrix(target_test, prediction, labels= labels, sample_weight=None, normalize=None), columns = labels)
df_cm.insert(0, "Conf. matrix", labels)
df_cm

Accuracy:  0.6545454545454545


Unnamed: 0,Conf. matrix,Assenza di patologia,Malattia di Alzheimer,Disturbo cognitivo lieve,Disturbo depressivo
0,Assenza di patologia,61,4,1,1
1,Malattia di Alzheimer,12,8,1,0
2,Disturbo cognitivo lieve,6,5,3,0
3,Disturbo depressivo,6,1,1,0


### New instances prediction

In [15]:
# Normalize data
# data_norm = minmax_scale(df_data_num, feature_range=(0, 1), axis=0, copy=True)
# data_test_num_norm = minmax_scale(df_data_test_num, feature_range=(0, 1), axis=0, copy=True)

scaler = MinMaxScaler()
scaler.fit(df_data_num)

df_data_num_norm = scaler.transform(df_data_num)
df_data_num_norm = pd.DataFrame(data = df_data_num_norm, columns = df_data_num.columns)

df_data_test_num_norm = scaler.transform(df_data_test_num)
df_data_test_num_norm = pd.DataFrame(data = df_data_test_num_norm, columns = df_data_test_num.columns)

model = KNeighborsClassifier(radius=1.0, algorithm='auto', leaf_size=30, n_jobs=2, metric = "manhattan", n_neighbors=8, weights="uniform")

model.fit(data_norm,df_target.values)
prediction = model.predict(df_data_test_num_norm.values)

df_sub = result_test[["ID"]]
df_sub.insert(len(df_sub.columns), "Patologia", prediction)

df_sub.groupby("Patologia").count().sort_values(by="Patologia")

Unnamed: 0_level_0,ID
Patologia,Unnamed: 1_level_1
Assenza di patologia,129
Disturbo cognitivo lieve,12
Malattia di Alzheimer,23


In [16]:
df_sub

Unnamed: 0,ID,Patologia
0,Row6,Disturbo cognitivo lieve
1,Row8,Assenza di patologia
2,Row9,Assenza di patologia
3,Row10,Assenza di patologia
4,Row13,Assenza di patologia
...,...,...
159,Row505,Assenza di patologia
160,Row506,Assenza di patologia
161,Row507,Assenza di patologia
162,Row508,Assenza di patologia


In [17]:
df_sub.to_csv("KNN_manhattan_8_uniform.csv", index = False)