In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

In [3]:
# Read the spreadsheet using the 'VERBALE' column as row index and discard
# the 'DATA' column straight away.
df = (
    pd.read_excel('data/IncidentiModificato.xlsx', index_col='VERBALE')
    .drop(columns=["DATA"])
)

# Feature matrix: personal data plus the per-body-part totals.
X = df.loc[:, ['SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI',
               'Tot Testa', 'Tot Torace', 'Tot Addome', 'Tot Scheletro']]
# Classification target.
y = df.loc[:, 'Mezzo']

df.head()

Unnamed: 0_level_0,SESSO,ANNI,PESO,ALTEZZA,BMI,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,Testa:Cervelletto,...,Scheletro:Rachide-cervicale,Scheletro:Rachide-toracico,Scheletro:Rachide-lombare,Scheletro:Bacino-e-sacro,Scheletro:Complesso-sterno/claveo/costale,Tot Testa,Tot Torace,Tot Addome,Tot Scheletro,Totale
VERBALE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
85567,0,81,84.0,1.75,27.428571,0,1,0,0,0,...,0,3,0,3,3,2,0,3,9,14
85829,1,69,69.0,1.62,26.291724,1,4,4,4,4,...,0,0,0,0,4,20,7,1,4,32
85977,1,71,67.0,1.55,27.887617,1,2,0,1,1,...,0,0,0,0,4,6,0,0,4,10
86220,1,54,60.0,1.59,23.733238,1,4,0,0,1,...,0,0,0,0,4,5,3,2,4,14
86247,1,78,69.0,1.67,24.740937,1,2,0,0,0,...,0,0,0,0,4,2,0,2,4,8


In [4]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole
# kernel session.
# NOTE(review): this is a global switch, so it also hides genuine
# assign-to-a-slice mistakes in every later cell; prefer explicit `.copy()`
# at the places that assign into a selection.
pd.options.mode.chained_assignment = None  # default='warn'

def integer_perturbation(n, x):
    """Shift ``n`` by ``x``, mirroring the shift when it would leave [0, 4].

    Returns ``n + x`` when that value lies in the closed interval 0..4;
    otherwise returns ``n - x``. The mirrored value is returned as-is,
    without a further range check.
    """
    shifted = n + x
    if 0 <= shifted <= 4:
        return shifted
    return n - x

def get_X_pca_totals(dataframe, exclude=None):
    """Build the feature table: personal data plus one PCA score per body part.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the personal-data columns that are not excluded and, for
        each of 'Testa', 'Torace', 'Addome', 'Scheletro', the detail columns
        whose name contains ``"<body part>:"``.
    exclude : str or iterable of str, optional
        Personal-data column name(s) to leave out of the result. Names that
        are not personal-data columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the kept personal-data
        columns followed by 'PCA Testa', 'PCA Torace', 'PCA Addome' and
        'PCA Scheletro', each the first principal component of the matching
        detail columns.
    """
    features = ['SESSO', 'ANNI', 'PESO', 'ALTEZZA','BMI']

    if exclude:
        # Treat a bare string as ONE column name. The previous bare `except`
        # fallback applied `f not in exclude` to strings as well, i.e. a
        # substring test that could drop unrelated columns.
        excluded = [exclude] if isinstance(exclude, str) else list(exclude)
        features = [f for f in features if f not in excluded]

    # Explicit copy: columns are added below, and this must neither touch the
    # caller's frame nor depend on SettingWithCopyWarning being silenced.
    X_pca = dataframe[features].copy()

    for parte_corpo in ['Testa', 'Torace', 'Addome', 'Scheletro']:
        # Collapse all "<body part>:..." detail columns into a single score.
        componente = PCA(n_components=1).fit_transform(dataframe.filter(regex=parte_corpo+":"))
        # fit_transform yields shape (n_samples, 1); flatten to a plain column.
        X_pca['PCA ' + parte_corpo] = np.asarray(componente).ravel()

    return X_pca

def contains_at_least_one(exclude, ls):
    """Return True when at least one element of ``exclude`` is also in ``ls``."""
    return any(candidate in ls for candidate in exclude)

def add_new_elements_pca(df, perturbation_kind="personal_data", error_distrib = None, n_elements=200, n_columns=3, exclude=None, excluded_feature=None):
    """Augment ``df`` with perturbed copies of its rows until it has ``n_elements`` rows.

    Each synthetic row starts from a randomly chosen original row, is
    perturbed according to ``perturbation_kind`` and is stored under the label
    ``"FAKE_<source label>_<counter>"``. Exact duplicate rows are dropped after
    every insertion, so the loop runs until ``n_elements`` distinct rows exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. Its last five columns (the totals) are dropped before
        augmenting. It must contain the personal-data columns that are not
        listed in ``exclude``.
    perturbation_kind : {'personal_data', 'body_parts', 'both'}
        Which group of columns is perturbed.
    error_distrib : dict, optional
        ``{'x': candidate offsets, 'px': their probabilities}``. Required when
        body parts are perturbed.
    n_elements : int
        Target number of rows after augmentation.
    n_columns : int
        Number of body-part columns perturbed per synthetic row.
    exclude : str or iterable of str, optional
        Personal-data column name(s) that are absent from ``df``.
    excluded_feature : pandas object, optional
        Values of the excluded column(s), indexed like ``df``. For a string
        ``exclude`` the value is read as ``excluded_feature.loc[label]``; for a
        list it is read as ``excluded_feature.loc[label][column]``. Only used
        to recompute BMI when exactly one of 'PESO' / 'ALTEZZA' is excluded.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The PCA feature table built by ``get_X_pca_totals`` on the augmented
        data, and the matching 'Mezzo' column.

    Raises
    ------
    AssertionError
        If ``perturbation_kind`` is not one of the accepted values.
    ValueError
        If body parts must be perturbed but ``error_distrib`` is missing.

    Notes
    -----
    BMI is recomputed from the perturbed weight/height whenever the row has a
    BMI column and at least one of 'PESO' / 'ALTEZZA' was perturbed. Earlier
    versions skipped this when ``exclude`` was None or a list without any of
    'PESO' / 'ALTEZZA' / 'BMI', leaving BMI inconsistent with the perturbed
    values.
    """
    assert perturbation_kind in ['personal_data', 'body_parts', 'both'], \
        "perturbation_kind must be 'personal_data', 'body_parts' or 'both'"
    perturb_body_parts = perturbation_kind in ('body_parts', 'both')
    perturb_personal_data = perturbation_kind in ('personal_data', 'both')

    if perturb_body_parts:
        # Fail early with a clear message instead of an unbound-local error
        # in the middle of the sampling loop.
        if not error_distrib:
            raise ValueError("error_distrib with keys 'x' and 'px' is required "
                             "when perturbation_kind is 'body_parts' or 'both'")
        x = error_distrib['x']
        px = error_distrib['px']

    # Drop the totals (last five columns); they are not part of the raw data.
    df_no_totals = df.drop(columns=df.columns[-5:])

    colonne_persona = ['SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI', 'Mezzo']
    excluded = []
    if exclude:
        # A bare string names exactly one column; anything else is a collection.
        excluded = [exclude] if isinstance(exclude, str) else list(exclude)
        colonne_persona = [c for c in colonne_persona if c not in excluded]
    exclude_is_single = isinstance(exclude, str)

    df_dati_persona = df_no_totals[colonne_persona]
    df_parti_corpo = df_no_totals.drop(columns=colonne_persona)

    count_elements = 0

    while len(df_no_totals) < n_elements:
        # Always sample from the ORIGINAL rows (these frames are built once).
        index = np.random.choice(df_parti_corpo.index)
        new_index = "FAKE_" + str(index) + "_" + str(count_elements)
        count_elements += 1

        # Copy the row: the perturbations below must not write back into the
        # source frames, which a plain `.loc` selection may share memory with.
        parti_corpo = df_parti_corpo.loc[index].copy()
        dati_persona = df_dati_persona.loc[index].copy()

        if perturb_body_parts:
            pert = parti_corpo.sample(n_columns)

            for (i, e) in zip(pert.index, pert.values):
                perturbation = np.random.choice(x, p=px)
                parti_corpo[i] = integer_perturbation(e, perturbation)

        if perturb_personal_data:
            # Zero-mean Gaussian noise: std 1 year, 2 kg and 1 cm respectively.
            for colonna, std in (('ANNI', 1), ('PESO', 2), ('ALTEZZA', 0.01)):
                if colonna in dati_persona.index:
                    dati_persona[colonna] += np.random.normal(0, std)

            # Recompute BMI so it stays consistent with the perturbed values.
            peso_escluso = 'PESO' in excluded
            altezza_esclusa = 'ALTEZZA' in excluded
            # Nothing to do when BMI itself is excluded, or when both inputs
            # are excluded (neither was perturbed, so BMI is still valid).
            if 'BMI' not in excluded and not (peso_escluso and altezza_esclusa):
                if peso_escluso:
                    riga_esclusa = excluded_feature.loc[index]
                    peso = riga_esclusa if exclude_is_single else riga_esclusa['PESO']
                else:
                    peso = dati_persona['PESO']

                if altezza_esclusa:
                    riga_esclusa = excluded_feature.loc[index]
                    altezza = riga_esclusa if exclude_is_single else riga_esclusa['ALTEZZA']
                else:
                    altezza = dati_persona['ALTEZZA']

                dati_persona['BMI'] = peso / (altezza ** 2)

        # Series.append was removed in pandas 2.0; pd.concat is the replacement.
        new_elem = pd.concat([parti_corpo, dati_persona])
        df_no_totals.loc[new_index] = new_elem

        # Dropping duplicates here is what the loop condition relies on.
        df_no_totals = df_no_totals.drop_duplicates()

    X_pca = get_X_pca_totals(df_no_totals, exclude)

    return X_pca, df_no_totals['Mezzo']

In [5]:
from sklearn.manifold import TSNE

# Personal data plus one PCA score per body part.
X_pca = get_X_pca_totals(df)

X_dati_continui = X_pca[['SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI']]
X_dati_PCA = X_pca[['PCA Testa', 'PCA Torace', 'PCA Addome', 'PCA Scheletro']]

# t-SNE is stochastic: pin random_state so the embeddings, and every score
# computed from them further down, are the same after Restart & Run All.
X_dati_continui_TSNE = TSNE(n_components=3, perplexity=50, random_state=0).fit_transform(X_dati_continui)
X_dati_PCA_TSNE = TSNE(n_components=3, perplexity=50, random_state=0).fit_transform(X_dati_PCA)

In [6]:
# Join the two embeddings row by row: for every sample, the coordinates from
# the personal-data embedding are followed by those from the PCA embedding.
list_TSNE = [
    np.concatenate((riga_continui, X_dati_PCA_TSNE[posizione]))
    for posizione, riga_continui in enumerate(X_dati_continui_TSNE)
]
df_TSNE = pd.DataFrame(np.array(list_TSNE))

In [7]:
# Preview the first rows of the combined t-SNE feature table.
df_TSNE.head()

Unnamed: 0,0,1,2,3,4,5
0,-79.709282,-128.047806,-46.235157,61.105129,-134.531891,65.126869
1,-50.998131,-18.539673,-102.502388,126.280449,115.804047,-102.387695
2,-59.516243,-25.953857,27.409077,-42.387997,70.604538,-19.865768
3,-10.978515,92.173149,-86.82946,44.650078,106.496246,-109.215919
4,-36.407631,19.456057,9.240723,-7.741701,-71.685135,-120.707993


In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

# Hyper-parameter grid explored by the inner grid search.
params = {'hidden_layer_sizes': [[2], [4], [6], [10], [20], [4, 4], [10, 10], [50]],
          'learning_rate_init' : [0.01, 0.2, 0.001],
          'activation': ['identity', 'logistic', 'tanh', 'relu']}

# Fixed random_state: weight initialisation and batch shuffling are random, so
# without it the selected model and the reported accuracy change on every run.
mlp = MLPClassifier(max_iter=5000, random_state=0)

# Nested cross-validation: a 5-fold grid search picks the hyper-parameters
# inside each of the 3 outer folds used to estimate accuracy.
clf = GridSearchCV(mlp, params, n_jobs=-1, cv=5)
scores = cross_val_score(clf, df_TSNE, y, cv=3)
print("\nAccuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
scores


Accuracy: 0.47 (+/- 0.13)


array([0.40909091, 0.44186047, 0.55813953])