In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

import os
import logging

# Detach and close any handler already attached to the root logger.
# logging.basicConfig() does nothing when the root logger already has handlers,
# which is the case when this cell is re-run in the same kernel (and can be the
# case when the host application installed its own handlers). Closing the old
# handler also releases 'logging.log' before it is deleted below.
for _old_handler in logging.root.handlers[:]:
    logging.root.removeHandler(_old_handler)
    _old_handler.close()

# Start every run from an empty log file.
if os.path.exists('logging.log'):
    os.remove('logging.log')
    print("File removed")
else:
    print("The file does not exist")

logging.basicConfig(
    filename='logging.log',
    format='%(asctime)s - %(levelname)s: \t%(message)s',
    datefmt='%d/%m/%Y %I:%M:%S: %p',
    level=logging.DEBUG,
)
logging.info('File di logging inizializzato.')

The file does not exist


In [2]:
# Load the spreadsheet; the 'VERBALE' column is used as the row index.
df = pd.read_excel('data/IncidentiModificato.xlsx', index_col='VERBALE')

# Feature columns: personal data followed by the four 'Tot ...' columns.
X = df[[
    'SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI',
    'Tot Testa', 'Tot Torace', 'Tot Addome', 'Tot Scheletro',
]]
# Target column (the classifiers below are fitted against 'Mezzo').
y = df['Mezzo']

# Preview of the first rows (displayed as the cell output).
df.head()

Unnamed: 0_level_0,DATA,SESSO,ANNI,PESO,ALTEZZA,BMI,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,...,Scheletro:Rachide-cervicale,Scheletro:Rachide-toracico,Scheletro:Rachide-lombare,Scheletro:Bacino-e-sacro,Scheletro:Complesso-sterno/claveo/costale,Tot Testa,Tot Torace,Tot Addome,Tot Scheletro,Totale
VERBALE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
85567,1999-10-29,0,81,84.0,1.75,27.428571,0,1,0,0,...,0,3,0,3,3,2,0,3,9,14
85829,2000-01-14,1,69,69.0,1.62,26.291724,1,4,4,4,...,0,0,0,0,4,20,7,1,4,32
85977,2000-03-10,1,71,67.0,1.55,27.887617,1,2,0,1,...,0,0,0,0,4,6,0,0,4,10
86220,2000-06-14,1,54,60.0,1.59,23.733238,1,4,0,0,...,0,0,0,0,4,5,3,2,4,14
86247,2000-06-22,1,78,69.0,1.67,24.740937,1,2,0,0,...,0,0,0,0,4,2,0,2,4,8


In [3]:
def get_X_pca_totals(dataframe, exclude=None):
    """Build the feature matrix: personal data plus one PCA score per body region.

    For each of the regions 'Testa', 'Torace', 'Addome' and 'Scheletro', every
    column of ``dataframe`` whose name matches ``'<region>:'`` is projected onto
    its first principal component and stored as column ``'PCA <region>'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the personal-data columns ('SESSO', 'ANNI', 'PESO',
        'ALTEZZA', 'BMI', minus ``exclude``) and the ``'<region>:...'`` columns.
    exclude : str, optional
        One of the personal-data column names to leave out of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the selected personal-data
        columns followed by the four ``'PCA <region>'`` columns.

    Raises
    ------
    ValueError
        If ``exclude`` is truthy but is not one of the personal-data columns
        (raised by ``list.remove``).
    """
    features = ['SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI']

    if exclude:
        features.remove(exclude)

    # Explicit copy: new columns are added below, and writing into a bare
    # column selection of the caller's frame is a chained assignment.
    X_pca = dataframe[features].copy()

    for parte_corpo in ['Testa', 'Torace', 'Addome', 'Scheletro']:
        region_columns = dataframe.filter(regex=parte_corpo + ":")
        first_component = PCA(n_components=1).fit_transform(region_columns)
        # fit_transform returns an (n_samples, 1) array; flatten it to one column.
        X_pca['PCA ' + parte_corpo] = first_component.ravel()

    return X_pca

def perturb(n, x):
    """Shift ``n`` by ``x``; if the sum leaves the interval [0, 4], shift by ``-x`` instead.

    Returns ``n + x`` when ``0 <= n + x <= 4``, otherwise ``n - x``.
    """
    shifted = n + x
    return shifted if 0 <= shifted <= 4 else n - x
    

def add_new_elements_pca(df, perturbation_kind="personal_data", error_distrib = None, n_elements=200, n_columns=3, exclude=None, excluded_feature=None):
    """Augment ``df`` with perturbed copies of its rows and return PCA features.

    Rows are drawn at random from the original data, perturbed, and appended
    under the index ``'FAKE_<source index>_<counter>'`` until the frame holds
    ``n_elements`` distinct rows. Exact duplicates are dropped after every
    insertion, so a draw that ends up unperturbed does not count.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Its last five columns are dropped before augmentation
        (the original comment identifies them as the totals).
    perturbation_kind : {'personal_data', 'body_parts', 'both'}
        Which group of columns is perturbed.
    error_distrib : dict, optional
        ``{'x': values, 'px': probabilities}`` passed to ``np.random.choice``
        to draw the shift applied to each sampled body-part column.
        Required when body parts are perturbed.
    n_elements : int
        Target number of rows (original + synthetic).
    n_columns : int
        Number of body-part columns perturbed per synthetic row.
    exclude : str, optional
        Personal-data column that is absent from ``df``.
    excluded_feature : pandas.Series, optional
        Values of the excluded column, indexed like ``df``. Used to recompute
        'BMI' when ``exclude`` is 'PESO' or 'ALTEZZA'.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The output of ``get_X_pca_totals`` on the augmented frame, and the
        augmented 'Mezzo' column.

    Raises
    ------
    AssertionError
        If ``perturbation_kind`` is not one of the three accepted values.
    ValueError
        If a required ``error_distrib`` / ``excluded_feature`` is missing, or
        if ``exclude`` is not a personal-data column.
    """
    assert perturbation_kind in ['personal_data', 'body_parts', 'both'], \
        "perturbation_kind must be 'personal_data', 'body_parts' or 'both'"

    perturb_body = perturbation_kind in ("body_parts", "both")
    perturb_person = perturbation_kind in ("personal_data", "both")

    if perturb_body:
        # Fail early with a clear message instead of an unbound-name error
        # inside the loop.
        if not error_distrib:
            raise ValueError("error_distrib (with keys 'x' and 'px') is required "
                             "when body parts are perturbed")
        x = error_distrib['x']
        px = error_distrib['px']

    if perturb_person and exclude in ('PESO', 'ALTEZZA') and excluded_feature is None:
        raise ValueError("excluded_feature is required to recompute 'BMI' "
                         "when 'PESO' or 'ALTEZZA' is excluded")

    # Drop the totals (the last five columns); the per-region PCA scores are
    # computed from the detailed columns at the end.
    df_no_totals = df.drop(columns=df.columns[-5:])

    person_columns = ['DATA', 'SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI', 'Mezzo']
    if exclude:
        person_columns.remove(exclude)
    df_dati_persona = df_no_totals[person_columns]
    df_parti_corpo = df_no_totals.drop(columns=person_columns)

    count_elements = 0

    logging.info('Started augmentation excluding: {0}, n_elements: {1}'.format(exclude, n_elements))
    while len(df_no_totals) < n_elements:
        # Only original rows are candidates: both helper frames were built
        # before any synthetic row was added.
        index = np.random.choice(df_parti_corpo.index)
        new_index = "FAKE_" + str(index) + "_" + str(count_elements)
        count_elements += 1

        # Work on copies: a row taken with .loc may be a view of the source
        # frame, and writing into it would alter the rows later draws start from.
        parti_corpo = df_parti_corpo.loc[index].copy()
        dati_persona = df_dati_persona.loc[index].copy()

        if perturb_body:
            pert = parti_corpo.sample(n_columns)

            for (i, e) in zip(pert.index, pert.values):
                perturbation = np.random.choice(x, p=px)
                parti_corpo[i] = perturb(e, perturbation)

        if perturb_person:
            # Draw order (ANNI, PESO, ALTEZZA) follows the column order above.
            if 'ANNI' in dati_persona.index:
                # Normal(0, 1) in years; int() truncates toward zero.
                dati_persona['ANNI'] += int(np.random.normal(0, 1))
            if 'PESO' in dati_persona.index:
                # Normal(0, 2) in kg, rounded to one decimal.
                dati_persona['PESO'] += round(np.random.normal(0, 2), 1)
            if 'ALTEZZA' in dati_persona.index:
                # Normal(0, 0.01), i.e. std = 1 cm, rounded to two decimals.
                dati_persona['ALTEZZA'] += round(np.random.normal(0, 0.01), 2)

            # Recompute BMI from the (possibly perturbed) weight and height.
            # When one of the two is excluded, its original value is read
            # from excluded_feature; when BMI itself is excluded, nothing to do.
            if exclude not in ['PESO', 'ALTEZZA', 'BMI']:
                dati_persona['BMI'] = dati_persona['PESO'] / (dati_persona['ALTEZZA'] ** 2)
            elif exclude == 'PESO':
                dati_persona['BMI'] = excluded_feature.loc[index] / (dati_persona['ALTEZZA'] ** 2)
            elif exclude == 'ALTEZZA':
                dati_persona['BMI'] = dati_persona['PESO'] / (excluded_feature.loc[index] ** 2)

        # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
        new_elem = pd.concat([parti_corpo, dati_persona])
        df_no_totals.loc[new_index] = new_elem

        # Discard the new row if it is identical to an existing one.
        df_no_totals = df_no_totals.drop_duplicates()

    logging.info('Finished augmentation excluding: {0}, n_elements: {1}'.format(exclude, n_elements))
    X_pca = get_X_pca_totals(df_no_totals, exclude)

    return X_pca, df_no_totals['Mezzo']

In [None]:
def _grid_search_and_record(accuracies, X, y, params):
    """Standardise ``X``, grid-search an MLP on it and append the best result to ``accuracies``."""
    # NOTE(review): the scaler is fitted on all rows before cross-validation,
    # so each validation fold contributes to the scaling statistics.
    X_std = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(learning_rate_init=0.001, max_iter=5000, learning_rate="adaptive")
    clf = GridSearchCV(mlp, params, n_jobs=-1, cv=5)
    clf.fit(X_std, y)

    best = clf.best_estimator_
    accuracies['n_data'].append(len(y))
    accuracies['activation'].append(best.activation)
    accuracies['hidden layer sizes'].append(best.hidden_layer_sizes)
    accuracies['alpha'].append(best.alpha)
    accuracies['best score'].append(clf.best_score_)


def get_accuracies(df, params, _range, perturbation_kind, exclude=None, excluded_feature=None): 
    """Grid-search an MLP on the original data and on augmented datasets of growing size.

    The first row of the result refers to ``df`` as given; each further row
    refers to ``df`` augmented by ``add_new_elements_pca`` up to one of the
    sizes in ``_range``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, including the 'Mezzo' target column.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    _range : iterable of int
        Target dataset sizes for the augmentation.
    perturbation_kind : str
        Forwarded to ``add_new_elements_pca``.
    exclude : str, optional
        Personal-data column left out of the features.
    excluded_feature : pandas.Series, optional
        Values of the excluded column, forwarded to ``add_new_elements_pca``.

    Returns
    -------
    pandas.DataFrame
        One row per fitted search with columns 'n_data', 'activation',
        'hidden layer sizes', 'alpha' and 'best score'.
    """
    accuracies = {'n_data' : [], 'activation': [], 'hidden layer sizes' : [], 'alpha' : [], 'best score' : []}

    # Baseline: the original, non-augmented data.
    _grid_search_and_record(accuracies, get_X_pca_totals(df, exclude), df['Mezzo'], params)

    # Distribution of the shift applied to body-part columns during augmentation.
    error_distrib = {
        'x' : [-2,-1,0,1,2],
        'px' : [.1,.2,.4,.2,.1]
    }

    for n_data in _range:
        # NOTE(review): synthetic rows are perturbed copies of real rows, so a
        # real row and its copies can fall into different CV folds.
        X_os, y_os = add_new_elements_pca(df=df, perturbation_kind=perturbation_kind, error_distrib=error_distrib, n_elements=n_data, n_columns=3, exclude=exclude, excluded_feature=excluded_feature)
        _grid_search_and_record(accuracies, X_os, y_os, params)

    return pd.DataFrame(accuracies)


In [None]:
# Parameter grid explored by the grid search for every dataset size.
# Entries such as 2 or 5 are plain ints (the parenthesised form "(2)" is not a
# 1-tuple); the two-element entries are tuples.
params = {
    'activation': ['relu', 'logistic', 'tanh'],
    'hidden_layer_sizes': [2, (3, 3), 5, (6, 3), (5, 5), 6, 8, 10, 20],
    # 'alpha' is currently not part of the grid:
    # 'alpha': [0.0001, 0.001, 0.01, 0.1],
}

# Target dataset sizes: 140..190 step 10, 200..1900 step 100, then 2**11..2**14.
_range = [
    *range(140, 200, 10),
    *range(200, 2000, 100),
    *(2 ** e for e in range(11, 15)),
]

# Results keyed by the excluded feature; 'NONE' uses every feature.
accuracies_excluding = {}
accuracies_excluding['NONE'] = get_accuracies(
    df,
    params,
    _range,
    perturbation_kind="personal_data",
)

In [None]:
# Repeat the experiment leaving out one personal-data feature at a time.
for feature in ('SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI'):
    # Frame without the excluded column, with 'Mezzo' (re)assigned from y.
    df_new = df.drop(columns=feature).assign(Mezzo=y)
    accuracies_excluding[feature] = get_accuracies(
        df_new,
        params,
        _range,
        perturbation_kind="personal_data",
        exclude=feature,
        excluded_feature=df[feature],
    )

In [None]:
# Same experiment as above, but perturbing the body-part columns instead of
# the personal data. Results are keyed by the excluded feature; 'NONE' uses
# every feature.
accuracies_excluding_body_parts = {}
accuracies_excluding_body_parts['NONE'] = get_accuracies(df, params, _range, perturbation_kind="body_parts")

# Plain iteration: the loop was wrapped in tqdm(), but no visible cell imports
# tqdm, so a fresh kernel raised NameError here. Progress can still be followed
# in logging.log, where each augmentation logs its start and end.
for feature in ['SESSO', 'ANNI', 'PESO', 'ALTEZZA', 'BMI']:
    df_new = df.drop(columns=feature)
    df_new['Mezzo'] = y
    accuracies_excluding_body_parts[feature] = get_accuracies(df_new, params, _range, perturbation_kind="body_parts", exclude=feature, excluded_feature=df[feature])