In [526]:
import pandas as pd

In [527]:
DATASET_LOCATION="./dataset/expandedDataSet.csv" 

In [528]:
ds = pd.read_csv(DATASET_LOCATION)

In [529]:
ds.head(10)

<h1><b>DESCRIZIONE DEGLI ATTRIBUTI</b></h1>
<ul>
    <li><b>Gender:</b> Gender of the student (male/female)</li>
    <li><b>EthnicGroup:</b> Ethnic group of the student (group A to E)</li>
    <li><b>ParentEduc:</b> Parent(s) education background (from some_highschool to master's degree)</li>
    <li><b>LunchType:</b> School lunch type (standard or free/reduced)</li>
    <li><b>TestPrep:</b> Test preparation course followed (completed or none)</li>
    <li><b>ParentMaritalStatus:</b> Parent(s) marital status (married/single/widowed/divorced)</li>
    <li><b>PracticeSport:</b> How often the student practices sport (never/sometimes/regularly)</li>
    <li><b>IsFirstChild:</b> If the child is first child in the family or not (yes/no)</li>
    <li><b>NrSiblings:</b> Number of siblings the student has (0 to 7)</li>
    <li><b>TransportMeans:</b> Means of transport to school (school_bus/private)</li>
    <li><b>WklyStudyHours:</b> Weekly self-study hours (less than 5hrs; between 5 and 10hrs; more than 10hrs)</li>
    <li><b>MathScore:</b> math test score(0-100)</li>
    <li><b>ReadingScore:</b> reading test score(0-100)</li>
    <li><b>WritingScore:</b> writing test score(0-100)</li>
</ul>


In [530]:
# Drop TestPrep: the column has too many missing values.
ds = ds.drop(columns="TestPrep")

In [531]:
# Drop the leftover index column: it carries no useful information.
ds = ds.drop(columns="Unnamed: 0")

In [532]:
# Drop LunchType: it is unclear whether the column is meaningful.
ds = ds.drop(columns="LunchType")

In [533]:
ds.head(10)

In [534]:
ds.shape

In [535]:
ds.info()

In [536]:
# Discard every observation that still contains missing values.
ds = ds.dropna()

In [537]:
ds.info()

In [538]:
# Remove duplicated rows (this drops them, it does not merely check for them).
ds = ds.drop_duplicates()

In [539]:
ds.info()

In [540]:
# Encode Gender as a binary indicator (male -> 1, female -> 0).
# The result is assigned back to the column explicitly: calling
# `ds.Gender.replace(..., inplace=True)` acts on an intermediate Series, which
# triggers a chained-assignment warning in recent pandas and leaves `ds`
# unchanged when Copy-on-Write is enabled.
ds["Gender"] = ds["Gender"].replace({'male': 1, 'female': 0})

In [541]:
ds.head(10)

In [542]:
# One-hot encode EthnicGroup into EG_* columns and swap them in for the original column.
ds_ = pd.get_dummies(ds['EthnicGroup'], prefix='EG')
ds = ds.drop(columns='EthnicGroup').join(ds_)

In [543]:
ds.head(10)

In [544]:
# One-hot encode ParentEduc into ParEduc_* columns and swap them in for the original column.
ds_ = pd.get_dummies(ds['ParentEduc'], prefix='ParEduc')
ds = ds.drop(columns='ParentEduc').join(ds_)

In [545]:
ds.head(10)

In [546]:
# One-hot encode ParentMaritalStatus into ParMarStat_* columns and swap them in for the original column.
ds_ = pd.get_dummies(ds['ParentMaritalStatus'], prefix='ParMarStat')
ds = ds.drop(columns='ParentMaritalStatus').join(ds_)

In [547]:
ds.head(10)

In [548]:
# One-hot encode PracticeSport into sport_* columns and swap them in for the original column.
ds_ = pd.get_dummies(ds['PracticeSport'], prefix='sport')
ds = ds.drop(columns='PracticeSport').join(ds_)

In [549]:
ds.head(10)

In [550]:
# Encode IsFirstChild as a binary indicator (yes -> 1, no -> 0).
# Assign the result back to the column: an in-place replace on the attribute
# accessor works on an intermediate Series and does not reliably update `ds`.
ds["IsFirstChild"] = ds["IsFirstChild"].replace({'yes': 1, 'no': 0})

In [551]:
# Encode TransportMeans as a binary indicator (school_bus -> 1, private -> 0).
# Assign the result back to the column: an in-place replace on the attribute
# accessor works on an intermediate Series and does not reliably update `ds`.
ds["TransportMeans"] = ds["TransportMeans"].replace({'school_bus': 1, 'private': 0})

In [552]:
# One-hot encode WklyStudyHours into studyHours_* columns and swap them in for the original column.
ds_ = pd.get_dummies(ds['WklyStudyHours'], prefix='studyHours')
ds = ds.drop(columns='WklyStudyHours').join(ds_)

In [553]:
ds.head(10)

In [554]:
#Analisi della correlazione (Da fare)
ds.corr()

In [555]:
#ds_copy = ds.copy() # questa riga serve per avere il dataset per provare la standardizzazione di sklearn ( alla fine)

In [556]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Import the metrics by name instead of `import *`, so the origin of every
# metric function used by compute_performance is explicit.
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)

# Standardise every column of the dataset with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split, and the target MathScore is standardised together with the
# features. This is kept as-is because the pickled models loaded later were
# presumably trained on data prepared this way — confirm before changing it.
lista = ds.columns.values.tolist()
diz = dict(enumerate(lista))  # positional index -> column name
scaler = preprocessing.StandardScaler().fit(ds)
X_scaled = scaler.transform(ds)

# The scaler keeps the column order, so the original column names can be
# attached directly; they are needed to separate the target from the features.
dsn = pd.DataFrame(X_scaled, columns=lista)

In [557]:
# Split the standardised dataset into train and test sets (80% / 20%).
# random_state is fixed so the split is identical on every run; without it the
# split depends on NumPy's global RNG state at the time this cell executes.
# NOTE(review): the pickled models loaded further down were fitted on whatever
# split existed when they were trained — re-fit them if that split differed.
X = dsn.drop('MathScore', axis=1)
Y = dsn['MathScore']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=50)

In [558]:
import numpy as np
# Seed NumPy's global random number generator.
# NOTE(review): this cell runs after the train/test split cell, so the seed
# set here does not influence that split.
np.random.seed(50)

In [559]:
# compute_performance (defined below) gathers the metrics of several models so
# that they can be shown side by side in a single table.
#from sklearn.model_selection import cross_val_predict, cross_val_score

import time  # NOTE(review): not used in the visible cells — confirm before removing

# Show floats with 8 decimal places in every DataFrame displayed from here on.
pd.options.display.float_format = '{:.8f}'.format

def compute_performance(modelli, names, X, y):
    """Evaluate fitted regressors on one dataset and collect their metrics.

    The metric functions (mean_squared_error, explained_variance_score,
    mean_absolute_error, mean_absolute_percentage_error,
    median_absolute_error, r2_score) are expected to be available at module
    level from ``sklearn.metrics``.

    Parameters
    ----------
    modelli : sequence
        Fitted estimators; each one must expose ``predict(X)``.
    names : sequence of str
        Display name of each estimator, in the same order as ``modelli``.
    X : array-like with a ``shape`` attribute
        Feature matrix. ``X.shape[1]`` is used as the number of predictors in
        the adjusted R² correction.
    y : array-like
        True target values for the rows of ``X``.

    Returns
    -------
    dict
        ``{column: {position: value}}`` with the columns 'Modello', 'mse',
        'variance', 'mae', 'mape', 'median_ae', 'r2' and 'adjusted_r2', where
        ``position`` is the index of the model in ``modelli``. The layout can
        be passed directly to ``pd.DataFrame``.

    Raises
    ------
    ValueError
        If ``modelli`` and ``names`` do not have the same length.
    """
    # Fail early instead of raising IndexError midway (too few names) or
    # silently ignoring surplus names.
    if len(modelli) != len(names):
        raise ValueError(
            "modelli and names must have the same length "
            f"(got {len(modelli)} and {len(names)})"
        )

    colonne = ('Modello', 'mse', 'variance', 'mae', 'mape',
               'median_ae', 'r2', 'adjusted_r2')
    score_dict = {colonna: {} for colonna in colonne}

    for i, (modello, name) in enumerate(zip(modelli, names)):
        value_predictions = modello.predict(X)
        r2 = r2_score(y, value_predictions)

        # Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1). It is undefined
        # when there are not more observations than predictors + 1, so report
        # NaN there instead of dividing by zero or by a negative count.
        n_osservazioni = len(value_predictions)
        gradi_liberta = n_osservazioni - X.shape[1] - 1
        if gradi_liberta > 0:
            adjusted_r2 = 1 - (1 - r2) * (n_osservazioni - 1) / gradi_liberta
        else:
            adjusted_r2 = float('nan')

        riga = {
            'Modello': name,
            'mse': mean_squared_error(y, value_predictions),
            'variance': explained_variance_score(y, value_predictions),
            'mae': mean_absolute_error(y, value_predictions),
            'mape': mean_absolute_percentage_error(y, value_predictions),
            'median_ae': median_absolute_error(y, value_predictions),
            'r2': r2,
            'adjusted_r2': adjusted_r2,
        }
        for colonna, valore in riga.items():
            score_dict[colonna][i] = valore

    return score_dict

In [560]:
import pickle
regressori = []
nomi = []

<h1> Modelli SVM </h1>

In [561]:
modelli_svm = []

In [562]:
from sklearn.svm import SVR

# The model was originally fitted and saved with the code kept here for reference:
#svr_reg = SVR()
#svr_reg.fit(X_train,y_train)
#filename = './Modelli/svr_reg.sav'
#pickle.dump(svr_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/svr_reg.sav'
with open(filename, 'rb') as model_file:
    svr_reg = pickle.load(model_file)

In [563]:
# Register the loaded SVR model, its display name, and its membership in the SVM family.
nome_modello = "SVR Regressor"
regressori.append(svr_reg)
nomi.append(nome_modello)
modelli_svm.append(nome_modello)

<h1>Modelli Lineari</h1>

In [564]:
modelli_lineari = []

<h2>Linear Regressor</h2>

In [565]:
from sklearn.linear_model import LinearRegression

# The model was originally fitted and saved with the code kept here for reference:
#lin_reg = LinearRegression()
#lin_reg = lin_reg.fit(X_train, y_train)
#filename = './Modelli/lin_reg.sav'
#pickle.dump(lin_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/lin_reg.sav'
with open(filename, 'rb') as model_file:
    lin_reg = pickle.load(model_file)

In [566]:
# Register the loaded linear regression model, its display name, and its membership in the linear family.
nome_modello = "Linear Regressor"
regressori.append(lin_reg)
nomi.append(nome_modello)
modelli_lineari.append(nome_modello)

<h1>SGD Regressor</h1>

In [567]:
from sklearn.linear_model import SGDRegressor

# The model was originally fitted and saved with the code kept here for reference:
#sgd_reg = SGDRegressor() #maybe random_state=42
#sgd_reg.fit(X_train, y_train)
#filename = './Modelli/sgd_reg.sav'
#pickle.dump(sgd_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/sgd_reg.sav'
with open(filename, 'rb') as model_file:
    sgd_reg = pickle.load(model_file)

In [568]:
# Register the loaded SGD model, its display name, and its membership in the linear family.
nome_modello = "SGD Regressor"
regressori.append(sgd_reg)
nomi.append(nome_modello)
modelli_lineari.append(nome_modello)

<h1>Ensemble Models</h1>

In [569]:
modelli_ensemble = []

<h2>Random Forest</h2>

In [570]:
from sklearn.ensemble import RandomForestRegressor

# The model was originally fitted and saved with the code kept here for reference:
#forest_reg = RandomForestRegressor() #n_estimators=10, random_state=42
#forest_reg.fit(X_train, y_train)
#filename = './Modelli/forest_reg.sav'
#pickle.dump(forest_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/forest_reg.sav'
with open(filename, 'rb') as model_file:
    forest_reg = pickle.load(model_file)

In [571]:
# Register the loaded random forest model and its display name.
# The name belongs to the ensemble family (modelli_ensemble), not to
# modelli_lineari where it had been appended by mistake.
regressori.append(forest_reg)
nomi.append("Forest Regressor")
modelli_ensemble.append("Forest Regressor")

<h2>AdaBoost Regressor</h2>

In [572]:
from sklearn.ensemble import AdaBoostRegressor

# The model was originally fitted and saved with the code kept here for reference:
#ada_reg = AdaBoostRegressor()
#ada_reg.fit(X_train, y_train)
#filename = './Modelli/ada_reg.sav'
#pickle.dump(ada_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/ada_reg.sav'
with open(filename, 'rb') as model_file:
    ada_reg = pickle.load(model_file)

In [573]:
# Register the loaded AdaBoost model, its display name, and its membership in the ensemble family.
nome_modello = "Ada Regressor"
regressori.append(ada_reg)
nomi.append(nome_modello)
modelli_ensemble.append(nome_modello)

<h2>Bagging Regressor</h2>

In [574]:
from sklearn.ensemble import BaggingRegressor

# The model was originally fitted and saved with the code kept here for reference:
#bagging_reg = BaggingRegressor(random_state=42)
#bagging_reg.fit(X_train, y_train)
#filename = './Modelli/bagging_reg.sav'
#pickle.dump(bagging_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/bagging_reg.sav'
with open(filename, 'rb') as model_file:
    bagging_reg = pickle.load(model_file)

In [575]:
# Register the loaded bagging model, its display name, and its membership in the ensemble family.
nome_modello = "Bagging Regressor"
regressori.append(bagging_reg)
nomi.append(nome_modello)
modelli_ensemble.append(nome_modello)

<h1>Neural networks</h1>

In [576]:
reti_neurali = []

In [577]:
from sklearn.neural_network import MLPRegressor

# The model was originally fitted and saved with the code kept here for reference:
#mlp_reg = MLPRegressor()
#mlp_reg.fit(X_train, y_train)
#filename = './Modelli/mlp_reg.sav'
#pickle.dump(mlp_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/mlp_reg.sav'
with open(filename, 'rb') as model_file:
    mlp_reg = pickle.load(model_file)

In [578]:
# Register the loaded MLP model, its display name, and its membership in the neural-network family.
nome_modello = "MLP Regressor"
regressori.append(mlp_reg)
nomi.append(nome_modello)
reti_neurali.append(nome_modello)

<h1>Modelli ad Albero</h1>


In [579]:
modelli_alberi = []

<h2>DecisionTree Regressor</h2>

In [580]:
from sklearn.tree import DecisionTreeRegressor

# The model was originally fitted and saved with the code kept here for reference:
#tree_reg = DecisionTreeRegressor(random_state=42)
#tree_reg.fit(X_train, y_train)
#filename = './Modelli/tree_reg.sav'
#pickle.dump(tree_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/tree_reg.sav'
with open(filename, 'rb') as model_file:
    tree_reg = pickle.load(model_file)

In [581]:
# Register the loaded decision tree model, its display name, and its membership in the tree family.
nome_modello = "DecisionTree Regressor"
regressori.append(tree_reg)
nomi.append(nome_modello)
modelli_alberi.append(nome_modello)

<h1>Regressori basati su Istanze</h1>

In [582]:
modelli_istance = []

<h2>K Neighbour Regressor</h2>

In [583]:
from sklearn.neighbors import KNeighborsRegressor

# The model was originally fitted and saved with the code kept here for reference:
#kn_reg = KNeighborsRegressor()
#kn_reg.fit(X_train, y_train)
#filename = './Modelli/kn_reg.sav'
#pickle.dump(kn_reg, open(filename, 'wb'))

# Load the previously fitted model from disk. The file is opened in a `with`
# block so the handle is closed as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
filename = './Modelli/kn_reg.sav'
with open(filename, 'rb') as model_file:
    kn_reg = pickle.load(model_file)

In [584]:
# Register the loaded k-nearest-neighbours model, its display name, and its membership in the instance-based family.
nome_modello = "K Neighbor Regressor"
regressori.append(kn_reg)
nomi.append(nome_modello)
modelli_istance.append(nome_modello)

<h1>Performance modelli - Training set</h1>

In [585]:
#performance_modelli_training = compute_performance(regressori, nomi, X_train,y_train)
#performance_train = pd.DataFrame(performance_modelli_training)
#performance_train

# Salviamo le performance ottenute su disco
#filename = './Performance/performance_train.csv'
#pickle.dump(performance_train, open(filename, 'wb'))


In [589]:
# Load the stored training-set performance table from disk.
# NOTE(review): despite its .csv extension the file is read with pickle, not as
# CSV text. pickle.load can execute arbitrary code — only load trusted files.
filename = './Performance/performance_train.csv'
with open(filename, 'rb') as performance_file:  # closed as soon as the table is read
    performance_train = pickle.load(performance_file)
performance_train

<h1>Performance modelli - Test set</h1>

In [587]:
#performance_modelli_training = compute_performance(regressori, nomi, X_test,y_test)
#performance_test = pd.DataFrame(performance_modelli_training)
#performance_test

# Salviamo le performance ottenute su disco
#filename = './Performance/performance_test.csv'
#pickle.dump(performance_test, open(filename, 'wb'))


In [588]:
# Load the stored test-set performance table from disk.
# NOTE(review): despite its .csv extension the file is read with pickle, not as
# CSV text. pickle.load can execute arbitrary code — only load trusted files.
filename = './Performance/performance_test.csv'
with open(filename, 'rb') as performance_file:  # closed as soon as the table is read
    performance_test = pickle.load(performance_file)
performance_test