# S09 T02: Aprenentatge Supervisat - Regressions
## Luis Pardina - Data Science - 07/06/2022

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score

### *Exercici 1*: Crea almenys tres models de regressió diferents per intentar predir el millor possible l’endarreriment dels vols (ArrDelay) de DelayedFlights.csv.

Recupero els dos fitxers .csv de la tasca anterior. El fitxer mostra_X.csv conté una mostra del 10% (per a mantenir el temps de processat dins d'un marge raonable) del dataset de flights amb les següents modificacions:
 - esborrats els registres dels vols cancel·lats i desviats, i els camps de l'any i de la variable depenent (ArrDelay). 
 - afegides les següents columnes: les variables temporals relevants (hora programada de sortida, dia de la setmana i mes) normalitzades, les variables categòriques relevants dumificades (companyia aèria), i les variables numèriques relevants estandaritzades amb Robust Scaler.

El fitxer mostra_y conté una mostra del 10% de la variable depenent (ArrDelay). 

In [2]:
mostra_X = pd.read_csv('mostra_X.csv')
mostra_X.drop('Unnamed: 0', axis=1, inplace=True)
mostra_y = pd.read_csv('mostra_y.csv')
mostra_y.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
pd.set_option('display.max_columns', None)
mostra_X.tail()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,norm_Month,norm_DayOfWeek,norm_CRSDepTime,9E,AA,AQ,AS,B6,CO,DL,EV,F9,FL,HA,MQ,NW,OH,OO,UA,US,WN,XE,YV,stand_ActualElapsedTime,stand_CRSElapsedTime,stand_AirTime,stand_Distance,stand_DepDelay,stand_TaxiIn,stand_TaxiOut
192833,3,18,2,1418.0,14,1632.0,1607,XE,2995,N41104,194.0,187.0,178.0,18.0,EWR,MSP,1008,4.0,12.0,0.0,0.0,7.0,0.0,18.0,0.181818,0.166667,0.608696,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.917647,0.855422,1.1,0.609091,-0.146341,-0.5,-0.181818
192834,1,23,3,1347.0,13,1609.0,1615,WN,331,N311SW,142.0,165.0,131.0,17.0,MCO,CLE,895,4.0,7.0,,,,,,0.0,0.333333,0.565217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.305882,0.590361,0.5125,0.437879,-0.170732,-0.5,-0.636364
192835,4,10,4,2205.0,21,2349.0,2325,AA,1509,N5DMAA,224.0,230.0,197.0,30.0,ORD,LAS,1515,7.0,20.0,0.0,0.0,24.0,0.0,0.0,0.272727,0.5,0.913043,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.270588,1.373494,1.3375,1.377273,0.146341,0.25,0.545455
192836,1,6,7,1258.0,12,1401.0,1310,MQ,3338,N263AE,63.0,55.0,30.0,43.0,TYR,DFW,103,19.0,14.0,0.0,0.0,8.0,0.0,43.0,0.0,1.0,0.521739,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.623529,-0.73494,-0.75,-0.762121,0.463415,3.25,0.0
192837,2,22,5,1701.0,16,1708.0,1635,WN,1660,N227WN,67.0,65.0,53.0,31.0,PHX,LAS,256,4.0,10.0,31.0,0.0,2.0,0.0,0.0,0.090909,0.666667,0.695652,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.576471,-0.614458,-0.4625,-0.530303,0.170732,-0.5,-0.363636


In [4]:
mostra_y.tail()

Unnamed: 0,ArrDelay
192833,25.0
192834,-6.0
192835,24.0
192836,51.0
192837,33.0


Començo amb un numero reduit de variables, agafo les tres on vaig determinar a la tasca anterior que hi ha més correlació individual amb ArrDelay: DepDelay, TaxiOut, CRSDepTime. 

Al llarg del desenvolupament de les tasques he fet diverses proves, i he arribat a la conclusió de que la variable ArrDelay és l'única que permet arribar a un grau acceptable de correlació per a fer prediccions. 

Jo no vull utilitzar com a predictores les variables del desglòs de la variable depenent en les quatre variables que són un arbre de causes, perquè en aquest cas ja conec la relació que existeix entre les variables : ArrDelay = CarrierDelay + WeatherDelay + NASDelay + SecurityDelay + LateAircraftDelay (when ArrDelay > 15).

In [5]:
first_set_of_var = ['DepDelay', 'TaxiOut', 'TaxiIn']

In [6]:
raw = mostra_X[first_set_of_var]

In [7]:
raw.tail()

Unnamed: 0,DepDelay,TaxiOut,TaxiIn
192833,18.0,12.0,4.0
192834,17.0,7.0,4.0
192835,30.0,20.0,7.0
192836,43.0,14.0,19.0
192837,31.0,10.0,4.0


In [8]:
round(raw.describe(),2)

Unnamed: 0,DepDelay,TaxiOut,TaxiIn
count,192838.0,192838.0,192838.0
mean,43.01,18.25,6.8
std,52.54,14.33,5.28
min,6.0,0.0,0.0
25%,12.0,10.0,4.0
50%,24.0,14.0,6.0
75%,53.0,21.0,8.0
max,1303.0,337.0,199.0


Genero les submostres per a train i test:

In [9]:
y = np.array(mostra_y)
X = np.array(raw)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

In [10]:
print("X_train:", X_train.shape, "\ny_train:", y_train.shape, "\nX_test:", X_test.shape, "\ny_test:", y_test.shape)

X_train: (134986, 3) 
y_train: (134986, 1) 
X_test: (57852, 3) 
y_test: (57852, 1)


Opto per tres models molt diferents de regressió: 
- La regressió lineal clàssica.
- RandomForest (la naturalesa no lineal d'un *random forest* pot donar-li una avantatge sobre els algorismes lineals, però és important tenir en compte que un *bosc aleatori* no pot extrapolar. En un problema de regressió, el rang de prediccions que pot fer un bosc aleatori està limitat per les etiquetes més altes i més baixes de les dades d'entrenament). Agafo un número de arbres limitat a 20 per a reduir el temps d'execució.
- GradientBoosting (generalment s'adaptarà a les dades d'entrenament molt millor que la regressió lineal, però això també vol dir que és propens a sobreajustar-se i s'interpreta amb menys facilitat).

In [11]:
lr_model = LinearRegression().fit(X_train, y_train)

In [12]:
rf_model = RandomForestRegressor(n_estimators = 20, random_state = 42).fit(X_train, y_train)

In [13]:
gb_model = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

In [14]:
lr_y = lr_model.predict(X_train)
rf_y = rf_model.predict(X_train)
gb_y = gb_model.predict(X_train)

In [15]:
print('Linear Regression (train subset) MSE & r2:\n', round(mean_squared_error(y_train, lr_y),2), round(r2_score(y_train, lr_y),4))
print('Random Forest Regression (train subset) MSE & r2:\n', round(mean_squared_error(y_train, rf_y),2), round(r2_score(y_train, rf_y),4))
print('Gradient Boosting Regression (train subset) MSE & r2:\n', round(mean_squared_error(y_train, gb_y),2),round(r2_score(y_train, gb_y),4))

Linear Regression (train subset) MSE & r2:
 134.73 0.9575
Random Forest Regression (train subset) MSE & r2:
 78.79 0.9752
Gradient Boosting Regression (train subset) MSE & r2:
 129.64 0.9591


**Conclusió**: tots tres models de regressió presenten una correlació bona, però com ja he comentat és degut a la forta correlació que existeix entre ArrDelay i DepDelay.

### *Exercici 2*: Compara’ls en base al MSE i al R2 .

Aniré ficant en una taula els resultats de MSE i R2 per als diferents models:

In [16]:
lr_y_pred = lr_model.predict(X_test)
rf_y_pred = rf_model.predict(X_test)
gb_y_pred = gb_model.predict(X_test)

In [17]:
mse = mean_squared_error(y_test, lr_y_pred)
r2 = r2_score(y_test, lr_y_pred)
table = pd.DataFrame({'Model': ['Linear Regression'],'MSE test': [mse], 'R2 test': [r2]})

In [18]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469


In [19]:
def results (method, test, pred):
    columns = ['Model', 'MSE test', 'R2 test']
    row_data = [method, mean_squared_error(test, pred), r2_score(test, pred)]
    return dict(zip(columns, row_data))

In [20]:
table = table.append(results('Random Forest', y_test, rf_y_pred), ignore_index=True)

In [21]:
table = table.append(results ('Gradient Boosting', y_test, gb_y_pred), ignore_index = True)

In [22]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285


**Conclusió**: dels tres models testejats, el Random Forest té una mica de overfitting, mentre que la regressió lineal i el Gradient Boosting em donen un resultat molt consistent entre les dues submostres.

### *Exercici 3*: Entrena’ls utilitzant els diferents paràmetres que admeten.

Faré l'entrenament amb l'ajuda de GridSearch

In [23]:
LinearRegression().get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

In [81]:
#dictionary of search space
space = dict()
space['copy_X'] = [True, False]
space['fit_intercept'] = [True, False]
space['positive'] = [True, False]

search = GridSearchCV(LinearRegression(), space)

result = search.fit(X_train, y_train)

print('Best Hyperparameters: ', result.best_params_)

Best Hyperparameters:  {'copy_X': True, 'fit_intercept': True, 'positive': False}


In [24]:
lr_adj_model = LinearRegression(fit_intercept=False, positive=True).fit(X_train, y_train)

In [25]:
lr_adj_y_pred = lr_model.predict(X_test)

In [26]:
table = table.append(results ('Linear_Reg_Adj', y_test, lr_adj_y_pred), ignore_index = True)

In [27]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285
3,Linear_Reg_Adj,134.070814,0.956469


In [28]:
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [87]:
#dictionary of search space
space = dict()
space['n_estimators'] = [20, 40]
space['max_depth'] = [None, 3]

search = GridSearchCV(RandomForestRegressor(), space)

result = search.fit(X_train, y_train)

print('Best Hyperparameters: ', result.best_params_)

Best Hyperparameters:  {'max_depth': None, 'n_estimators': 40}


In [29]:
rf_adj_model = RandomForestRegressor(n_estimators = 40, random_state = 42).fit(X_train, y_train)

In [30]:
rf_adj_y_pred = rf_adj_model.predict(X_test)

In [31]:
table = table.append(results ('RandomForest_Adj', y_test, rf_adj_y_pred), ignore_index = True)

In [32]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285
3,Linear_Reg_Adj,134.070814,0.956469
4,RandomForest_Adj,158.338898,0.948589


In [33]:
GradientBoostingRegressor().get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [93]:
#dictionary of search space
space = dict()
space['n_estimators'] = [100, 200]
space['max_depth'] = [3,5]

search = GridSearchCV(GradientBoostingRegressor(), space)

result = search.fit(X_train, y_train)

print('Best Hyperparameters: ', result.best_params_)

Best Hyperparameters:  {'max_depth': 3, 'n_estimators': 200}


In [34]:
gb_adj_model = GradientBoostingRegressor(n_estimators = 200, random_state=0).fit(X_train, y_train)

In [35]:
gb_adj_y_pred = gb_adj_model.predict(X_test)

In [36]:
table = table.append(results ('GradientBoosting_Adj', y_test, gb_adj_y_pred), ignore_index = True)

In [37]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285
3,Linear_Reg_Adj,134.070814,0.956469
4,RandomForest_Adj,158.338898,0.948589
5,GradientBoosting_Adj,131.228732,0.957391


**Conclusió**: L'ajust de paràmetres pot millorar la correlació i la capacitat de predicció dels models (amb l'excepció de la regressió lineal però perquè prácticament no té massa ajustos a fer), però aquesta millora és gradual, no un breakthrough.

###  *Exercici 4*: Compara el seu rendiment utilitzant l’aproximació traint/test o utilitzant totes les dades (validació interna)

Faig una cross validation per a cada model, utilitzant tant el R2 com el MSE com a paràmetre per a fer l'scoring. Per defecte fa 5 paquets, 4 els utilitza per train i el 5è per a test, i repeteix el procés 5 vegades (cada vegada amb un test subset diferent).

Tenim llavors 5 mesures de cada paràmetre en cada cas, que se'ns retornen com una array. Presento les mitjanes d'aquestes arrays per a cada model.

In [40]:
# apply cross validation for r2
r2_cv_lr = cross_val_score(LinearRegression(), X, y, scoring = 'r2')
mse_cv_lr = cross_val_score(LinearRegression(), X, y, scoring = 'neg_mean_squared_error')

In [41]:
print(round(mse_cv_lr.mean(),2), round(r2_cv_lr.mean(),4))

-134.54 0.9572


In [42]:
r2_cv_rf = cross_val_score(RandomForestRegressor(n_estimators = 20, random_state = 42), X, y, scoring = 'r2')
mse_cv_rf = cross_val_score(RandomForestRegressor(n_estimators = 20, random_state = 42), X, y, scoring = 'neg_mean_squared_error')

In [43]:
print(round(mse_cv_rf.mean(),2), round(r2_cv_rf.mean(),4))

-160.29 0.949


In [44]:
r2_cv_gb = cross_val_score(GradientBoostingRegressor(random_state=0), X, y, scoring = 'r2')
mse_cv_gb = cross_val_score(GradientBoostingRegressor(random_state=0), X, y, scoring = 'neg_mean_squared_error')

In [45]:
print(round(mse_cv_gb.mean(),2), round(r2_cv_gb.mean(),4))

-131.93 0.958


**Conclusió**: els resultats de mse i r2 de cada model són molt semblants amb totes dues aproximacions (train/test vs validació interna). En aquest cas no suposa cap millora la validació interna, probablement perquè el set de mostra és prou gran.

### *Exercici 5*: Realitza algun procés d’enginyeria de variables per millorar-ne la predicció

Utilitzaré les mateixes variables però standaritzades amb Robust Scaler, i hi afegiré les variables discretes de temps un cop normalitzades.

In [38]:
second_set_of_var = ['stand_DepDelay', 'stand_TaxiOut', 'stand_TaxiIn', 'norm_Month', 'norm_DayOfWeek', 'norm_CRSDepTime']

In [39]:
engineered = mostra_X[second_set_of_var]

In [40]:
engineered.tail()

Unnamed: 0,stand_DepDelay,stand_TaxiOut,stand_TaxiIn,norm_Month,norm_DayOfWeek,norm_CRSDepTime
192833,-0.146341,-0.181818,-0.5,0.181818,0.166667,0.608696
192834,-0.170732,-0.636364,-0.5,0.0,0.333333,0.565217
192835,0.146341,0.545455,0.25,0.272727,0.5,0.913043
192836,0.463415,0.0,3.25,0.0,1.0,0.521739
192837,0.170732,-0.363636,-0.5,0.090909,0.666667,0.695652


In [41]:
engineered.describe()

Unnamed: 0,stand_DepDelay,stand_TaxiOut,stand_TaxiIn,norm_Month,norm_DayOfWeek,norm_CRSDepTime
count,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0
mean,0.463719,0.386114,0.200329,0.465043,0.498002,0.626333
std,1.281416,1.30292,1.320633,0.31645,0.332628,0.184893
min,-0.439024,-1.272727,-1.5,0.0,0.0,0.0
25%,-0.292683,-0.363636,-0.5,0.181818,0.166667,0.478261
50%,0.0,0.0,0.0,0.454545,0.5,0.652174
75%,0.707317,0.636364,0.5,0.727273,0.833333,0.782609
max,31.195122,29.363636,48.25,1.0,1.0,1.0


In [42]:
mostra_y

Unnamed: 0,ArrDelay
0,98.0
1,23.0
2,12.0
3,31.0
4,47.0
...,...
192833,25.0
192834,-6.0
192835,24.0
192836,51.0


Només per a major claredat, utilitzo altres lletres diferents d'X i y per als subsets de train i test en aquest cas.

In [43]:
A = np.array(engineered)

A_train, A_test, b_train, b_test = train_test_split(A, y, test_size=0.30, random_state=40)

In [44]:
lr_eng_model = LinearRegression().fit(A_train, b_train)

In [45]:
rf_eng_model = RandomForestRegressor(n_estimators = 20, random_state = 42).fit(A_train, b_train)

In [46]:
gb_eng_model = GradientBoostingRegressor(random_state=0).fit(A_train, b_train)

In [47]:
lr_eng_y_pred = lr_eng_model.predict(A_test)
rf_eng_y_pred = rf_eng_model.predict(A_test)
gb_eng_y_pred = gb_eng_model.predict(A_test)

In [48]:
table = table.append(results ('Linear Regression Eng', b_test, lr_eng_y_pred), ignore_index = True)

In [49]:
table = table.append(results ('Random Forest Eng', b_test, rf_eng_y_pred), ignore_index = True)

In [50]:
table = table.append(results ('Gradient Boosting Eng', b_test, gb_eng_y_pred), ignore_index = True)

In [51]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285
3,Linear_Reg_Adj,134.070814,0.956469
4,RandomForest_Adj,158.338898,0.948589
5,GradientBoosting_Adj,131.228732,0.957391
6,Linear Regression Eng,133.582175,0.956627
7,Random Forest Eng,147.006542,0.952268
8,Gradient Boosting Eng,129.615672,0.957915


**Conclusió**: He estat capaç de millorar la predicció, fent ús de l'enginyeria de variables. Tantmateix la millora és incremental i petita.

### *Exercici 6*: No utilitzis la variable DepDelay a l’hora de fer prediccions

Moment àlgid de l'exercici: a veure què es pot fer sense DepDelay. Probaré de "ficar a l'olla" el màxim de variables possible, això sí, un cop estandaritzades, dumificades o normalitzades.

Variables de temps normalitzades:
- norm_Month, norm_DayOfWeek, norm_CRSDepTime 	

Companyia aèria dumificada:
- 9E 	AA 	AQ 	AS 	B6 	CO 	DL 	EV 	F9 	FL 	HA 	MQ 	NW 	OH 	OO 	UA 	US 	WN 	XE 	YV

Variables numèriques estandaritzades amb Robust Scaler:
- stand_ActualElapsedTime, stand_CRSElapsedTime, stand_AirTime, stand_Distance, stand_TaxiIn, stand_TaxiOut


In [52]:
yeah = mostra_X.iloc[:,24:]

In [53]:
yeah.drop('stand_DepDelay', axis=1, inplace=True)

In [54]:
yeah.describe()

Unnamed: 0,norm_Month,norm_DayOfWeek,norm_CRSDepTime,9E,AA,AQ,AS,B6,CO,DL,EV,F9,FL,HA,MQ,NW,OH,OO,UA,US,WN,XE,YV,stand_ActualElapsedTime,stand_CRSElapsedTime,stand_AirTime,stand_Distance,stand_TaxiIn,stand_TaxiOut
count,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0,192838.0
mean,0.465043,0.498002,0.626333,0.026971,0.099918,0.000389,0.020333,0.027904,0.051867,0.059257,0.042129,0.014795,0.036528,0.004009,0.072605,0.041086,0.027194,0.067969,0.072901,0.050146,0.195869,0.053516,0.034615,0.20472,0.22005,0.229408,0.241355,0.200329,0.386114
std,0.31645,0.332628,0.184893,0.161999,0.299892,0.019717,0.141138,0.164699,0.22176,0.236106,0.200883,0.120731,0.187601,0.063186,0.259488,0.19849,0.162648,0.251693,0.259974,0.218246,0.396869,0.225061,0.182802,0.848488,0.85811,0.85831,0.868951,1.320633,1.30292
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.164706,-1.373494,-1.125,-0.872727,-1.5,-1.272727
25%,0.181818,0.166667,0.478261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.423529,-0.409639,-0.4,-0.406061,-0.5,-0.363636
50%,0.454545,0.5,0.652174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.727273,0.833333,0.782609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.576471,0.590361,0.6,0.593939,0.5,0.636364
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.670588,6.554217,6.8375,6.6,48.25,29.363636


Només per a major claredat, utilitzo altres lletres diferents d'X i y per als subsets de train i test també en aquest cas.

In [55]:
M = np.array(yeah)

M_train, M_test, n_train, n_test = train_test_split(M, y, test_size=0.30, random_state=40)

Utilitzaré el model que m'ha donat millor predicció(Gradient Boosting), ajustat amb el parametre que m'ha donat millor predicció:

In [56]:
final_model = GradientBoostingRegressor(n_estimators = 200, random_state=0).fit(M_train, n_train)

In [57]:
final_model_y_pred = final_model.predict(M_test)

In [58]:
table = table.append(results ('final model', n_test, final_model_y_pred), ignore_index = True)

In [59]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285
3,Linear_Reg_Adj,134.070814,0.956469
4,RandomForest_Adj,158.338898,0.948589
5,GradientBoosting_Adj,131.228732,0.957391
6,Linear Regression Eng,133.582175,0.956627
7,Random Forest Eng,147.006542,0.952268
8,Gradient Boosting Eng,129.615672,0.957915
9,final model,2604.420292,0.154371


És un resultat molt dolent... Potser hi ha overfitting? Faig un check:

In [60]:
check = final_model.predict(M_train)

In [61]:
round(mean_squared_error(n_train, check),2)

2632.08

In [62]:
round(r2_score(n_train, check),4)

0.1698

No és un problema d'overfitting, es tracta d'una baixa capacitat de predicció de les variables independents utilitzades.

**Conclusió**: obtinc *només* un r2 de 0.15, però no s'ha de menystenir aquest resultat. Certament la combinació de totes aquestes variables i DepDelay em donarà una correlació molt i molt bona. Faig l'exercici per demostrar-ho:

In [77]:
final = mostra_X.iloc[:,24:]

In [78]:
final

Unnamed: 0,norm_Month,norm_DayOfWeek,norm_CRSDepTime,9E,AA,AQ,AS,B6,CO,DL,EV,F9,FL,HA,MQ,NW,OH,OO,UA,US,WN,XE,YV,stand_ActualElapsedTime,stand_CRSElapsedTime,stand_AirTime,stand_Distance,stand_DepDelay,stand_TaxiIn,stand_TaxiOut
0,0.636364,0.666667,0.608696,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.011765,-0.301205,-0.5125,-0.481818,1.219512,-0.75,4.454545
1,1.000000,0.000000,0.913043,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,-0.541176,-0.397590,-0.4500,-0.387879,0.292683,-1.00,0.000000
2,0.636364,0.500000,0.391304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3.188235,3.265060,3.3500,3.178788,-0.292683,0.75,0.545455
3,0.363636,0.166667,0.347826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.917647,0.771084,1.1750,0.759091,-0.170732,-0.50,-0.727273
4,0.181818,0.000000,0.347826,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.235294,0.506024,0.2875,0.218182,1.097561,0.75,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192833,0.181818,0.166667,0.608696,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.917647,0.855422,1.1000,0.609091,-0.146341,-0.50,-0.181818
192834,0.000000,0.333333,0.565217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.305882,0.590361,0.5125,0.437879,-0.170732,-0.50,-0.636364
192835,0.272727,0.500000,0.913043,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.270588,1.373494,1.3375,1.377273,0.146341,0.25,0.545455
192836,0.000000,1.000000,0.521739,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.623529,-0.734940,-0.7500,-0.762121,0.463415,3.25,0.000000


In [79]:
P = np.array(final)

P_train, P_test, q_train, q_test = train_test_split(P, y, test_size=0.30, random_state=40)

In [80]:
totes_variables = GradientBoostingRegressor(n_estimators = 200, random_state=0).fit(P_train, q_train)

In [81]:
totes_variables_y_pred = totes_variables.predict(P_test)

In [101]:
table = table.append(results ('all variables', q_test, totes_variables_y_pred), ignore_index = True)

Et voilà: l'aportació de totes aquestes variables m'ajuda a obtenir un elevat grau de predicció!

In [102]:
table

Unnamed: 0,Model,MSE test,R2 test
0,Linear Regression,134.070814,0.956469
1,Random Forest,160.183924,0.94799
2,Gradient Boosting,131.555991,0.957285
3,Linear_Reg_Adj,134.070814,0.956469
4,RandomForest_Adj,158.338898,0.948589
5,GradientBoosting_Adj,131.228732,0.957391
6,Linear Regression Eng,133.582175,0.956627
7,Random Forest Eng,147.006542,0.952268
8,Gradient Boosting Eng,129.615672,0.957915
9,final model,2604.420292,0.154371
