# Entrega

## Preparacion

### Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from joblib import dump, load

In [2]:
from preprocessing import reemplazarNulls,reemplazarCategoricas,reemplazarFechas,regularizar,targetBooleano

[###] All Done                                              


In [3]:
from utilities import score2, score, evalModels

### Test Holdout

In [4]:
# Load the hold-out split; features and target live in separate CSV files.
# low_memory=False makes pandas parse the features file in one pass instead of
# in chunks, which avoids mixed-dtype inference on a column.
df_feat = pd.read_csv("datasets/holdout_features.csv", low_memory=False)
df_targ = pd.read_csv("datasets/holdout_target.csv")

In [5]:
# Run the project's preprocessing helpers (imported from `preprocessing`) on the hold-out set.
# NOTE(review): `df_targ` is rebound from the loaded frame to a single column here, so
# re-running this cell without reloading the CSVs feeds that column back into
# targetBooleano -- confirm the helper tolerates it, or reload first.
df_targ=targetBooleano(df_targ).llovieron_hamburguesas_al_dia_siguiente
# The feature helpers are called with inplace=True, so `df_feat` itself is modified
# step by step; the order of the calls is kept exactly as written.
reemplazarNulls(df_feat , inplace=True)
reemplazarCategoricas(df_feat , inplace=True)
reemplazarFechas(df_feat , inplace=True)
# Last expression of the cell: its return value is what the notebook displays.
regularizar(df_feat , inplace=True)

Unnamed: 0,id,direccion_viento_tarde,direccion_viento_temprano,horas_de_sol,humedad_tarde,humedad_temprano,mm_lluvia_dia,nubosidad_tarde,presion_atmosferica_tarde,rafaga_viento_max_direccion,rafaga_viento_max_velocidad,velocidad_viendo_tarde
0,-0.114786,-0.311222,-0.773796,-2.018030,-1.372610,-0.832696,0.345041,0.916539,-1.384970,-0.220589,1.544505,0.834079
1,1.006900,-1.358982,-0.976624,-2.018030,1.085157,0.638263,-0.281713,0.916539,-2.080499,0.802215,0.221665,-0.642648
2,0.008030,-1.149430,-0.570969,-2.018030,-0.023248,1.058537,1.020008,0.916539,-0.064885,-0.016028,-0.219282,-0.869837
3,0.337019,-0.939878,-0.976624,0.653494,-0.360588,-0.044682,-0.281713,0.916539,-0.717831,1.620459,-0.219282,0.152513
4,1.129419,-1.149430,-1.179451,-2.018030,0.362285,0.165455,-0.281713,0.181553,0.559671,-1.038833,-0.219282,0.379701
...,...,...,...,...,...,...,...,...,...,...,...,...
11368,0.913813,-0.101670,-0.773796,-1.515466,0.217710,0.270523,-0.257607,0.916539,0.971310,-0.834272,-0.513247,-0.188270
11369,0.358833,-1.568534,0.645996,-2.018030,-0.264205,-0.517490,-0.281713,-1.655914,-0.121663,-0.220589,-0.660229,-1.097025
11370,-1.586621,-0.101670,-0.570969,0.045127,-1.468993,-1.515641,-0.281713,0.916539,-1.015914,0.393094,3.014328,3.333155
11371,-1.313430,1.574745,1.254478,-2.018030,1.085157,0.795865,-0.209395,0.916539,1.553283,1.211337,0.221665,-0.415459


### Modelos

In [6]:
# Decision-tree model persisted with joblib.
# joblib.load unpickles the file, so only load model artefacts from a trusted source.
arbol = load('models/Tree/arbol_r.sk')

In [7]:
# K-nearest-neighbours model persisted with joblib.
knn = load('models/KNN/knn_auto.sk')

In [8]:
# Naive Bayes model persisted with joblib.
nb = load('models/NB/nb_reg.sk')

In [9]:
# Three persisted SVM models; the file names indicate linear, polynomial and radial variants.
svm_lin = load('models/SVM/svm_lineal.sk')
svm_pol = load('models/SVM/svm_poly.sk')
svm_rad = load('models/SVM/svm_radial.sk')

In [10]:
# Neural-network model persisted with joblib (listed as 'Red Neuronal' in the comparison).
nn = load('models/NN/mlp_cls_r.sk')

In [11]:
# Random-forest ensemble persisted with joblib.
rand_f = load('models/Ensambles/forest_r.sk')

In [12]:
# Boosting ensemble persisted with joblib.
boost = load('models/Ensambles/boost_r.sk')

In [13]:
# vote = load('models/Ensambles/vote_svm.sk')

## Preprocesamientos

preprocesamiento | descripcion | funcion
:--:|:--:|:--:
 convertir target a booleano | Convierte los 'si' y 'no' por True y False | `targetBooleano`
 reemplazar nulls de features | Reemplaza los nulls de los features con un `simple imputer` | `reemplazarNulls`
 reemplazar categoricas de features | convierte las features categoricas en numericas | `reemplazarCategoricas`
 reemplazar fechas de features | convierte las features de fecha en numericas | `reemplazarFechas`
 regularizar features | Normaliza las features y elimina las menos significativas | `regularizar`

## Comparacion con Test Holdout

In [14]:
# Models to compare, as (display name, loaded model, preprocessing label) tuples.
# Every entry carries the same "Todos" label, so it is attached in one place.
modelos = [
    (nombre, modelo, "Todos")
    for nombre, modelo in (
        ('arbol', arbol),
        ('knn', knn),
        ('naive bayes', nb),
        ('svm lineal', svm_lin),
        ('svm poly', svm_pol),
        ('svm radial', svm_rad),
        ('Red Neuronal', nn),
        ('Random Forest', rand_f),
        ('Boosting', boost),
    )
]

In [15]:
# Score every loaded model on the preprocessed hold-out set with evalModels (from
# `utilities`); the result of the call is displayed as the cell output.
evalModels(df_feat, df_targ, modelos)

Unnamed: 0,Modelo,Preprocesamientos,AUC-ROC,Accuracy,Precision,Recall,F1 score
0,arbol,Todos,0.85791,0.841027,0.722659,0.469941,0.569524
1,knn,Todos,0.870998,0.838565,0.770404,0.396857,0.523859
2,naive bayes,Todos,0.829405,0.826167,0.631603,0.53556,0.57963
3,svm lineal,Todos,0.857328,0.838301,0.743785,0.423183,0.539444
4,svm poly,Todos,0.860728,0.838917,0.765846,0.403536,0.528564
5,svm radial,Todos,0.856471,0.838741,0.774094,0.394499,0.522644
6,Red Neuronal,Todos,0.872029,0.846742,0.71536,0.523379,0.604493
7,Random Forest,Todos,0.873621,0.845511,0.760582,0.451866,0.566921
8,Boosting,Todos,0.883838,0.85149,0.737514,0.5222,0.611456


## Conclusion

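**Modelo Recomendado:** Boosting. Evaluado con el test-holdout, es el que obtiene las mejores métricas en AUC-ROC, Accuracy y F1 score; solo es superado en Precision (por svm radial, knn, svm poly, Random Forest y svm lineal) y, por muy poco, en Recall (por Naive Bayes y Red Neuronal).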

- qué modelo elegiríamos si se necesitase tener la menor cantidad de falsos positivos

> Ninguno: los modelos fueron entrenados para optimizar AUC-ROC; si quisiera minimizar la cantidad de FP, entrenaría para optimizar la Precision.
>
> De los modelos entrenados, el de mayor Precision en el test-holdout fue svm radial (0.774), seguido muy de cerca por knn (0.770).

- si necesitan tener una lista de todos los días que potencialmente lloverán hamburguesas al día siguiente sin preocuparse demasiado si metemos en la misma días que realmente no llovieron hamburguesas al día siguiente

> Ninguno: los modelos fueron entrenados para optimizar AUC-ROC; si quisiera minimizar la cantidad de FN, entrenaría para optimizar el Recall.
>
> De los modelos entrenados, el de mayor Recall en el test-holdout fue Naive Bayes (0.536), un valor que de todos modos es bastante bajo.

#### Comparacion con Base Line

In [16]:
# The baseline rule compares raw feature values (e.g. "si", humidity thresholds), so the
# hold-out features are reloaded and only missing values are filled in -- no encoding or scaling.
df_feat_base = pd.read_csv("datasets/holdout_features.csv", low_memory=False)
reemplazarNulls(df_feat_base , inplace=True)

Unnamed: 0,id,barrio,dia,direccion_viento_tarde,direccion_viento_temprano,horas_de_sol,humedad_tarde,humedad_temprano,llovieron_hamburguesas_hoy,mm_evaporados_agua,...,presion_atmosferica_tarde,presion_atmosferica_temprano,rafaga_viento_max_direccion,rafaga_viento_max_velocidad,temp_max,temp_min,temperatura_tarde,temperatura_temprano,velocidad_viendo_tarde,velocidad_viendo_temprano
0,54297,Liniers,2017-02-19,Noroeste,Nornoreste,0.0,23.0,53.0,si,4.0,...,1005.5,1008.5,Noroeste,61.0,23.2,5.9,22.5,13.7,26.0,9.0
1,91989,Caballito,2011-01-17,Estenoreste,Noreste,0.0,74.0,81.0,no,4.0,...,1000.6,1003.5,Sursuroeste,43.0,28.6,18.0,26.3,22.4,13.0,7.0
2,58424,Coghlan,2014-12-04,Noreste,Norte,0.0,51.0,89.0,si,4.0,...,1014.8,1016.4,Oestenoroeste,37.0,31.6,20.1,31.0,21.4,11.0,13.0
3,69479,Villa Soldati,2013-07-29,Nornoreste,Noreste,10.1,44.0,68.0,no,4.4,...,1010.2,1014.7,Estesureste,37.0,32.3,17.6,29.6,24.4,20.0,13.0
4,96106,Barracas,2012-08-20,Noreste,Estenoreste,0.0,59.0,72.0,no,2.8,...,1019.2,1024.1,Noreste,37.0,19.1,7.8,17.9,15.0,22.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11368,88861,Villa Lugano,2016-08-19,Oestenoroeste,Nornoreste,1.9,56.0,74.0,no,3.0,...,1022.1,1024.8,Nornoreste,33.0,15.5,8.1,15.4,10.6,17.0,11.0
11369,70212,Barracas,2012-06-21,Este,suroeste,0.0,46.0,59.0,no,2.0,...,1014.4,1019.0,Noroeste,31.0,19.5,4.5,19.4,13.6,9.0,6.0
11370,4839,Villa Crespo,2015-08-11,Oestenoroeste,Norte,7.8,21.0,40.0,no,5.4,...,1008.1,1014.9,Oestesuroeste,81.0,20.6,4.0,19.0,10.1,48.0,19.0
11371,14019,Almagro,2016-01-16,Estesureste,Sursureste,0.0,74.0,84.0,no,4.0,...,1026.2,1027.6,Sursureste,43.0,22.7,13.2,19.9,16.8,15.0,7.0


In [17]:
def funcion_baseline(row):
    """Hand-written baseline rule for a single observation.

    ``row`` is indexed by column name. The prediction is ``True`` when either:

    * ``llovieron_hamburguesas_hoy`` equals ``"si"`` and at least one of
      ``horas_de_sol < 2``, ``nubosidad_tarde > 7`` or ``humedad_tarde > 70`` holds; or
    * ``mm_lluvia_dia > 10`` or ``humedad_tarde > 80``, regardless of today's value.

    Otherwise it is ``False``. Comparisons against NaN are false, so a missing
    value never triggers a rule on its own.
    """
    llovio_hoy = row["llovieron_hamburguesas_hoy"] == "si"

    # Softer thresholds apply only when it already rained today; the short-circuit
    # keeps the column look-ups lazy and in the original order.
    if llovio_hoy and (
        row['horas_de_sol'] < 2
        or row['nubosidad_tarde'] > 7
        or row["humedad_tarde"] > 70
    ):
        return True

    # Stricter thresholds that apply to every row.
    return bool(row["mm_lluvia_dia"] > 10 or row["humedad_tarde"] > 80)


def baseline(df):
    """Evaluate the baseline rule on every row of ``df``.

    ``funcion_baseline`` is applied row-wise (``axis=1``) and the result of
    ``DataFrame.apply`` is returned unchanged.
    """
    predicciones = df.apply(funcion_baseline, axis=1)
    return predicciones

In [18]:
# Score the rule-based baseline on the hold-out set.
pred_b = baseline(df_feat_base)
# score2 is also given a per-row score; the hard predictions are mapped to two
# fixed levels (80 for a positive call, 20 for a negative one).
prob_b = pred_b.replace({True: 80, False: 20})
roc_auc, acc, prec, rec, f1 = score2(df_targ, pred_b, prob_b)

In [19]:
# Compare the recommended model with the hand-written baseline. The fourth argument
# passes the baseline's already-computed metrics, which show up as an extra row in the output.
evalModels(
    df_feat, df_targ,
    [('Boosting',boost,"Todos")],
    [('Baseline','Reemplazar Nulls',roc_auc, acc, prec, rec, f1)]
)

Unnamed: 0,Modelo,Preprocesamientos,AUC-ROC,Accuracy,Precision,Recall,F1 score
0,Boosting,Todos,0.883838,0.85149,0.737514,0.5222,0.611456
1,Baseline,Reemplazar Nulls,0.698283,0.798382,0.552941,0.517092,0.534416


¡Una mejora impresionante! Boosting supera al baseline en todas las métricas (por ejemplo, AUC-ROC 0.884 frente a 0.698).

## Predicciones

In [20]:
# Features to predict on: use the local cached copy when it exists, otherwise
# download the sheet once and cache it for later runs.
try:
    pred_feat = df = pd.read_csv('predictions/pred_feat.csv', low_memory=False)
except FileNotFoundError:
    # Only a missing cache file triggers the download; any other failure (parse
    # error, interrupt, ...) should surface instead of silently hitting the network.
    pred_feat = df = pd.read_csv('https://docs.google.com/spreadsheets/d/1mR_JNN0-ceiB5qV42Ff9hznz0HtWaoPF3B9zNGoNPY8/export?format=csv', low_memory=False)
    pred_feat.to_csv('predictions/pred_feat.csv')
# to_csv above writes the index as an unnamed column, which read_csv brings back as
# 'Unnamed: 0'. It is only guaranteed to exist on the cached path, so a missing column
# is ignored rather than raising KeyError right after a fresh download.
# (`df` is the same object as `pred_feat`; the alias is kept in case other cells use it.)
pred_feat.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
pred_feat

Unnamed: 0,barrio,dia,direccion_viento_tarde,direccion_viento_temprano,horas_de_sol,humedad_tarde,humedad_temprano,id,llovieron_hamburguesas_hoy,mm_evaporados_agua,...,presion_atmosferica_tarde,presion_atmosferica_temprano,rafaga_viento_max_direccion,rafaga_viento_max_velocidad,temp_max,temp_min,temperatura_tarde,temperatura_temprano,velocidad_viendo_tarde,velocidad_viendo_temprano
0,Villa General Mitre,2014-12-16,Oestesuroeste,Sursureste,13.4,38.0,51.0,116706,,,...,1010.9,1014.4,suroeste,41.0,26.8,8.9,24.9,20.6,28.0,13.0
1,Nueva Pompeya,2010-10-21,Nornoreste,Estesureste,,39.0,57.0,58831,no,,...,1020.2,1023.8,Norte,28.0,23.3,5.0,21.5,14.7,11.0,6.0
2,Constitución,2013-04-09,Estesureste,Oestenoroeste,3.6,73.0,90.0,31981,si,2.4,...,1024.3,1026.7,Oestenoroeste,24.0,22.0,15.6,20.7,16.7,6.0,15.0
3,Agronomía,2016-02-05,Sureste,Sureste,,34.0,47.0,2533,no,,...,1015.8,1018.3,Sureste,30.0,29.9,14.2,27.0,20.0,11.0,15.0
4,Balvanera,2012-06-05,suroeste,Noroeste,,77.0,87.0,7270,no,2.0,...,1007.6,1006.0,suroeste,39.0,11.5,5.5,11.2,7.0,20.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29087,Parque Chas,2013-04-24,suroeste,Oestenoroeste,,71.0,77.0,73456,no,,...,1018.9,1021.2,Oeste,37.0,19.8,9.8,17.3,12.8,9.0,13.0
29088,Belgrano,2015-10-30,Norte,Noreste,,37.0,64.0,14471,no,,...,1017.9,1021.8,Nornoreste,41.0,29.3,15.6,27.8,20.2,15.0,28.0
29089,Villa Crespo,2011-08-09,Nornoreste,Norte,10.1,31.0,77.0,106482,no,3.2,...,1011.1,1016.3,suroeste,41.0,19.8,5.5,18.6,11.1,20.0,11.0
29090,Caballito,2017-04-25,Nornoreste,Norte,,81.0,90.0,21057,no,,...,1008.2,1014.6,Nornoreste,39.0,25.4,17.8,22.0,19.5,33.0,15.0


In [21]:
# Column order to impose on the prediction features (`id` first, matching the layout
# of the hold-out features shown earlier in the notebook).
# NOTE: reindex silently creates an all-NaN column for any name missing from the input.
cols = [
    'id', 'barrio', 'dia',
    'direccion_viento_tarde', 'direccion_viento_temprano',
    'horas_de_sol', 'humedad_tarde', 'humedad_temprano',
    'llovieron_hamburguesas_hoy', 'mm_evaporados_agua', 'mm_lluvia_dia',
    'nubosidad_tarde', 'nubosidad_temprano',
    'presion_atmosferica_tarde', 'presion_atmosferica_temprano',
    'rafaga_viento_max_direccion', 'rafaga_viento_max_velocidad',
    'temp_max', 'temp_min', 'temperatura_tarde', 'temperatura_temprano',
    'velocidad_viendo_tarde', 'velocidad_viendo_temprano',
]
pred_feat = pred_feat.reindex(cols, axis=1)
# Keep an independent copy of the raw ids: `pred_feat` is preprocessed in place
# afterwards (the `id` column ends up standardised), and the exported predictions
# must carry the original identifiers.
ids = pred_feat.id.copy()

In [22]:
# Same preprocessing steps, in the same order, as applied to the hold-out features.
# Each helper is called with inplace=True, so `pred_feat` itself is modified.
reemplazarNulls(pred_feat , inplace=True)
reemplazarCategoricas(pred_feat , inplace=True)
reemplazarFechas(pred_feat , inplace=True)
# Last expression of the cell: its return value is what the notebook displays.
regularizar(pred_feat , inplace=True)

Unnamed: 0,id,direccion_viento_tarde,direccion_viento_temprano,horas_de_sol,humedad_tarde,humedad_temprano,mm_lluvia_dia,nubosidad_tarde,presion_atmosferica_tarde,rafaga_viento_max_direccion,rafaga_viento_max_velocidad,velocidad_viendo_tarde
0,1.742460,0.317434,1.254478,1.526368,-0.649737,-0.937764,-0.281713,0.916539,-0.618469,0.597655,0.074682,1.061268
1,0.020142,-0.939878,1.660133,-2.018030,-0.601546,-0.622559,-0.281713,-1.288420,0.701616,-0.629711,-0.880703,-0.869837
2,-0.778894,1.574745,0.037513,-1.065804,1.036966,1.111071,0.345041,-0.553434,1.283588,-0.016028,-1.174667,-1.437808
3,-1.655245,1.365193,1.457306,-2.018030,-0.842503,-1.147901,-0.281713,0.916539,0.077059,1.415898,-0.733720,-0.869837
4,-1.514276,0.526985,-0.165314,-2.018030,1.229732,0.953468,-0.257607,1.284033,-1.086887,0.597655,-0.072300,0.152513
...,...,...,...,...,...,...,...,...,...,...,...,...
29087,0.455372,0.526985,0.037513,-2.018030,0.940583,0.428126,-0.281713,0.916539,0.517088,0.188533,-0.219282,-1.097025
29088,-1.299979,-0.730326,-0.976624,-2.018030,-0.697929,-0.254819,-0.281713,-1.288420,0.375143,-0.834272,0.074682,-0.415459
29089,1.438201,-0.939878,-0.570969,0.653494,-0.987078,0.428126,-0.281713,-0.920927,-0.590081,0.597655,0.074682,0.152513
29090,-1.103985,-0.939878,-0.570969,-2.018030,1.422498,1.111071,-0.281713,0.916539,-1.001720,-0.834272,-0.072300,1.629239


In [23]:
%%time
# Write one predictions file per model: the original ids plus the predicted label,
# with booleans converted back to the 'si'/'no' strings used by the target.
for name, model, preproc in modelos:
    pred_targ = pd.Series(model.predict(pred_feat)).replace({False: 'no', True: 'si'})
    pred_df = pd.concat(
        [ids, pred_targ],
        keys=['id', 'llovieron_hamburguesas_al_dia_siguiente'],
        axis=1,
    )
    pred_df.set_index('id').to_csv(f'predictions/{name}.csv')

CPU times: user 3min 16s, sys: 927 ms, total: 3min 17s
Wall time: 3min 18s
