# Entrega

## Preparacion

### Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from joblib import dump, load

In [2]:
from preprocessing import reemplazarNulls,reemplazarCategoricas,reemplazarFechas,regularizar,targetBooleano
from preprocessing import reemplazarCategoricas_OHE, keepFeat_OHE, reemplazarNullsNum
from preprocessing import reemplazarCategoricas_HashTrick, normalizar_HashTrick

[###] Initial Preprocessings Done                           
[###] Aditional Preprocessings Done                                                   


In [3]:
from utilities import score2

### Test Holdout

In [4]:
# Hold-out features: read the CSV, then use the `id` column as the index.
df_feat = pd.read_csv("datasets/holdout_features.csv", low_memory=False)
df_feat = df_feat.set_index('id')
# Hold-out target: kept with the default index, exactly as read.
df_targ = pd.read_csv("datasets/holdout_target.csv")

## Preprocesamientos

preprocesamiento | descripcion | funcion
:--:|:--:|:--:
convertir target a booleano | Convierte los 'si' y 'no' por True y False | `targetBooleano`
reemplazar nulls de todas las features | Reemplaza los nulls de los features con un `simple imputer` | `reemplazarNulls`
tratar missings numericos| Reemplaza missings por su media y una feature bool de missing |`reemplazarNullsNum`
reemplazar categoricas de features | convierte las features categoricas en numericas | `reemplazarCategoricas`
reemplazar fechas de features | convierte las features de fecha en numericas | `reemplazarFechas`
regularizar features | Normaliza las features y elimina las menos significativas mediante lasso | `regularizar`
escalar features | Luego de normalizar features pueden ser escaladas segun el peso asignado por lasso | `regularizar`
One Hot | Reemplazar features categoricas con one hot encoding | `reemplazarCategoricas_OHE`
Seleccion OHE | Selecciona los `N` features mas significativos | `keepFeat_OHE(N)`
Hash Trick | Reemplazar features categoricas con hash trick | `reemplazarCategoricas_HashTrick`
Normalizar HT | Normaliza las features resultantes de hash trick | `normalizar_HashTrick`

identificacion | preprocesamientos
:--:|:--:
`Comun` | `targetBooleano` `reemplazarFechas`
`BAS` | `Comun` `reemplazarNulls` `reemplazarCategoricas`
`REG` | `BAS` `regularizar`
`OHE` | `Comun` `reemplazarCategoricas_OHE`
`OHE(N)` | `OHE` `keepFeat_OHE(N)`
`HT` | `Comun` `reemplazarCategoricas_HashTrick`
`HTN` | `HT` `normalizar_HashTrick`

In [5]:
# Convert the target in place, then keep only the label column (a Series).
# NOTE(review): this cell rebinds df_targ and mutates df_feat in place, so
# re-running it without reloading the data will probably not reproduce the
# same state -- confirm before relying on a partial re-run.
targetBooleano(df_targ, inplace=True)
df_targ = df_targ.llovieron_hamburguesas_al_dia_siguiente

# Derive the one-hot and hash-trick variants BEFORE df_feat is modified in
# place below. NOTE(review): presumably both helpers return new frames and
# leave df_feat untouched -- confirm in preprocessing.py.
ohe_feat = reemplazarCategoricas_OHE(df_feat)
ht_feat = reemplazarCategoricas_HashTrick(df_feat)

# "BAS" feature set: nulls, categoricals and dates handled in place on df_feat.
reemplazarNulls(df_feat , inplace=True)
reemplazarCategoricas(df_feat , inplace=True)
reemplazarFechas(df_feat , inplace=True)

# "REG" feature set: built from the BAS features.
df_reg = regularizar(df_feat)

# "OHE" feature set, plus a 10-feature selection of it.
# NOTE(review): ohe_feat2 is not referenced by any cell visible in this
# notebook; the OHE models below are scored on ohe_feat.
reemplazarNullsNum(ohe_feat, inplace=True)
reemplazarFechas(ohe_feat , inplace=True)
ohe_feat2 = keepFeat_OHE(ohe_feat, 10)

# "HT" feature set, then its normalized "HTN" version.
reemplazarFechas(ht_feat , inplace=True)
reemplazarNullsNum(ht_feat, inplace=True)
ht_feat2 = normalizar_HashTrick(ht_feat)

## Comparacion con Test Holdout

In [6]:
# Empty accumulator; each model cell below adds its score table to it.
predictions = pd.DataFrame()

In [7]:
def predict(model, name, preproc, feat):
    """Score ``model`` on ``feat`` against the global hold-out target.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    name : label for the model, forwarded to ``score2``.
    preproc : label for the preprocessing used, forwarded to ``score2``.
    feat : feature matrix handed to the model.

    Returns
    -------
    Whatever ``score2`` returns for the global ``df_targ``, the predicted
    labels and column 1 of ``predict_proba`` (presumably the probability
    of the positive class -- confirm against the models' ``classes_``).
    """
    labels = model.predict(feat)
    positive_scores = model.predict_proba(feat)[:, 1]
    return score2(name, preproc, df_targ, labels, positive_scores)

### Modelos

#### Arbol

In [8]:
# Load the persisted decision-tree model with joblib. joblib.load unpickles
# the file, so only load model files that come from a trusted source.
arbol = load('models/Tree/tree.sk')

In [9]:
# Score the decision tree on the hold-out set with the BAS features.
pdf = predict(arbol, "Arbol", "BAS", df_feat)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.8545815778158975

#### Knn

In [10]:
# Load the persisted KNN model with joblib (trusted files only: it unpickles).
knn = load('models/KNN/knn.sk')

In [11]:
# Score KNN on the hold-out set with the REG features.
pdf = predict(knn, "KNN", "REG", df_reg)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.8731459243361229

#### Naive Bayes

In [12]:
# Load the persisted Naive Bayes model with joblib (trusted files only: it unpickles).
nb = load('models/NB/nb.sk')

In [13]:
# Score Naive Bayes on the hold-out set with the REG features.
pdf = predict(nb, "Naive Bayes", "REG", df_reg)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.8294019163885583

#### SVM (Poly)

In [14]:
# Load the persisted SVM model with joblib (trusted files only: it unpickles).
svm = load('models/SVM/svm.sk')

In [15]:
# Score the SVM on the hold-out set with the OHE features.
pdf = predict(svm, "SVM (Poly)", "OHE", ohe_feat)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.8743984357683136

#### Red Neuronal

In [16]:
# Load the persisted neural-network model with joblib (trusted files only: it unpickles).
nn = load('models/NN/nn.sk')

In [17]:
# Score the neural network on the hold-out set with the normalized
# hash-trick (HTN) features.
pdf = predict(nn, "Red Neuronal", "HTN", ht_feat2)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.8779010880721546

#### Random Forest

In [18]:
# Load the persisted random-forest model with joblib (trusted files only: it unpickles).
random_forest = load('models/Ensambles/random_forest.sk')

In [19]:
# Score the random forest on the hold-out set with the BAS features.
pdf = predict(random_forest, "Random Forest", "BAS", df_feat)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.8733757476434599

#### Boosting

In [20]:
# Load the persisted boosting model with joblib (trusted files only: it unpickles).
boost = load('models/Ensambles/boost.sk')

In [21]:
# Score the boosting model on the hold-out set with the OHE features.
pdf = predict(boost, "BOOST", "OHE", ohe_feat)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of this model's score table.
pdf['AUC-ROC'][0]

0.9032807293813307

## Resultados

In [22]:
predictions

Unnamed: 0,Modelo,Preprocesamientos,Clase,AUC-ROC,Accuracy,Precision,Recall,F1 score,Support
0,Arbol,BAS,AVG,0.854582,0.840587,0.829657,0.840587,0.828618,11373
1,Arbol,BAS,True,,,0.713536,0.48055,0.574313,2545
2,Arbol,BAS,False,,,0.863133,0.944382,0.901931,8828
0,KNN,REG,AVG,0.873146,0.842786,0.833702,0.842786,0.825936,11373
1,KNN,REG,True,,,0.760137,0.434578,0.553,2545
2,KNN,REG,False,,,0.85491,0.960467,0.90462,8828
0,Naive Bayes,REG,AVG,0.829402,0.825991,0.817808,0.825991,0.820698,11373
1,Naive Bayes,REG,True,,,0.63114,0.535167,0.579205,2545
2,Naive Bayes,REG,False,,,0.871622,0.909832,0.890318,8828
0,SVM (Poly),OHE,AVG,0.874398,0.849028,0.84039,0.849028,0.835431,11373


## Conclusion

**Modelo Recomendado:** Boosting. Evaluando con el test-holdout, es el que obtiene las mejores métricas en todos los campos excepto Precision.

- qué modelo elegiríamos si se necesitase tener la menor cantidad de falsos positivos

> Ninguno: los modelos fueron entrenados para optimizar AUC-ROC. Si quisiera minimizar la cantidad de FP, entrenaría para optimizar la Precision.
>
> De los modelos entrenados, el de mayor Precision en el test-holdout fue knn. 

- si necesitan tener una lista de todos los días que potencialmente lloverán hamburguesas al día siguiente sin preocuparse demasiado si metemos en la misma días que realmente no llovieron hamburguesas al día siguiente

> Ninguno: los modelos fueron entrenados para optimizar AUC-ROC. Si quisiera minimizar la cantidad de FN, entrenaría para optimizar el Recall.
>
> De los modelos entrenados, el de mayor Recall en el test-holdout fue Naive Bayes, aunque su Recall sigue siendo bastante bajo.

#### Comparacion con Base Line

In [23]:
# Reload the raw hold-out features for the rule-based baseline, which compares
# the original (un-encoded) column values such as "si".
df_feat_base = pd.read_csv("datasets/holdout_features.csv", low_memory=False).set_index('id')
# NOTE(review): the output recorded for this cell is a ValueError from a
# SimpleImputer (23 features received, 22 expected, `id` reported as unseen).
# That does not obviously match a frame already indexed by `id`, so the
# recorded output may be stale -- re-run and confirm before trusting the
# baseline cells that follow.
reemplazarNullsNum(df_feat_base , inplace=True)

Feature names unseen at fit time:
- id
Feature names must be in the same order as they were in fit.



ValueError: X has 23 features, but SimpleImputer is expecting 22 features as input.

In [None]:
def funcion_baseline(row):
    """Rule-based baseline prediction for a single row.

    ``row`` is any mapping-like object indexable by column name.

    Returns True when either of these holds, False otherwise:
      * ``llovieron_hamburguesas_hoy`` is "si" and at least one of
        ``horas_de_sol < 2``, ``nubosidad_tarde > 7`` or
        ``humedad_tarde > 70`` is true;
      * ``mm_lluvia_dia > 10`` or ``humedad_tarde > 80``.

    Comparisons against NaN are false, so missing numeric values never
    trigger a rule.
    """
    rained_today = row["llovieron_hamburguesas_hoy"] == "si"
    # The conditions short-circuit in the same order as the original nested ifs.
    if rained_today and (
        row['horas_de_sol'] < 2
        or row['nubosidad_tarde'] > 7
        or row["humedad_tarde"] > 70
    ):
        return True

    # Rules that apply whether or not it rained today.
    return bool(row["mm_lluvia_dia"] > 10 or row["humedad_tarde"] > 80)

def baseline(df):
    """Apply ``funcion_baseline`` to every row of ``df``.

    Returns the result of the row-wise ``DataFrame.apply`` call, i.e. one
    baseline prediction per row.
    """
    per_row_prediction = df.apply(funcion_baseline, axis=1)
    return per_row_prediction

In [None]:
# Rule-based predictions for the hold-out features (boolean Series).
pred = baseline(df_feat_base)
# The baseline has no real probabilities: use a fixed score of 80 for True and
# 20 for False. (The original read from an undefined name, `pred_b`.)
prob = pred.replace({True:80,False:20})
# `prob` is already one-dimensional, so it is passed as is; the original
# `prob[:,1]` is a 2-D index that a Series does not support. Plain arrays are
# passed, matching what `predict` hands to score2 for the trained models.
pdf = score2( "Baseline", "reemplazar nulls", df_targ, pred.to_numpy(), prob.to_numpy() )
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps each score table's own row labels, as append did.
predictions = pd.concat([predictions, pdf])
# Display the AUC-ROC stored in row 0 of the baseline score table.
pdf['AUC-ROC'][0]

In [None]:
predictions

¡Una mejora impresionante!

## Predicciones

In [None]:
# Read the features to predict from the local cache; on a cache miss download
# them from the shared spreadsheet and write the cache for the next run.
try:
    df_feat = pd.read_csv('predictions/pred_feat.csv', low_memory=False)
except FileNotFoundError:
    # Only a missing cache file triggers the download; any other error
    # (parse failure, permissions, ...) is surfaced instead of being hidden.
    df_feat = pd.read_csv('https://docs.google.com/spreadsheets/d/1mR_JNN0-ceiB5qV42Ff9hznz0HtWaoPF3B9zNGoNPY8/export?format=csv', low_memory=False)
    df_feat.to_csv('predictions/pred_feat.csv')
# The cached copy carries the index written by to_csv as an 'Unnamed: 0'
# column, while a fresh download does not, so drop it only when present.
# (The original operated on an undefined name, `pred_feat`.)
df_feat = df_feat.drop(columns='Unnamed: 0', errors='ignore')
df_feat

In [None]:
# Column set and order to keep for prediction; `id` becomes the index.
cols = [
    'id',
    'barrio',
    'dia',
    'direccion_viento_tarde',
    'direccion_viento_temprano',
    'horas_de_sol',
    'humedad_tarde',
    'humedad_temprano',
    'llovieron_hamburguesas_hoy',
    'mm_evaporados_agua',
    'mm_lluvia_dia',
    'nubosidad_tarde',
    'nubosidad_temprano',
    'presion_atmosferica_tarde',
    'presion_atmosferica_temprano',
    'rafaga_viento_max_direccion',
    'rafaga_viento_max_velocidad',
    'temp_max',
    'temp_min',
    'temperatura_tarde',
    'temperatura_temprano',
    'velocidad_viendo_tarde',
    'velocidad_viendo_temprano',
]
# reindex (rather than plain selection) also reorders the columns and fills
# any column missing from the input with NaN instead of raising.
df_feat = df_feat.reindex(columns=cols)
df_feat = df_feat.set_index("id")

In [None]:
# Same preprocessing sequence as the hold-out section, now applied to the
# prediction features; it rebinds the same variable names.
# Derive the one-hot and hash-trick variants BEFORE df_feat is modified in
# place below. NOTE(review): presumably both helpers return new frames and
# leave df_feat untouched -- confirm in preprocessing.py.
ohe_feat = reemplazarCategoricas_OHE(df_feat)
ht_feat = reemplazarCategoricas_HashTrick(df_feat)

# "BAS" feature set: nulls, categoricals and dates handled in place on df_feat.
reemplazarNulls(df_feat , inplace=True)
reemplazarCategoricas(df_feat , inplace=True)
reemplazarFechas(df_feat , inplace=True)

# "REG" feature set: built from the BAS features.
df_reg = regularizar(df_feat)

# "OHE" feature set, plus a 10-feature selection of it.
# NOTE(review): ohe_feat2 is not referenced by any cell visible in this notebook.
reemplazarNullsNum(ohe_feat, inplace=True)
reemplazarFechas(ohe_feat , inplace=True)
ohe_feat2 = keepFeat_OHE(ohe_feat, 10)

# "HT" feature set, then its normalized "HTN" version.
reemplazarFechas(ht_feat , inplace=True)
reemplazarNullsNum(ht_feat, inplace=True)
ht_feat2 = normalizar_HashTrick(ht_feat)

In [None]:
def save_pred(name, model, feat):
    """Predict with ``model`` on ``feat`` and return the labels as 'si'/'no'.

    Parameters
    ----------
    name : identifier for the prediction set. Currently unused: nothing is
        written to disk here (TODO: persist the predictions to
        ``predictions/{name}.csv`` together with their ids).
    model : fitted estimator exposing ``predict``.
    feat : feature matrix handed to ``model.predict``.

    Returns
    -------
    pandas.Series with a fresh default index, where True is replaced by 'si'
    and False by 'no'.
    """
    label_for = {False:'no', True:'si'}
    raw_labels = pd.Series(model.predict(feat))
    return raw_labels.replace(label_for)

In [None]:
# Predict with the decision tree on the BAS-preprocessed prediction features;
# the returned Series is displayed as this cell's output.
save_pred("arbol",arbol,df_feat)