# PIPELINE

In [1]:
# Librerias
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import datetime
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import joblib
from aux_func import print_metrics, matriz_confusion, curva_roc, curva_pr,ganancia, curva_lift, print_metrics_optimized,matriz_confusion_optimizada
#Librarias de modelos
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve

#Transformer
from sklearn.compose import ColumnTransformer

In [2]:
##Enconders
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder

In [3]:
#Configuración de visualización de notebook
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [8]:
#Cargamos la data
data = pd.read_csv('NCDB_1999_to_2014.csv', delim_whitespace=False)

In [9]:
#Cambiamos los nombres de las columnas para pasarlas a minúscula
data.columns= data.columns.str.lower()

In [10]:
data=data.drop_duplicates()

In [11]:
data.columns

Index(['c_year', 'c_mnth', 'c_wday', 'c_hour', 'c_sev', 'c_vehs', 'c_conf',
       'c_rcfg', 'c_wthr', 'c_rsur', 'c_raln', 'c_traf', 'v_id', 'v_type',
       'v_year', 'p_id', 'p_sex', 'p_age', 'p_psn', 'p_isev', 'p_safe',
       'p_user'],
      dtype='object')

In [12]:
# Primero definimos una nueva columna: accidente mortal, que tomará 1s o 0s y contamos cuantos 1 hay (muertos). 
data['acc_mortal'] = data['c_sev'] == 1
data["acc_mortal"] = data["acc_mortal"].astype(int)
data.acc_mortal.sum()

98616

In [13]:
#Vamos a crear un diccionario para poder cambiarlos

# Lista con los numeros str con formato '01','02'.'03'...
categories_00_format=['%.2d' % i for i in range(100)]
categories_00_format

#Esta segunda lista son números normales de 1 al 99 en formato int
mumbers_0_format=[i for i in range(100)]
mumbers_0_format

#Unimos las dos listas anteriores
categories_00_format.extend(mumbers_0_format)

#Esta tercera lista son numeros como string con formato normal que son con los que quiero terminar 
categories0_format=list([str(i) for i in range(100)])
categories0_format.extend(list([str(i) for i in range(100)]))


#Las unimos en un diccionario
zip_iterator = zip(categories_00_format, categories0_format)
diccionary = dict(zip_iterator)

In [14]:
#Quitamos la variable v_id, p_id, c_sev y p_isev ya que es info que no tendremos cuando suceda el accidente o que no aportan valor. 
data=data.drop(['v_id','p_id','c_sev','p_isev'],axis=1)


In [15]:
### df_nulls['v_year'] = df_nulls['v_year'].astype(int)
#Reemplazamos los distintos meses para que sean todos iguales ya que algunos tienne un formato como 01,02 o algunos son int
data['c_mnth']=data['c_mnth'].replace({'01':'1','02':'2',1:'1',2:'2',3:'3',4:'4',5:'5',6:'6',7:'7',8:'8',9:'9',10:'10',11:'11',12:'12'})
#Reeemplazamos c_wday para que todos sean categoricos, ya que algunos son ints 
data['c_wday']=data['c_wday'].replace({'01':'1','02':'2',1:'1',2:'2',3:'3',4:'4',5:'5',6:'6',7:'7',8:'8',9:'9',10:'10',11:'11',12:'12'})
#Reemplazamos c_vehs
data['c_vehs']=data['c_vehs'].replace(diccionary)
#Reemplazamos c_conf
data['c_conf']=data['c_conf'].replace(diccionary)

In [16]:
X=data.drop('acc_mortal',axis=1)
y=data.acc_mortal

In [17]:
#defnimos columnas para crear dos data frames en base al tipo de encoder que vamos a usar para cada columna

ohe_columns=['c_mnth','c_wday','c_hour']
catboost_columns=['c_vehs','p_age','v_year','c_year','c_conf','c_rcfg','c_wthr','c_rsur','c_raln','c_traf','v_type','p_sex','p_psn','p_safe','p_user']
to_be_ingnored= ['v_year','c_year','acc_mortal']


In [18]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.20,random_state=0,stratify=y)

In [19]:
y_test[y_test==1].head(50)

1030147    1
2438320    1
2718122    1
1680156    1
2871756    1
258223     1
4356449    1
4394070    1
4054737    1
3959691    1
2095178    1
5663031    1
5556451    1
4561529    1
1493881    1
3368800    1
2797344    1
884860     1
4432933    1
4188707    1
2759070    1
1347476    1
5357877    1
1300768    1
410495     1
51799      1
5101500    1
627344     1
3543120    1
4797981    1
4853394    1
4532430    1
2244056    1
3523777    1
4173882    1
4472897    1
4995909    1
5440293    1
1925856    1
2793085    1
2340327    1
1226294    1
1161281    1
4616914    1
3367197    1
2385153    1
2214418    1
218315     1
289953     1
638092     1
Name: acc_mortal, dtype: int64

In [20]:
print(X_test.iloc[289953].to_json())

{"c_year":2002,"c_mnth":"12","c_wday":"1","c_hour":"10","c_vehs":"2","c_conf":"23","c_rcfg":"02","c_wthr":"4","c_rsur":"3","c_raln":"1","c_traf":"18","v_type":"01","v_year":"1998","p_sex":"F","p_age":"15","p_psn":"11","p_safe":"02","p_user":"1"}


In [21]:
print(X_test.iloc[51799].to_json())

{"c_year":2008,"c_mnth":"10","c_wday":"2","c_hour":"20","c_vehs":"2","c_conf":"22","c_rcfg":"01","c_wthr":"1","c_rsur":"2","c_raln":"3","c_traf":"18","v_type":"01","v_year":"2004","p_sex":"M","p_age":"40","p_psn":"11","p_safe":"02","p_user":"1"}


In [22]:
ohe_transformer = Pipeline(steps=[
                                ('ohe',OneHotEncoder())])
cat_transformer=Pipeline(steps=[
                                ('cat',ce.cat_boost.CatBoostEncoder())])

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', ohe_transformer, ohe_columns),
        ('cat', cat_transformer, catboost_columns)])

In [24]:
from lightgbm import LGBMClassifier

In [25]:
classifier =  LGBMClassifier()
pipe = Pipeline(steps= [('preprocessor', preprocessor),
                         ('classifier',classifier)])

In [26]:
model_lgm_with_pipe = pipe.fit(X_train,y_train)

In [27]:
y_pred = pipe.predict(X_test)

In [28]:
prob_predictions = model_lgm_with_pipe.predict_proba(X_test)
yhat = prob_predictions[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, yhat)
gmeans = np.sqrt(tpr * (1-fpr))
ix = np.argmax(gmeans)
y_pred_best = (prob_predictions[:,1] >= thresholds[ix]).astype(int)

In [30]:
thresholds[ix]

0.018339461919462164

In [23]:
# Guardamos el modelo
#filename = '../Desktop/aprendizaje_automático/practica_asignatura/productivizacion/modelo.sav'
#joblib.dump(model_lgm_with_pipe, filename)

FileNotFoundError: [Errno 2] No such file or directory: '../Desktop/aprendizaje_automático/practica_asignatura/productivizacion/modelo.sav'

In [None]:
# guardamos las columnas
#model_columns = list(X_train.columns)
#joblib.dump(model_columns, 'model_columns.sav')

__####################################################################################__