# Library import

In [17]:
#Import de librerias basicas tablas y matrices
import numpy as np 
import pandas as pd 

#Gradient Boosting
import lightgbm as lgb

#Funciones auxiliares sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold #Split y cross Validation
from sklearn.metrics import cohen_kappa_score, accuracy_score, balanced_accuracy_score #Metricas
from sklearn.utils import shuffle 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

#Visualizacióon
from plotly import express as px

#Plot de matriz de confusion normalizada en actuals
import sys
sys.path.append('../Scripts')
from utils import plot_confusion_matrix

import os

#Optimizacion de hiperparametros
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

#Guardado de objetos en archivos joblib
from joblib import load, dump

# Env

In [18]:
# Paths para acceso archivos
#Este notebook asume la siguiente estructura de carpetas a partir de la ubicacion de base_dir 
#(dos niveles arriba de la carpeta donde se ejecuta el notebook). 
# /UA_MDM_LDI_II/
# /UA_MDM_LDI_II/input
# /UA_MDM_LDI_II/input/petfinder-adoption-prediction/            <- Aca deben ir todos los archivos de datos de la competencia 
# /UA_MDM_LDI_II/tutoriales/                       <- Aca deben poner los notebooks y scripts que les compartimos
# /UA_MDM_LDI_II/work/                             <- Resultados de notebooks iran dentro de esta carpeta en subcarpetas
# /UA_MDM_LDI_II/work/models/                     <- Modelos entrenados en archivos joblibs
# /UA_MDM_LDI_II/work/optuna_temp_artifacts/      <- Archivos que queremos dejar como artefacto de un trial de optuna (optuna los copiara a la carpeta de abajo)
# /UA_MDM_LDI_II/work/optuna_artifacts/           <- Archivos con artefactos que sibimos a optuna

#Subimos dos niveles para quedar en la carpeta que contiene input y UA_MDM_LDI_II
BASE_DIR = '../'

#Datos de entrenamiento 
PATH_TO_TRAIN = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/train/train.csv")
PATH_TO_TEST = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/test/test.csv")

#Salida de modelos entrenados
PATH_TO_MODELS = os.path.join(BASE_DIR, "work/models")

#Artefactos a subir a optuna
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "work/optuna_temp_artifacts")

#Artefactos que optuna gestiona
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "work/optuna_artifacts")


SEED = 42 #Semilla de procesos aleatorios (para poder replicar exactamente al volver a correr un modelo)
TEST_SIZE = 0.2 #Facción para train/test= split

In [19]:
# Datos Tabulares
df_train = pd.read_csv(PATH_TO_TRAIN)
df_train.shape

(14993, 24)

In [20]:
# Datos Tabulares
df_test = pd.read_csv(PATH_TO_TEST)
df_test.shape

(3972, 23)

### Categorical variables


| Variable       | Type         | Description                                                                           |
|----------------|--------------|---------------------------------------------------------------------------------------|
| `PetID`        | Categorical  | ID (Should be dropped)                                                                |
| `AdoptionSpeed`| Categorical  | Target variable                                                                       |
| `Type`         | Categorical  | 1 = Cat, 2 = Dog                                                                      |
| `Name`         | Categorical  | Name of pet                                                                           |
| `Breed1`       | Categorical  | See BreedLabels dictionary                                                            |
| `Breed2`       | Categorical  | See BreedLabels dictionary                                                            |
| `Gender`       | Categorical  | 1 = Male, 2 = Female, 3 = Mixed (used for groups)                                     |
| `Color1`       | Categorical  | See ColorLabels dictionary                                                            |
| `Color2`       | Categorical  | See ColorLabels dictionary                                                            |
| `Color3`       | Categorical  | See ColorLabels dictionary                                                            |
| `MaturitySize` | Categorical  | 1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified                  |
| `FurLength`    | Categorical  | 1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified                                    |
| `Vaccinated`   | Categorical  | 1 = Yes, 2 = No, 3 = Not Sure                                                         |
| `Dewormed`     | Categorical  | 1 = Yes, 2 = No, 3 = Not Sure                                                         |
| `Sterilized`   | Categorical  | 1 = Yes, 2 = No, 3 = Not Sure                                                         |
| `Health`       | Categorical  | 1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified                  |
| `State`        | Categorical  | See StateLabels dictionary                                                            |
| `RescuerID`    | Categorical  | ID                                                                                    |
| `Description`  | Text         | Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese. |

### Quantitative Variables

| Variable   | Type             |Description                                           |
|------------|------------------|------------------------------------------------------|
| `Age`      |  Numerical       |Age of pet when listed, in months                     |
| `Quantity` |  Numerical       |Number of pets represented in profile                 |
| `Fee`      |  Numerical       |Adoption fee (0 = Free)                               |
| `VideoAmt` |  Numerical       |Total uploaded videos for this pet                    |
| `PhotoAmt` |  Numerical       |Total uploaded photos for this pet                    |

# FE

In [21]:

def apply_fe(dataset):
    # Name
    # Feature to know if the pet has a name
    dataset['Name'] = np.where(dataset['Name'].str.lower().str.contains('name|puppies|kitten|puppy|unknown'), np.nan, dataset['Name'])
    dataset['HasName'] = dataset['Name'].apply(lambda x: 0 if pd.isnull(x) else 1)

    
    # Breed
    # Unify values in Breed1
    dataset['Breed1'] = np.where((dataset['Breed1']==0) & (dataset['Breed2']!=0), dataset['Breed2'], dataset['Breed1'])
    dataset['Breed2'] = np.where((dataset['Breed1']==dataset['Breed2']), 0, dataset['Breed2'])
    
    # Merge Breed1 and Breed2
    dataset['FullBreed'] = dataset['Breed1'].astype(str) + '_' + dataset['Breed2'].astype(str) 
    # Pure breed
    dataset['PureBreed'] = np.where((dataset['Breed2'] == 0) & (~dataset['Breed1'].isin([307, 266, 265, 264])), 1, 0)
    
    # Color
    # Merge Color1, Color2 and Color3
    dataset['Color'] = dataset['Color1'].astype(str) + '_' + dataset['Color2'].astype(str) + '_' + dataset['Color3'].astype(str)
    # Monochromatic
    dataset['Monochromatic'] = np.where((dataset['Color2'] == 0) & (dataset['Color3'] == 0), 1, 0)
    
    # Health
    # Merge Vaccinated, Dewormed and Sterilized to know if the pet is up to date with routine stuff
    dataset['Va_De_St'] = dataset['Vaccinated'].astype(str) + '_' + dataset['Dewormed'].astype(str) + '_' + dataset['Sterilized'].astype(str)

    # Code to add a min age for sterilization.... But EDA shows that they do it anyway
    #df['CanBeSter'] = np.where((df['Age'] > 9) & (df['Type'] == 1) | (df['Age'] > 6) & (df['Type'] == 2), 1, 0)
    
    # Fee
    # Create fee bins using log transformation
    dataset['Fee_bins'] = pd.cut(np.log1p(dataset.Fee), 5, labels=['Fee_{}'.format(e) for e in range(5)])

    #RescuerID
    rescuer_count = dataset.groupby(['RescuerID'])['PetID'].count().reset_index()
    rescuer_count.columns = ['RescuerID', 'Rescuer_count']
    dataset = dataset.merge(rescuer_count, how='left', on='RescuerID')

    # Age
    dataset['RelAge'] = np.where(dataset['Type'] == 1, dataset['Age']/144, dataset['Age']/180) # Vida media de un gato 12 años, de un perro 15

    # Multimedia
    dataset['Total_photo_video'] = dataset['PhotoAmt'] + dataset['VideoAmt']

    # State

    
    # Return the augmented dataset
    return dataset

In [22]:
df_train = apply_fe(df_train.copy())
df_test = apply_fe(df_test.copy())

In [23]:
# Calculate the mean adoption speed per state and add that value to train and test datasets
state_mean = df_train.groupby('State')['AdoptionSpeed'].mean().reset_index()
state_mean.columns = ['State', 'AdoptionSpeed_mean']
df_train = df_train.merge(state_mean, how='left', on='State')
df_test = df_test.merge(state_mean, how='left', on='State')

In [24]:
df_train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'HasName', 'FullBreed', 'PureBreed', 'Color', 'Monochromatic',
       'Va_De_St', 'Fee_bins', 'Rescuer_count', 'RelAge', 'Total_photo_video',
       'AdoptionSpeed_mean'],
      dtype='object')

In [25]:
#Armo listas con features de texto y numericas
char_feats = ['Type', 'Name', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health',  'Fee', 'State', 'RescuerID',       
       'HasName', 'FullBreed', 'PureBreed', 'Color', 'Monochromatic',
       'Va_De_St', 'Fee_bins',   
       ]
numeric_feats = ['Age','Quantity','PhotoAmt','VideoAmt','Total_photo_video','RelAge','Rescuer_count','AdoptionSpeed_mean']

fe_drop = ['Description', 'PetID',  'AdoptionSpeed',]

In [26]:
target = 'AdoptionSpeed'
fe_drop = ['PetID', 'Description', 'AdoptionSpeed']
char_feats = list(set(char_feats) - set(fe_drop))
y = df_train[target]
X = df_train.drop(fe_drop, axis=1)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=TEST_SIZE, 
                                                    random_state=SEED, 
                                                    stratify=y)

In [28]:
len(X_train.columns.to_list()), len(char_feats)

(32, 24)

In [29]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(min_frequency= 30, handle_unknown= 'use_encoded_value', unknown_value= -1), char_feats)],
        remainder= 'passthrough')

In [30]:
X_train2 = preprocessor.fit_transform(X_train)
X_test2 = preprocessor.transform(X_test)

In [31]:
X_train.dtypes

Type                     int64
Name                    object
Age                      int64
Breed1                   int64
Breed2                   int64
Gender                   int64
Color1                   int64
Color2                   int64
Color3                   int64
MaturitySize             int64
FurLength                int64
Vaccinated               int64
Dewormed                 int64
Sterilized               int64
Health                   int64
Quantity                 int64
Fee                      int64
State                    int64
RescuerID               object
VideoAmt                 int64
PhotoAmt               float64
HasName                  int64
FullBreed               object
PureBreed                int64
Color                   object
Monochromatic            int64
Va_De_St                object
Fee_bins              category
Rescuer_count            int64
RelAge                 float64
Total_photo_video      float64
AdoptionSpeed_mean     float64
dtype: o

In [32]:
#Entreno un modelo inicial sin modificar hiperparametros. Solamente especifico el numero de clases y el tipo de modelo como clasificacoión
lgb_params = params = {
                        'objective': 'multiclass',
                        'num_class': 5
                        }


#genero el objeto Dataset que debo pasarle a lightgbm para que entrene
lgb_train_dataset = lgb.Dataset(data=X_train2,
                                label=y_train)

#entreno el modelo con los parametros por defecto
lgb_model = lgb.train(lgb_params,
                      lgb_train_dataset)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 11994, number of used features: 32
[LightGBM] [Info] Start training from score -3.599148
[LightGBM] [Info] Start training from score -1.579379
[LightGBM] [Info] Start training from score -1.311924
[LightGBM] [Info] Start training from score -1.526206
[LightGBM] [Info] Start training from score -1.273359


In [33]:
y_pred = lgb_model.predict(X_test2).argmax(axis=1)

#Calculo el Kappa
cohen_kappa_score(y_test,y_pred, weights = 'quadratic')

0.3537168666398812