# Library import

In [6]:
#Import de librerias basicas tablas y matrices
import numpy as np 
import pandas as pd 

#Gradient Boosting
import lightgbm as lgb

#Funciones auxiliares sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold #Split y cross Validation
from sklearn.metrics import cohen_kappa_score, accuracy_score, balanced_accuracy_score #Metricas
from sklearn.utils import shuffle 

#Visualización
from plotly import express as px

#Plot de matriz de confusion normalizada en actuals
import sys
sys.path.append('../Scripts')
from utils import plot_confusion_matrix

import os

#Optimizacion de hiperparametros
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

#Guardado de objetos en archivos joblib
from joblib import load, dump

# Env

In [7]:
# Global constants and filesystem locations used throughout the notebook.

# Seed for every random process, so a re-run reproduces the same results.
SEED = 42
# Fraction of the rows held out by the train/test split.
TEST_SIZE = 0.2

# Folder layout this notebook expects, starting at BASE_DIR:
# /UA_MDM_LDI_II/
# /UA_MDM_LDI_II/input
# /UA_MDM_LDI_II/input/petfinder-adoption-prediction/   <- all the competition data files go here
# /UA_MDM_LDI_II/tutoriales/                            <- shared notebooks and scripts go here
# /UA_MDM_LDI_II/work/                                  <- notebook results, one subfolder each
# /UA_MDM_LDI_II/work/models/                           <- trained models stored as joblib files
# /UA_MDM_LDI_II/work/optuna_temp_artifacts/            <- files staged as artifacts of an optuna trial
#                                                          (optuna copies them to the folder below)
# /UA_MDM_LDI_II/work/optuna_artifacts/                 <- artifacts uploaded to / managed by optuna

# BASE_DIR is the parent of the folder the notebook runs from ('../' = one level up).
# NOTE(review): the previous comments said "two levels up", but the value only goes up
# one level — confirm which one matches the real folder layout.
BASE_DIR = '../'

# Folder where optuna keeps the artifacts it manages.
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "work/optuna_artifacts")

# Staging folder for files that will be uploaded to optuna as artifacts.
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "work/optuna_temp_artifacts")

# Output folder for trained models.
PATH_TO_MODELS = os.path.join(BASE_DIR, "work/models")

# Training data (CSV).
PATH_TO_TRAIN = os.path.join(BASE_DIR, "input/petfinder-adoption-prediction/train/train.csv")

In [23]:
# Load the tabular training data and discard the two identifier columns
# (PetID, RescuerID) in a single chained expression, so re-running the cell
# always rebuilds `dataset` from the CSV.
dataset = pd.read_csv(PATH_TO_TRAIN).drop(columns=['PetID', 'RescuerID'])
# Show the (rows, columns) shape left after dropping the IDs.
dataset.shape

(14993, 22)

In [24]:
# Hold out TEST_SIZE of the rows as a test set, stratified by the target
# (AdoptionSpeed); SEED makes the split reproducible.
train, test = train_test_split(
    dataset,
    test_size=TEST_SIZE,
    stratify=dataset['AdoptionSpeed'],
    random_state=SEED,
)

## Categorical variables

PetID - categorical - ID --> Should be dropped

AdoptionSpeed - categorical - target variable

Type - categorical - 1 = Cat, 2 = Dog

Name - categorical - Name of pet

Breed1 - categorical - see BreedLabels dictionary

Breed2 - categorical - see BreedLabels dictionary

Gender - categorical - 1 = Male, 2 = Female, 3 = Mixed (used for groups of pets)

Color1 - categorical - see ColorLabels dictionary

Color2 - categorical - see ColorLabels dictionary

Color3 - categorical - see ColorLabels dictionary

MaturitySize - categorical - 1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified

FurLength - categorical - 1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified

Vaccinated - categorical - 1 = Yes, 2 = No, 3 = Not Sure

Dewormed - categorical - 1 = Yes, 2 = No, 3 = Not Sure

Sterilized - categorical - 1 = Yes, 2 = No, 3 = Not Sure

Health - categorical - 1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified

State - categorical - see StateLabels dictionary

RescuerID - categorical - ID

Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.

## Quantitative variables

Age - Age of pet when listed, in months

Quantity - Number of pets represented in profile

Fee - Adoption fee (0 = Free)

VideoAmt - Total uploaded videos for this pet

PhotoAmt - Total uploaded photos for this pet

In [25]:
# List the column names currently present in `dataset`.
dataset.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt',
       'Description', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [26]:
# Show the dtype of every column in `dataset`.
dataset.dtypes

Type               int64
Name              object
Age                int64
Breed1             int64
Breed2             int64
Gender             int64
Color1             int64
Color2             int64
Color3             int64
MaturitySize       int64
FurLength          int64
Vaccinated         int64
Dewormed           int64
Sterilized         int64
Health             int64
Quantity           int64
Fee                int64
State              int64
VideoAmt           int64
Description       object
PhotoAmt         float64
AdoptionSpeed      int64
dtype: object

In [28]:
# Feature lists by kind: categorical/text columns vs. numeric columns.
# - PetID and RescuerID are not listed: they are dropped from `dataset` right
#   after loading, so selecting them would raise a KeyError.
# - AdoptionSpeed is not listed: it is the target, not an input feature.
# - Quantity is a count, so it appears only once, under the numeric features.
# Together the two lists cover every remaining non-target column exactly once.
char_feats = ['Type', 'Name', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'State', 'Description']
numeric_feats = ['Age', 'Quantity', 'Fee', 'PhotoAmt', 'VideoAmt']

# FE

## Name
Se genera una columna para identificar si tiene o no nombre

In [39]:
# Binary flag: 1 when Name is present (non-null), 0 when it is missing.
# NOTE(review): placeholder names such as "No Name Yet" are non-null and therefore
# count as 1 — confirm that this is the intended meaning of the feature.
# NOTE(review): in the file order this column is added to `dataset` after the
# train/test split, so `train`/`test` presumably do not contain it — confirm.
dataset['HasName'] = dataset['Name'].notna().astype('int64')

## Breed

In [41]:
# Display the two breed code columns side by side.
dataset[['Breed1', 'Breed2']]

Unnamed: 0,Breed1,Breed2
0,299,0
1,265,0
2,307,0
3,307,0
4,307,0
...,...,...
14988,266,0
14989,265,264
14990,265,266
14991,266,0


In [42]:
# Count how often each Breed2 code occurs.
# NOTE(review): code 0 is by far the most frequent; presumably it means
# "no second breed" — confirm against the BreedLabels dictionary.
dataset['Breed2'].value_counts()

Breed2
0      10762
307     1727
266      599
265      321
299      138
       ...  
237        1
182        1
204        1
146        1
279        1
Name: count, Length: 135, dtype: int64

In [29]:
# Input columns for a first baseline model: every non-target column except the
# free-text ones (Name, Description).
features = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
    'Quantity', 'Fee', 'State',
    'VideoAmt', 'PhotoAmt',
]

# Target column to predict.
label = 'AdoptionSpeed'

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,Description,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,2,2,1,1,100,41326,0,Nibble is a 3+ month old ball of cuteness. He ...,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,3,3,1,1,0,41401,0,I just found it alone yesterday near my apartm...,2.0,0


In [11]:
# Display the confusion matrix built by the `plot_confusion_matrix` helper imported from utils
# (presumably true test labels vs. model predictions — confirm against the helper).
# NOTE(review): no visible cell defines `y_test` or `y_pred`, and the recorded run of
# this cell failed with NameError. Both must be created (e.g. by a model training /
# prediction step) before this cell can run on a fresh kernel.
display(plot_confusion_matrix(y_test,y_pred))

NameError: name 'y_test' is not defined