In [196]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import strptime
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

In [197]:
# Load the sample submission file and preview its first rows to see the
# columns a submission is expected to contain.
example_sub = pd.read_csv('i-hackathon-geam-insightlab/sample_submission.csv')
example_sub.head()

Unnamed: 0,animal_id,outcome_type
0,A725807,Adoption
1,A713176,Adoption
2,A752992,Adoption
3,A753923,Adoption
4,A763405,Adoption


In [198]:
# Read the training data into a DataFrame.
train_df = pd.read_csv('i-hackathon-geam-insightlab/train.csv')

In [199]:
# Read the test data (the rows to predict) into a DataFrame.
test_df = pd.read_csv('i-hackathon-geam-insightlab/test.csv')

### Tratando dados

In [200]:
# Remove the columns that are not used as features: `name` from both frames
# and, additionally, `outcome_subtype` from the training frame.
train_df = train_df.drop(columns=['name', 'outcome_subtype'])
test_df = test_df.drop(columns=['name'])

In [201]:
# Show column names, non-null counts and dtypes of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25570 entries, 0 to 25569
Data columns (total 7 columns):
animal_id           25570 non-null object
animal_type         25570 non-null object
breed               25570 non-null object
color               25570 non-null object
arrival_datetime    25570 non-null object
age_upon_outcome    25568 non-null object
sex_upon_outcome    25569 non-null object
dtypes: object(7)
memory usage: 1.4+ MB


In [202]:
# Build the animal_type -> integer code table from the training data; codes
# follow the order in which each type first appears.
animal_numero = {
    kind: code
    for code, kind in enumerate(train_df['animal_type'].unique())
}
train_df['animal_type'] = train_df['animal_type'].map(animal_numero)
# Discard every training row that has a missing value in any column.
train_df = train_df.dropna()
# Encode the test frame with the same table; a type that is not a key of the
# table is mapped to NaN.
test_df['animal_type'] = test_df['animal_type'].map(animal_numero)

In [203]:
# Integer-encode `breed`; codes follow the order of first appearance in the
# training data.
breed_numero = {
    breed: code
    for code, breed in enumerate(train_df['breed'].unique())
}
train_df['breed'] = train_df['breed'].map(breed_numero)
# Breeds that are not keys of the table are mapped to NaN in the test frame.
# NOTE(review): those NaNs appear to be what the mean imputer fills further
# down -- confirm that a mean of arbitrary codes is the intended fallback.
test_df['breed'] = test_df['breed'].map(breed_numero)

In [204]:
# Integer-encode `color`; codes follow the order of first appearance in the
# training data.
cor_numero = {
    color: code
    for code, color in enumerate(train_df['color'].unique())
}
train_df['color'] = train_df['color'].map(cor_numero)
# Colors that are not keys of the table are mapped to NaN in the test frame.
test_df['color'] = test_df['color'].map(cor_numero)

In [205]:
# Integer-encode `sex_upon_outcome`; codes follow the order of first appearance
# in the training data.
sexo_numero = {
    sex: code
    for code, sex in enumerate(train_df['sex_upon_outcome'].unique())
}
train_df['sex_upon_outcome'] = train_df['sex_upon_outcome'].map(sexo_numero)
# Values that are not keys of the table (including missing ones) become NaN in
# the test frame.
test_df['sex_upon_outcome'] = test_df['sex_upon_outcome'].map(sexo_numero)

#### label

In [206]:
# Encode the target: outcome name -> integer class id, numbered in order of
# first appearance. The table is kept so predictions can be decoded later.
label_numero = {
    outcome: code
    for code, outcome in enumerate(train_df['outcome_type'].unique())
}
train_df['outcome_type'] = train_df['outcome_type'].map(label_numero)

### Idade

In [207]:
# Split each age string at its first space: column 0 of the result is stored as
# `numero` (the count) and column 1 as `tipo` (the unit).
new_train = train_df['age_upon_outcome'].str.split(pat=" ", n=1, expand=True)
train_df['numero'], train_df['tipo'] = new_train[0], new_train[1]

In [208]:
# The count was extracted as text; convert every entry to an integer.
train_df['numero'] = train_df['numero'].map(int)

In [209]:
# Convert each (count, unit) pair into an approximate age in days:
# years count as 365 days, months as 30 and weeks as 7. Any other unit leaves
# the count unchanged.
total_list = [
    numero * 365 if tipo in ('year', 'years')
    else numero * 30 if tipo in ('month', 'months')
    else numero * 7 if tipo in ('week', 'weeks')
    else numero
    for numero, tipo in zip(train_df['numero'], train_df['tipo'])
]

In [210]:
# Overwrite the textual age with the age in days, then discard the two helper
# columns produced by the split.
train_df['age_upon_outcome'] = total_list
train_df = train_df.drop(columns=['numero', 'tipo'])

##### idade teste

In [211]:
# Test rows cannot be dropped, so a missing age is filled with the placeholder
# string '1 day' before parsing.
test_df['age_upon_outcome'] = test_df['age_upon_outcome'].fillna('1 day')

In [212]:
# Split each test age string at its first space: column 0 is stored as `numero`
# (the count) and column 1 as `tipo` (the unit).
new_test = test_df['age_upon_outcome'].str.split(pat=" ", n=1, expand=True)
test_df['numero'], test_df['tipo'] = new_test[0], new_test[1]

# The count was extracted as text; convert every entry to an integer.
test_df['numero'] = test_df['numero'].map(int)

In [213]:
# Convert each test (count, unit) pair into an approximate age in days:
# years count as 365 days, months as 30 and weeks as 7. Any other unit leaves
# the count unchanged.
total_list = [
    numero * 365 if tipo in ('year', 'years')
    else numero * 30 if tipo in ('month', 'months')
    else numero * 7 if tipo in ('week', 'weeks')
    else numero
    for numero, tipo in zip(test_df['numero'], test_df['tipo'])
]

In [214]:
# Overwrite the textual age with the age in days, then discard the two helper
# columns produced by the split.
test_df['age_upon_outcome'] = total_list
test_df = test_df.drop(columns=['numero', 'tipo'])

In [215]:
# train_df = train_df.drop(labels = ['age_upon_outcome'], axis=1)
# test_df = test_df.drop(labels = ['age_upon_outcome'], axis=1)

#### DateTime

In [216]:
# Parse the arrival timestamps, read from CSV as text, into datetime values.
train_df = train_df.assign(
    arrival_datetime=pd.to_datetime(train_df['arrival_datetime'])
)
test_df = test_df.assign(
    arrival_datetime=pd.to_datetime(test_df['arrival_datetime'])
)

In [217]:
# Expand the arrival timestamp into integer calendar features.
# Despite its name, `hour` holds the minutes elapsed since midnight
# (hour * 60 + minute).
train_df['year'] = train_df['arrival_datetime'].dt.year.astype('int64')
train_df['month'] = train_df['arrival_datetime'].dt.month.astype('int64')
train_df['day'] = train_df['arrival_datetime'].dt.day.astype('int64')
train_df['hour'] = (
    train_df['arrival_datetime'].dt.hour * 60
    + train_df['arrival_datetime'].dt.minute
).astype('int64')

# Same features for the test frame.
test_df['year'] = test_df['arrival_datetime'].dt.year.astype('int64')
test_df['month'] = test_df['arrival_datetime'].dt.month.astype('int64')
test_df['day'] = test_df['arrival_datetime'].dt.day.astype('int64')
test_df['hour'] = (
    test_df['arrival_datetime'].dt.hour * 60
    + test_df['arrival_datetime'].dt.minute
).astype('int64')

In [218]:
# The raw timestamp has been expanded into year/month/day/hour, so remove it.
train_df = train_df.drop(columns=['arrival_datetime'])
test_df = test_df.drop(columns=['arrival_datetime'])

In [219]:
# Preview the fully encoded training frame.
train_df.head()

Unnamed: 0,animal_id,animal_type,breed,color,age_upon_outcome,sex_upon_outcome,outcome_type,year,month,day,hour
0,A682590,0,0,0,150,0,0,2014,7,4,1027
1,A745909,0,1,0,365,1,0,2017,3,30,987
2,A741468,0,2,1,2920,1,0,2017,1,8,1112
3,A715697,0,3,2,30,0,0,2015,11,13,1033
4,A723449,1,4,3,365,2,1,2016,4,10,1103


In [220]:
# Preview the encoded test frame.
test_df.head()

Unnamed: 0,animal_id,animal_type,breed,color,age_upon_outcome,sex_upon_outcome,year,month,day,hour
0,A725807,0,16.0,85.0,5110,1.0,2016,5,8,743
1,A713176,1,51.0,17.0,120,2.0,2016,1,9,1067
2,A752992,1,4.0,22.0,28,4.0,2017,7,5,877
3,A753923,3,68.0,4.0,14,3.0,2017,7,15,916
4,A763405,0,5.0,4.0,30,2.0,2017,12,9,0


### Create dataset

In [221]:
# Feature frames: everything except the identifier (and, for training, the
# target column).
train_aux = train_df.drop(columns=['animal_id', 'outcome_type'])
test_aux = test_df.drop(columns='animal_id')

In [222]:
# Save the processed feature frames to CSV files in the working directory
# (the DataFrame index is written as well, since `index` is left at its default).
train_aux.to_csv("trainf.csv")
test_aux.to_csv("testf.csv")

### Pegar treino e teste

In [223]:
# Plain NumPy arrays for the estimator: feature matrices for train and test,
# and the encoded target vector.
X_train, X_test = train_aux.values, test_aux.values
Y_train = train_df['outcome_type'].values

In [224]:
# Fit a mean imputer on the training matrix and use it to fill the NaNs of the
# test matrix; X_train itself is not transformed.
# NOTE(review): the NaNs in X_test appear to come from category values missing
# from the training code tables, so they are filled with a mean of integer
# codes -- confirm this is intended.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan).fit(X_train)
X_test = imp_mean.transform(X_test)

In [187]:
# One-vs-rest wrapper around an XGBoost classifier (max_depth=4, n_jobs=-1).
# Alternatives previously noted here: GradientBoostingClassifier() and
# RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42).
# NOTE(review): the saved output under this cell shows a
# GradientBoostingClassifier estimator and the execution counter drops back to
# 187 here, so the stored output looks stale -- re-run the notebook top to
# bottom before trusting it.
clf = OneVsRestClassifier(estimator=XGBClassifier(max_depth=4, n_jobs=-1))
clf.fit(X_train, Y_train)

OneVsRestClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.1,
                                                         loss='deviance',
                                                         max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         n_estimato

In [188]:
# Predict the encoded outcome class for every test row.
y_pred = clf.predict(X_test)

In [189]:
# Start the submission frame from the test identifiers.
to_submit = test_df['animal_id'].to_frame()

In [190]:
# Attach the predicted class ids as the `outcome_type` column.
to_submit = to_submit.assign(outcome_type=y_pred)

In [191]:
# Preview the submission while `outcome_type` still holds integer class ids.
to_submit.head()

Unnamed: 0,animal_id,outcome_type
0,A725807,3
1,A713176,2
2,A752992,2
3,A753923,2
4,A763405,2


In [192]:
label_inv = {v:k for k,v in label_numero.items()}

In [193]:
# Display the class id -> outcome name table.
label_inv

{0: 'Adoption',
 1: 'Euthanasia',
 2: 'Transfer',
 3: 'Return to Owner',
 4: 'Died',
 5: 'Disposal',
 6: 'Missing',
 7: 'Relocate',
 8: 'Rto-Adopt'}

In [194]:
# Translate the predicted integer class ids back to outcome names.
# The ids are read from `y_pred` rather than from the column being overwritten,
# so running this cell again produces the same result instead of mapping the
# already-decoded strings (which are not keys of `label_inv`) to NaN.
to_submit['outcome_type'] = pd.Series(y_pred, index=to_submit.index).map(label_inv)
to_submit.head()

Unnamed: 0,animal_id,outcome_type
0,A725807,Return to Owner
1,A713176,Transfer
2,A752992,Transfer
3,A753923,Transfer
4,A763405,Transfer


In [195]:
# Write the submission file; index=False keeps the DataFrame index out of it.
to_submit.to_csv("submite.csv",index=False)