# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

Now we take a look at the labels, just to understand what these are

In [3]:
breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [4]:
color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


In [5]:
state

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan
5,41324,Melaka
6,41332,Negeri Sembilan
7,41335,Pahang
8,41330,Perak
9,41380,Perlis


And now we are ready to deal with the *original* dataset...

In [6]:
original_df = pd.read_csv('../data/train.csv')

In [7]:
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed', 'PID'],
      dtype='object')

In [8]:
original_df.describe()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,AdoptionSpeed,PID
count,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0
mean,1.454734,10.520412,265.469854,74.388868,1.779059,2.230675,3.236912,1.856738,1.860518,1.460971,1.72973,1.566528,1.912115,1.036666,1.584011,20.80996,41345.994613,2.5189,7477.025799
std,0.49797,18.374027,60.12149,123.43401,0.684763,1.743985,2.748595,2.974465,0.547535,0.593843,0.670791,0.701482,0.564041,0.198228,1.488348,78.397243,32.409109,1.176018,4310.921553
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0,0.0
25%,1.0,2.0,265.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,3768.25
50%,1.0,3.0,266.0,0.0,2.0,2.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,7473.5
75%,2.0,12.0,307.0,188.0,2.0,3.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,41401.0,4.0,11200.75
max,2.0,255.0,307.0,307.0,3.0,7.0,7.0,7.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,3000.0,41415.0,4.0,14992.0


In [9]:
original_df.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed,PID
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,Nibble is a 3+ month old ball of cuteness. He ...,2,0
1,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,"Good guard dog, very alert, active, obedience ...",2,3
2,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,This handsome yet cute boy is up for adoption....,2,4
3,2,3,266,0,2,5,6,0,2,1,2,2,2,1,1,0,41326,This is a stray kitten that came to my house. ...,2,5
4,2,12,264,264,1,1,0,0,2,3,2,2,3,1,1,300,41326,anyone within the area of ipoh or taiping who ...,1,6


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [10]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        #df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        #df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        #df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        #df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        #df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        #df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        #df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        #df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        #df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        #df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        #df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        #df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        #df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

Load the data...

In [11]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

Create the model and evaluate it

In [12]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it couldn be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

results = pd.DataFrame(columns=('clf', 'best_acc'))

In [13]:
# check class balance
y_train.value_counts(normalize=True)

4.0    0.277035
2.0    0.272715
3.0    0.217902
1.0    0.206156
0.0    0.026191
Name: AdoptionSpeed, dtype: float64

## Baseline
**Best Decision Tree accuracy:  0.3519622095560508**

## Entrenamientos

In [14]:
from sklearn.tree import DecisionTreeClassifier as DT
start_time = time.time()
tree_param = {'criterion':('gini', 'entropy'),
              'min_samples_leaf':(1, 2, 5),
              'min_samples_split':(2, 3, 5, 10, 50, 100),
              'min_impurity_split':(1,2, 3, 5, 10, 50, 100),
              'max_depth':[1,10,100,1000],
              'presort':[True, False]}
tree = DT(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=5, iid=False, n_jobs=-1)
tree_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_tree_clf = tree_clf.best_estimator_
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])
print(f'Seconds: {time.time() - start_time}')

Best Decision Tree accuracy:  0.3719458387521687
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=2,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, presort=True,
                       random_state=42, splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=2,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, presort=True,
                       random_state=42, splitter='best')
Seconds: 38.965412855148315




In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
start_time = time.time()
knn_param = {
    'n_neighbors': [5,10,20,30,40,50],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [1,2,3,4,5,10,20,30,40,50]
}

knn = KNeighborsClassifier()
knn_clf = GridSearchCV(knn, knn_param, scoring='accuracy', cv=5, iid=False, n_jobs=-1)
knn_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_knn_clf = knn_clf.best_estimator_
print('Best KNN accuracy: ', knn_clf.best_score_)
print(best_knn_clf)
results = results.append({'clf': best_knn_clf, 'best_acc': knn_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])
print(f'Seconds: {time.time() - start_time}')

Best KNN accuracy:  0.351816255320066
KNeighborsClassifier(algorithm='brute', leaf_size=1, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                     weights='uniform')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=2,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, presort=True,
                       random_state=42, splitter='best')
Seconds: 93.81157088279724


In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
start_time = time.time()
rfc_param = {
    'n_estimators': [10, 100, 1000,5, 50, 500],
    'criterion': ['gini','entropy'],
    'max_depth': [1, 10, 100, 1000,5, 50, 500],
    'min_samples_split': [2, 5, 10, 100]
}

rfc = RandomForestClassifier(random_state=0)
rfc_clf = GridSearchCV(rfc, rfc_param, scoring='accuracy', cv=5, iid=False, n_jobs=-1)
rfc_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_rfc_clf = rfc_clf.best_estimator_
print('Best Random Forest accuracy: ', rfc_clf.best_score_)
print(best_rfc_clf)
results = results.append({'clf': best_rfc_clf, 'best_acc': rfc_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])
print(f'Seconds: {time.time() - start_time}')

Best Random Forest accuracy:  0.39300522928801646
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random

In [25]:
from xgboost import XGBClassifier

In [27]:
start_time = time.time()
xgb_param = {'learning_rate':[1e-2,1e-1,1e0,1e1],
             'n_estimators': [10, 100, 1000],
             'max_depth': [1, 10, 100, 1000]}

xgb = XGBClassifier(random_state=0)
xgb_clf = GridSearchCV(xgb, xgb_param, scoring='accuracy', cv=5, iid=False, n_jobs=-1)
xgb_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_xgb_clf = xgb_clf.best_estimator_
print('Best XGBoost accuracy: ', xgb_clf.best_score_)
print(best_xgb_clf)
results = results.append({'clf': best_xgb_clf, 'best_acc': xgb_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])
print(f'Seconds: {time.time() - start_time}')



Best XGBoost accuracy:  0.386931165769554
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
The best classifier so far is: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
       

**And finally**, we predict the unknown label for the testing set

In [19]:
X.shape, XX.shape

((10582, 18), (4411, 18))

In [30]:
results

Unnamed: 0,clf,best_acc
0,"DecisionTreeClassifier(class_weight=None, crit...",0.371946
1,"KNeighborsClassifier(algorithm='brute', leaf_s...",0.351816
2,"(DecisionTreeClassifier(class_weight=None, cri...",0.393005
3,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.386931


In [31]:
results[results.best_acc == results.best_acc.max()].clf.item()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [32]:
yy = results[results.best_acc == results.best_acc.max()].clf.item().predict(XX.drop('PID',axis=1))
yy = yy.astype(np.int)

The last thing we do is generating a file that should be *submitted* on kaggle

In [33]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])

In [34]:
submission.to_csv("../data/submission.csv", header=True, index=False)