In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import warnings

# List the files available under the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Ignore every warning for the rest of the session (presumably to keep cell
# output uncluttered). NOTE(review): this also hides warnings that may matter.
warnings.simplefilter("ignore")

# Gradient Boosting Classifier https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
from sklearn.datasets import make_hastie_10_2 
from sklearn.ensemble import GradientBoostingClassifier

# Plot
import matplotlib.pyplot as plt


In [None]:
# Load the label dictionaries (breed, color, state).
breeds, colors, states = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/breed_labels.csv',
        '../input/color_labels.csv',
        '../input/state_labels.csv',
    )
)

# Load the training set, the test set and the sample submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train/train.csv',
        '../input/test/test.csv',
        '../input/test/sample_submission.csv',
    )
)

# Tag each frame with its origin (appended as a new last column), then stack
# both into a single frame.
train['dataset_type'], test['dataset_type'] = 'train', 'test'
all_data = pd.concat([train, test])

In [None]:
# Display the first row of the training frame.
train.head(n=1)

In [None]:
# Training input data (tr_x) and training labels (tr_y).
# At this stage there is no separation into a validation set and no normalization.
#
# Columns are selected by position in train.csv.
tr_x = train.iloc[:, [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 19, 22]].values

# Select the label column as a 1-D array of shape (n_samples,), which is the
# shape scikit-learn estimators expect for y. Selecting it with a list ([23])
# would produce an (n_samples, 1) column vector that only works because
# scikit-learn ravels it and emits a DataConversionWarning.
tr_y = train.iloc[:, 23].values

# features as input
# (presumably listed in the same order as the positional indices above --
#  verify against train.columns)
# 1. Age - Age of pet when listed, in months
# 2. Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)
# 3. Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)
# 4. Gender - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)
# 5. Color1 - Color 1 of pet (Refer to ColorLabels dictionary)
# 6. Color2 - Color 2 of pet (Refer to ColorLabels dictionary)
# 7. Color3 - Color 3 of pet (Refer to ColorLabels dictionary)
# 8. MaturitySize - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)
# 9. Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
# 10. Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
# 11. Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
# 12. Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
# 13. VideoAmt - Total uploaded videos for this pet
# 14. PhotoAmt - Total uploaded photos for this pet

In [None]:
# Fit a Gradient Boosting Classifier on the full training arrays.
# random_state is fixed so that repeated runs build the same model.
clf = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.5,
    max_depth=12,
    random_state=0,
).fit(tr_x, tr_y)

In [None]:
# Plot the relative importance of every input feature of the fitted model.
# See feature_importances_ in
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
feature_importance = clf.feature_importances_
# Rescale so that the most important feature scores 100.
feature_importance = (feature_importance / feature_importance.max()) * 100.0

# Ascending order: the most important feature gets the highest bar position.
sorted_idx = feature_importance.argsort()
# Bar centres at 0.5, 1.5, 2.5, ...
pos = np.arange(len(sorted_idx)) + 0.5

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Label each bar with the name of the training column it was computed for.
plt.yticks(
    pos,
    train.columns[[2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 19, 22]][sorted_idx],
)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

In [None]:
# Build the test feature matrix from the same positional columns that were
# used for the training features.
test_x = test.iloc[
    :, [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 19, 22]
].values

# Predicted class for every test row; used for the submission.
prediction = clf.predict(test_x)

# Mean accuracy on the training data itself (not a validation score);
# shown as the cell output.
clf.score(tr_x, tr_y)

In [None]:
# Assemble the submission table: one integer AdoptionSpeed per PetID.
# NOTE(review): PetID is taken from the sample submission while the predictions
# follow the row order of `test` -- this assumes both files list the pets in
# the same order; confirm.
submission = pd.DataFrame({
    'PetID': sub.PetID,
    'AdoptionSpeed': list(map(int, prediction)),
})
submission.head()

In [None]:
# Write the submission table to the working directory, leaving out the
# DataFrame index column.
submission.to_csv(
    path_or_buf='submission.csv',
    index=False,
)