#  Model preparation 

Before building the model, I had to deal with dates, class imbalance, dummy encoding of categorical variables, standardization, and PCA.

__Dealing with dates__

In [71]:
# days as a feature
# Day of the week of the outcome date (pandas dayofweek: Monday=0 ... Sunday=6).
outcome_date = df['Outcome Date']
df['Outcome day'] = outcome_date.dt.dayofweek

# NOTE(review): `outcome_date` is rebound to the *intake* dates here, so from this
# point on the name no longer refers to outcome dates despite what it says.
outcome_date = df['Intake Date']
df['Intake day'] = outcome_date.dt.dayofweek

print(df.shape)

(9965, 23)


In [72]:
#seasons as a feature
# Read each month from its own date column. Do not use the `outcome_date`
# variable here: the days cell rebinds it to df['Intake Date'], which would make
# 'Outcome month' (and therefore 'Outcome season') a copy of the intake values.
df['Outcome month'] = df['Outcome Date'].dt.month
df['Intake month'] = df['Intake Date'].dt.month

# Season name for each calendar month number (December-February = winter, etc.).
SEASON_BY_MONTH = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'autumn', 10: 'autumn', 11: 'autumn',
}

def month_to_season(month):
    """Return the season name for a month number, or None if it is not 1-12 (e.g. NaN)."""
    return SEASON_BY_MONTH.get(month)

def outcome_to_season(row):
    """Return the season for row['Outcome month'] (None for an unrecognised month)."""
    return month_to_season(row['Outcome month'])

def intake_to_season(row):
    """Return the season for row['Intake month'] (None for an unrecognised month)."""
    return month_to_season(row['Intake month'])

# Map the month columns directly instead of applying a function to every row.
df['Outcome season'] = df['Outcome month'].map(month_to_season)
df['Intake season'] = df['Intake month'].map(month_to_season)

In [73]:
# Build the modelling dataset by dropping the listed columns. `columns=` already
# selects the axis, so no `axis` argument is passed.
dataset = df.drop(columns=['Breed', 'Date Of Birth', 'Days in Shelter', 'Outcome Condition', 'Periods in shelter'])

# Show the class distribution of the target; a bare expression in the middle of
# a cell is not displayed, so print it explicitly.
print(dataset['Outcome Type'].value_counts())

print(dataset.shape)

(9965, 22)


In [74]:
# Columns left out of the predictor matrix (includes the target 'Outcome Type').
excluded_columns = [
    'Color', 'Breeds', 'Outcome Type', 'Outcome month', 'Intake month',
    'Outcome Date', 'Intake Date', 'Outcome Subtype',
]
features = dataset.drop(columns=excluded_columns)
print(features.columns)

# One-hot encode the categorical columns, dropping the first level of each.
df_dum = pd.get_dummies(features, drop_first=True)
df_dum.head(2)

Index(['Type', 'Sex', 'Size', 'Intake Type', 'Intake Subtype',
       'Intake Condition', 'Name_given', 'Breed type', 'Simple color', 'Age',
       'Outcome day', 'Intake day', 'Outcome season', 'Intake season'],
      dtype='object')


Unnamed: 0,Age,Outcome day,Intake day,Type_DOG,Type_OTHER,Sex_Male,Sex_Neutered,Sex_Spayed,Size_LARGE,Size_MED,...,Simple color_Y BRINDLE,Simple color_Y BRINDLE mixed color,Simple color_YELLOW,Simple color_YELLOW mixed color,Outcome season_spring,Outcome season_summer,Outcome season_winter,Intake season_spring,Intake season_summer,Intake season_winter
1,3054,5,4,1,0,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,1
2,70,2,4,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


### Modeling

- Importing the necessary packages

In [75]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import scikitplot as skplt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

__Predictor and predicted variables__

In [76]:
# Encode the dependent variable: integer codes replace the labels in the
# dataset, and the original labels are kept in `definitions` for lookup.
factor = pd.factorize(dataset['Outcome Type'])
codes, definitions = factor
dataset['Outcome Type'] = codes

print(dataset['Outcome Type'].head())
print(definitions)

1    0
2    1
3    1
4    0
5    1
Name: Outcome Type, dtype: int64
Index(['RETURN TO OWNER', 'ADOPTION', 'TRANSFER', 'EUTHANIZE', 'DIED'], dtype='object')


In [77]:
# Target vector: the 'Outcome Type' column of the modelling dataset.
y = dataset.loc[:, 'Outcome Type']

In [78]:
# Baseline accuracy: the share of rows belonging to the most frequent class.
majority_class_count = dataset['Outcome Type'].value_counts().max()
majority_class_count / len(dataset)

0.5029603612644254

In [79]:
# Predictor matrix: the dummy-encoded feature frame (X is an alias, not a copy).
print(df_dum.shape)
X = df_dum

(9965, 264)


__Splitting dataset to train and test__

In [80]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; with imbalanced classes consider
# passing stratify=y so every class is represented proportionally in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

__Standardizing predictor matrices__

In [81]:
# Learn the scaling parameters from the training data only, then apply the same
# transformation to the test data so no test-set statistics are used in fitting.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


__PCA__

In [82]:
#PCA
from sklearn.decomposition import PCA

In [83]:
# pca = PCA(n_components=100)  
# X_train = pca.fit_transform(X_train_ss)  
# X_test = pca.transform(X_test_ss)  

In [84]:
# explained_variance = pca.explained_variance_ratio_
# explained_variance

The explained variance of the principal components is quite low, so PCA is not used in this project.

#### Methods for solving imbalanced classes

In [85]:
from imblearn.over_sampling import RandomOverSampler

In [86]:
# Randomly duplicate minority-class rows of the training set until every class
# has as many rows as the largest one. Only the training data is resampled.
sampler = RandomOverSampler(random_state=1)
# `fit_resample` is the supported imbalanced-learn API; the older `fit_sample`
# alias was deprecated and later removed.
X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

print(pd.Series(y_resampled).value_counts())

4    3505
3    3505
2    3505
1    3505
0    3505
dtype: int64
