In [1]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
%pylab inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('aac_intakes_outcomes.csv')
df['cat_or_dog'] = False

def valid_animal(x):
    valid = ['Dog','Cat']
    if x in valid:
        return True
    return False

df['cat_or_dog'] = df['animal_type'].apply(valid_animal)



mask = (df['cat_or_dog'] == True)
df = df[mask]

colors = df['color'].value_counts()>100
color_list = df['color'].value_counts()[colors].index.tolist()
df['color_bin'] = 'other'
def colortest(x):
    if x in color_list:
        return x
    return 'other'
df['color_bin'] = df['color'].apply(colortest)


In [3]:
categorical_columns_to_convert = ['intake_type', 'sex_upon_intake', 'intake_weekday', 'breed', 'color_bin', 'intake_condition', 'sex_upon_outcome', 'outcome_weekday']

new_categorical_columns = []

for column in categorical_columns_to_convert:
    df[column] = df[column].astype('category')
    new_column_name = column + "_coded"
    new_categorical_columns.append(new_column_name)
    df[new_column_name] = df[column].cat.codes

categorical_columns = ['intake_month', 'intake_hour', 'outcome_month', 'outcome_hour', 'dob_month']

all_categorical_columns = new_categorical_columns + categorical_columns                              
                                
numerical_columns = ['dob_year', 'age_upon_intake_(days)', 'age_upon_outcome_(days)', 'outcome_year', 'intake_year', 'time_in_shelter_days']

In [4]:
df = df[df['outcome_type'] != 'Return to Owner']

df['target'] = 0
mask = (df['outcome_type']=='Adoption')
df['target'][mask] = 1.0

def formula(numerical, categorical):
    formula = 'target ~ 0'
    for numerical_feature in numerical:
        formula += ' + Q("' + numerical_feature + '")'
    for categorical_feature in categorical:
        formula += ' + C(' + categorical_feature + ')'
    return formula
    
formula = formula(numerical_columns, all_categorical_columns)
formula

'target ~ 0 + Q("dob_year") + Q("age_upon_intake_(days)") + Q("age_upon_outcome_(days)") + Q("outcome_year") + Q("intake_year") + Q("time_in_shelter_days") + C(intake_type_coded) + C(sex_upon_intake_coded) + C(intake_weekday_coded) + C(breed_coded) + C(color_bin_coded) + C(intake_condition_coded) + C(sex_upon_outcome_coded) + C(outcome_weekday_coded) + C(intake_month) + C(intake_hour) + C(outcome_month) + C(outcome_hour) + C(dob_month)'

In [5]:

Y, X = dmatrices(formula, df, return_type='dataframe')
y = Y['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
model = LogisticRegression()
result = model.fit(X_train, y_train)
prediction_train = model.predict(X_train)
prediction_test = model.predict(X_test)

print 'Training accuracy = ', metrics.accuracy_score(y_train, prediction_train)
print 'Test accuracy = ', metrics.accuracy_score(y_test, prediction_test)

negative_examples_in_test = len(y_test[y_test==0])
total_examples_in_test = len(y_test)
print 'Baseline accuracy =', negative_examples_in_test * 1.0 / total_examples_in_test

weights = Series(model.coef_[0],
                 index=X.columns.values)
weights.sort_values()

Training accuracy =  0.845388487842
Test accuracy =  0.843315419137
Baseline accuracy = 0.44739320738


C(sex_upon_outcome_coded)[T.4]   -1.563859
C(sex_upon_intake_coded)[T.4]    -1.563859
C(sex_upon_outcome_coded)[T.1]   -1.356453
C(breed_coded)[T.1768]           -0.677914
C(intake_type_coded)[0]          -0.605772
C(outcome_month)[T.4]            -0.605316
C(intake_condition_coded)[T.7]   -0.457303
C(sex_upon_intake_coded)[T.3]    -0.443738
C(outcome_hour)[T.9]             -0.431439
C(breed_coded)[T.1521]           -0.429523
C(breed_coded)[T.910]            -0.429001
C(intake_hour)[T.20]             -0.412278
C(breed_coded)[T.37]             -0.398839
C(breed_coded)[T.74]             -0.371884
C(breed_coded)[T.63]             -0.353738
C(intake_type_coded)[2]          -0.340770
C(outcome_month)[T.5]            -0.340345
C(breed_coded)[T.1260]           -0.335290
C(breed_coded)[T.1290]           -0.320881
C(outcome_month)[T.3]            -0.315908
C(breed_coded)[T.741]            -0.308012
C(intake_condition_coded)[T.2]   -0.277560
C(color_bin_coded)[T.28]         -0.273734
C(breed_cod

In [18]:
df_dogs = df[df['animal_type'] == 'Dog']

Y_dog, X_dog = dmatrices(formula, df_dogs, return_type='dataframe')
y_dog = Y_dog['target'].values
X_train_dog, X_test_dog, y_train_dog, y_test_dog = train_test_split(X_dog, y_dog, test_size=0.3, random_state=1)
model_dog = LogisticRegression()
result = model_dog.fit(X_train_dog, y_train_dog)
prediction_train_dog = model_dog.predict(X_train_dog)
prediction_test_dog = model_dog.predict(X_test_dog)

print 'Training accuracy = ', metrics.accuracy_score(y_train_dog, prediction_train_dog)
print 'Test accuracy = ', metrics.accuracy_score(y_test_dog, prediction_test_dog)

negative_examples_in_test_dog = len(y_test_dog[y_test_dog==0])
total_examples_in_test_dog = len(y_test_dog)
print 'Baseline accuracy =', negative_examples_in_test_dog * 1.0 / total_examples_in_test_dog

weights_dog = Series(model_dog.coef_[0],
                 index=X_dog.columns.values)
weights_dog.sort_values()

 Training accuracy =  0.820387269085
Test accuracy =  0.815226422933
Baseline accuracy = 0.360303282094


C(intake_type_coded)[0]          -1.013525
C(breed_coded)[T.1768]           -0.952154
C(sex_upon_outcome_coded)[T.4]   -0.853027
C(sex_upon_intake_coded)[T.4]    -0.853027
C(sex_upon_outcome_coded)[T.1]   -0.740447
C(intake_condition_coded)[T.7]   -0.604588
C(breed_coded)[T.1767]           -0.524429
C(breed_coded)[T.427]            -0.498088
C(breed_coded)[T.1290]           -0.496213
C(breed_coded)[T.881]            -0.458304
C(breed_coded)[T.74]             -0.434310
C(breed_coded)[T.1521]           -0.426801
C(breed_coded)[T.1627]           -0.425386
C(breed_coded)[T.1260]           -0.422119
C(color_bin_coded)[T.28]         -0.397715
C(breed_coded)[T.1569]           -0.395820
C(breed_coded)[T.63]             -0.391012
C(breed_coded)[T.1069]           -0.386118
C(color_bin_coded)[T.35]         -0.364226
C(intake_condition_coded)[T.2]   -0.357381
C(breed_coded)[T.742]            -0.356809
C(intake_weekday_coded)[T.5]     -0.334552
C(breed_coded)[T.791]            -0.333347
C(outcome_h

In [19]:
df_cats = df[df['animal_type'] == 'Cat']

Y_cat, X_cat = dmatrices(formula, df_cats, return_type='dataframe')
y_cat = Y_cat['target'].values
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X, y, test_size=0.3, random_state=1)
model_cat = LogisticRegression()
result = model_cat.fit(X_train_cat, y_train_cat)
prediction_train_cat = model_cat.predict(X_train_cat)
prediction_test_cat = model_cat.predict(X_test_cat)

print 'Training accuracy = ', metrics.accuracy_score(y_train_cat, prediction_train_cat)
print 'Test accuracy = ', metrics.accuracy_score(y_test_cat, prediction_test_cat)

negative_examples_in_test_cat = len(y_test_cat[y_test_cat==0])
total_examples_in_test_cat = len(y_test_cat)
print 'Baseline accuracy =', negative_examples_in_test_cat * 1.0 / total_examples_in_test_cat

weights_cat = Series(model_cat.coef_[0],
                 index=X_cat.columns.values)
weights_cat.sort_values()

Training accuracy =  0.883646358223
Test accuracy =  0.876855480347
Baseline accuracy = 0.546253414084


C(sex_upon_intake_coded)[T.4]    -2.099189
C(sex_upon_outcome_coded)[T.4]   -2.099189
C(sex_upon_outcome_coded)[T.1]   -1.406018
C(intake_condition_coded)[T.1]   -1.299232
C(outcome_month)[T.4]            -0.837268
C(breed_coded)[T.904]            -0.787540
C(sex_upon_intake_coded)[T.3]    -0.771217
C(breed_coded)[T.1792]           -0.578247
C(intake_type_coded)[2]          -0.534343
C(breed_coded)[T.909]            -0.506617
C(breed_coded)[T.910]            -0.501694
C(intake_hour)[T.20]             -0.498565
C(outcome_month)[T.3]            -0.491721
C(breed_coded)[T.72]             -0.485878
C(dob_month)[T.12]               -0.410691
C(breed_coded)[T.905]            -0.407730
C(intake_type_coded)[0]          -0.363543
C(outcome_month)[T.5]            -0.348175
C(outcome_hour)[T.9]             -0.346991
C(intake_condition_coded)[T.5]   -0.346749
C(breed_coded)[T.1826]           -0.342111
C(outcome_weekday_coded)[T.6]    -0.332742
C(color_bin_coded)[T.64]         -0.315360
C(outcome_w

In [None]:
df['sex_upon_outcome']