In [8]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
%pylab inline

# We will ignore some silly warnings that pop up due to scikit-learn
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


# Adoption Data

In [87]:
df = pd.read_csv('aac_intakes_outcomes.csv')


#Seperate the age into more reasonable buckets
def seperation(x):
    y = x.split(' ')
    return (y[0])
df['just_age'] = df['age_upon_outcome'].apply(seperation)

#Now seperate the breed into bins based off frequency, arbitrarily set threshold at 100
num = df['breed'].value_counts()>100
a = df['breed'].value_counts()[num].index.tolist()
df['breed_bin'] = 'other'
def breedtest(x):
    if x in a:
        return x
    return 'other'
df['breed_bin'] = df['breed'].apply(breedtest)
df['breed_bin'].value_counts()

#Get breeds and colors where the count is greater than 100, more relevant I guess this way


Domestic Shorthair Mix                23423
other                                 13082
Pit Bull Mix                           6256
Chihuahua Shorthair Mix                4831
Labrador Retriever Mix                 4789
Domestic Medium Hair Mix               2326
German Shepherd Mix                    1950
Bat Mix                                1381
Domestic Longhair Mix                  1248
Australian Cattle Dog Mix              1099
Siamese Mix                             996
Bat                                     827
Dachshund Mix                           811
Boxer Mix                               683
Miniature Poodle Mix                    662
Border Collie Mix                       662
Catahoula Mix                           480
Raccoon Mix                             471
Rat Terrier Mix                         469
Australian Shepherd Mix                 468
Yorkshire Terrier Mix                   447
Siberian Husky Mix                      441
Jack Russell Terrier Mix        

In [88]:
# Setting up regressions using naive-bayes

# Should we filter out return to owner? seems logical because we want to look at actual strays, not ones that just got lost for a few days

mask = (df['outcome_type'] != 'Return to Owner')
df1 = df[mask]
#df1['outcome_type'].value_counts()

# df1 now excludes all pets that were returned to owner, future analysis may include ones that were returned 
# after an extended period of time, for now it will only include long-term strays
df1['target_adopt'] = 0
mask = (df1['outcome_type']=='Adoption')
df1['target_adopt'][mask] = 1.0
#df1['target_adopt'].value_counts()

#Important to use categorical, most factors seem categorical except time in shelter (days), will qcut
df1['time_spent_shelter']= pd.qcut(df['time_in_shelter_days'],5)
#df1['time_spent_shelter'].value_counts()

#df1['DOB_bucket']= pd.qcut(df1['date_of_birth'],5)
df1['DOB_DT'] = pd.to_datetime(df['date_of_birth'])
#df1['DOB_DT'][:5]
df1['DOB_bucket'] = pd.qcut(df1['DOB_DT'],5)
#df1['DOB_bucket'].value_counts()

df1['age_upon_outcome_numeric']= pd.to_numeric(df1['just_age'])
df1['age_bucket'] = pd.qcut(df1['age_upon_outcome_numeric'],5)
df1['age_bucket'].value_counts()

(1.0, 2.0]       19226
(-0.001, 1.0]    16723
(5.0, 25.0]      11168
(3.0, 5.0]        9187
(2.0, 3.0]        8577
Name: age_bucket, dtype: int64

In [89]:
print df1.columns.values

#Going to use age_upon_outcome, date_of_birth, outcome_type, sex_upon_outocome, outcome_monthyear, outcome_weekday, outcome_hour, animal_type
#breed,color, intake_coundition, time_in_shelter_days

['age_upon_outcome' 'animal_id_outcome' 'date_of_birth' 'outcome_subtype'
 'outcome_type' 'sex_upon_outcome' 'age_upon_outcome_(days)'
 'age_upon_outcome_(years)' 'age_upon_outcome_age_group' 'outcome_datetime'
 'outcome_month' 'outcome_year' 'outcome_monthyear' 'outcome_weekday'
 'outcome_hour' 'outcome_number' 'dob_year' 'dob_month' 'dob_monthyear'
 'age_upon_intake' 'animal_id_intake' 'animal_type' 'breed' 'color'
 'found_location' 'intake_condition' 'intake_type' 'sex_upon_intake'
 'count' 'age_upon_intake_(days)' 'age_upon_intake_(years)'
 'age_upon_intake_age_group' 'intake_datetime' 'intake_month' 'intake_year'
 'intake_monthyear' 'intake_weekday' 'intake_hour' 'intake_number'
 'time_in_shelter' 'time_in_shelter_days' 'just_age' 'breed_bin'
 'target_adopt' 'time_spent_shelter' 'DOB_DT' 'DOB_bucket'
 'age_upon_outcome_numeric' 'age_bucket']


In [90]:
#For simplification, ignoring breed and color for now
categorical_columns = ['age_bucket','DOB_bucket','sex_upon_outcome','animal_type',
                       'intake_condition','time_spent_shelter','breed_bin']
df_dummies = pd.get_dummies(df1[categorical_columns],
                            prefix=categorical_columns,
                            columns=categorical_columns)
dummy_column_names = df_dummies.columns.values
dummy_column_names[:10]


array(['age_bucket_(-0.001, 1.0]', 'age_bucket_(1.0, 2.0]',
       'age_bucket_(2.0, 3.0]', 'age_bucket_(3.0, 5.0]',
       'age_bucket_(5.0, 25.0]',
       'DOB_bucket_(1991-12-10 23:59:59.999999999, 2012-10-24]',
       'DOB_bucket_(2012-10-24, 2014-04-03]',
       'DOB_bucket_(2014-04-03, 2015-04-18]',
       'DOB_bucket_(2015-04-18, 2016-04-24]',
       'DOB_bucket_(2016-04-24, 2018-03-25]'], dtype=object)

In [91]:
#Concatanate the columns and throw them together, then format
df2 = pd.concat([df1, df_dummies], axis=1)
formula = 'target_adopt ~ 0 + {}'.format(' + '.join(['Q("{}")'.format(x) for x in dummy_column_names]))
print formula

target_adopt ~ 0 + Q("age_bucket_(-0.001, 1.0]") + Q("age_bucket_(1.0, 2.0]") + Q("age_bucket_(2.0, 3.0]") + Q("age_bucket_(3.0, 5.0]") + Q("age_bucket_(5.0, 25.0]") + Q("DOB_bucket_(1991-12-10 23:59:59.999999999, 2012-10-24]") + Q("DOB_bucket_(2012-10-24, 2014-04-03]") + Q("DOB_bucket_(2014-04-03, 2015-04-18]") + Q("DOB_bucket_(2015-04-18, 2016-04-24]") + Q("DOB_bucket_(2016-04-24, 2018-03-25]") + Q("sex_upon_outcome_Intact Female") + Q("sex_upon_outcome_Intact Male") + Q("sex_upon_outcome_Neutered Male") + Q("sex_upon_outcome_Spayed Female") + Q("sex_upon_outcome_Unknown") + Q("animal_type_Bird") + Q("animal_type_Cat") + Q("animal_type_Dog") + Q("animal_type_Other") + Q("intake_condition_Aged") + Q("intake_condition_Feral") + Q("intake_condition_Injured") + Q("intake_condition_Normal") + Q("intake_condition_Nursing") + Q("intake_condition_Other") + Q("intake_condition_Pregnant") + Q("intake_condition_Sick") + Q("time_spent_shelter_(-0.001, 0.844]") + Q("time_spent_shelter_(0.844, 4.0

In [92]:
#Make the matrix
Y, X = dmatrices(formula, df2, return_type='dataframe')
y = Y['target_adopt'].values
y[:10]

array([ 0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.])

In [93]:
from sklearn import naive_bayes
model = naive_bayes.MultinomialNB()
model.fit(X,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [94]:
print 'Prediction'
print model.predict(X[:10])
print 'Actual'
print y[:10]

Prediction
[ 1.  1.  1.  0.  1.  0.  0.  0.  1.  1.]
Actual
[ 0.  0.  1.  0.  1.  0.  0.  0.  0.  1.]


In [95]:
from sklearn import metrics
prediction_train = model.predict(X)
print metrics.accuracy_score(y, prediction_train)

0.822505818344


In [96]:
print 'Prior probability for the negative class is',
print exp(model.class_log_prior_[0])
print 'Prior probability for the positive class is',
print exp(model.class_log_prior_[1])

Prior probability for the negative class is 0.482221297452
Prior probability for the positive class is 0.517778702548


In [97]:
feature_importances = abs(model.feature_log_prob_[1] - model.feature_log_prob_[0])
feature_importance_series = Series(feature_importances, index=X.columns.values)
top_10_feature_indices = feature_importance_series.sort_values(ascending=False)[:10].index.values
inter_class_differences = model.feature_log_prob_[1] - model.feature_log_prob_[0]
new_feature_importance_series = Series(inter_class_differences, index=X.columns.values)

new_feature_importance_series[top_10_feature_indices]


Q("breed_bin_Bat Mix")                    -7.302396
Q("breed_bin_Bat")                        -6.790123
Q("breed_bin_Raccoon Mix")                -6.228088
Q("breed_bin_Raccoon")                    -5.534941
Q("breed_bin_Opossum Mix")                -5.264066
Q("breed_bin_Opossum")                    -4.725070
Q("sex_upon_outcome_Unknown")             -4.623029
Q("time_spent_shelter_(-0.001, 0.844]")   -3.479858
Q("animal_type_Other")                    -2.994655
Q("breed_bin_Guinea Pig Mix")             -2.952179
dtype: float64