In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
%matplotlib inline

In [None]:
# Load the raw shelter outcome records; the CSV path is relative to the notebook's working directory.
shelterDf = pd.read_csv('Austin_Animal_Center_Outcomes.csv')

In [None]:
shelterDf.head()

In [None]:
shelterDf.info()

In [None]:
shelterDf.nunique()

In [None]:
shelterDf['Animal Type'].value_counts()

We're going to focus on cats and dogs

**Clean Data**

There is a lot of subjectivity in determining an animal's breed and color, e.g. many of the colors/breeds have only one animal assigned to that type. Therefore, we are only going to look at the most common breeds (the top 10 or so, selected below by requiring more than 1,000 animals per breed) and drop the color feature.

In [None]:
# Print the ten least frequent values of each subjective feature, under
# its own heading, to show how thin the long tail of categories is.
for heading, column in (('--Breeds--', 'Breed'), ('--Colors--', 'Color')):
    print(heading)
    rarestValues = shelterDf[column].value_counts().tail(10)
    print(rarestValues)

In [None]:
# Attach to every row the number of animals sharing its breed, then keep
# only the rows whose breed occurs more than 1000 times.
animalsPerBreed = shelterDf['Breed'].map(shelterDf['Breed'].value_counts())
shelterDf = shelterDf[animalsPerBreed > 1000]

# Report what survived the filter.
remainingBreeds = shelterDf['Breed'].nunique()
remainingAnimals = shelterDf['Breed'].count()
print('Number of Breeds: ' + str(remainingBreeds))
print('Number of Animals: ' + str(remainingAnimals))

In [None]:
# Look at the rows labelled 'Other': which breeds they are and what
# happened to them.
otherAnimals = shelterDf.loc[shelterDf['Animal Type'] == 'Other']
for column in ('Breed', 'Outcome Type'):
    print(otherAnimals[column].value_counts())

The animals in the 'other' category are all bats and I highly doubt bats are up for adoption. They will be dropped from the data set

In [None]:
# Keep every row whose animal type is anything except 'Other'.
isNotOther = shelterDf['Animal Type'].ne('Other')
shelterDf = shelterDf.loc[isNotOther]

In [None]:
# Bar chart of how many animals ended up with each outcome type.
outcomeCounts = shelterDf['Outcome Type'].value_counts()
outcomeCounts.plot.bar(title='Number of Each Outcome')
plt.show()

This seems to be a pretty effective shelter, although 'Transfer' doesn't mean the animal was adopted; rather, it was transferred to another facility.

In [None]:
# Visualise where values are missing: one cell per (row, column),
# highlighted where the entry is null.
missingMask = shelterDf.isnull()
sns.heatmap(missingMask)
plt.show()

In [None]:
shelterDf.isnull().sum()

Let's get rid of a few of those missing values.

In [None]:
# Remove rows that lack either the sex or the outcome type; both are
# required further down (the target is derived from 'Outcome Type').
# A single dropna(subset=...) replaces the two `isnull() == False` masks,
# and .copy() makes the result an independent frame so the new columns
# added in later cells are not written into a slice of the original.
shelterDf = shelterDf.dropna(subset=['Sex upon Outcome', 'Outcome Type']).copy()

In [None]:
shelterDf.isnull().sum()

The missing 'Age upon Outcome' values won't be a problem since I will be calculating age a different way below.

Will be best to just drop Name and Outcome Subtype (will drop later)

In [None]:
# Binary target: 1 when the outcome type is exactly 'Adoption', otherwise 0.
adoptedMask = shelterDf['Outcome Type'].eq('Adoption')
shelterDf['Was Adopted'] = adoptedMask.astype('int64')

In [None]:
# Binary feature: 1 when the animal has a recorded name, 0 when 'Name' is missing.
# notna() tests for missing values directly instead of inferring them from
# the Python type of each cell (`isinstance(x, float)`), which would also
# misclassify other missing markers such as None.
shelterDf['Has Name'] = shelterDf['Name'].notna().astype('int64')

In [None]:
# Convert 'DateTime' and 'Date of Birth' from strings to datetime64 columns.
#
# 'DateTime' is parsed on a 12-hour clock including its AM/PM marker.
# The previous version sliced the last three characters off and parsed the
# remainder with %H, which reads every PM timestamp as the matching AM hour.
# NOTE(review): assumes the raw values look like '02/13/2019 07:15:00 PM'
# (i.e. the three sliced characters were ' AM'/' PM') -- confirm against the CSV.
shelterDf['DateTime'] = pd.to_datetime(shelterDf['DateTime'], format='%m/%d/%Y %I:%M:%S %p')

# Birth dates that do not match the expected format become NaT and those
# rows are dropped, instead of aborting the whole cell on the first typo.
shelterDf['Date of Birth'] = pd.to_datetime(shelterDf['Date of Birth'], format='%m/%d/%Y', errors='coerce')
shelterDf = shelterDf.dropna(subset=['Date of Birth'])

In [None]:
# Derive the calendar month and year of each outcome from its timestamp.
outcomeStamps = shelterDf['DateTime']
shelterDf['Month'] = [stamp.month for stamp in outcomeStamps]
shelterDf['Year'] = [stamp.year for stamp in outcomeStamps]

In [None]:
# Replace the free-text 'Age upon Outcome' with the age in whole days,
# computed as outcome timestamp minus date of birth, for consistency.
# The day count is read from the timedelta itself rather than regex-matched
# out of its string form: the old regex took the first run of digits, so a
# negative duration such as '-1 days +07:15:00' silently became +1.
# With .dt.days such records (outcome dated before birth) stay negative
# and can be spotted as data errors.
ageAtOutcome = pd.to_timedelta(shelterDf['DateTime'] - shelterDf['Date of Birth'])
shelterDf['Age upon Outcome'] = ageAtOutcome.dt.days

In [None]:
# Data starts in 2013 and is incomplete
shelterDf['Year'].value_counts()

They started data collection in 2013 and it's only March 2019 currently, so it's expected that there are far fewer animals in those years.

In [None]:
# 'Year' is only usable as a predictor for fully recorded years, so the
# two partial years at either end of the data are excluded.
incompleteYears = [2013, 2019]
shelterDf = shelterDf[~shelterDf['Year'].isin(incompleteYears)]

In [None]:
shelterDf['Sex upon Outcome'].value_counts()

In [None]:
# Rows whose sex is recorded as 'Unknown' carry little information; remove them.
sexIsKnown = shelterDf['Sex upon Outcome'].ne('Unknown')
shelterDf = shelterDf.loc[sexIsKnown]

In [None]:
# Separate 'Sex upon Outcome' into two binary features:
#   'is male'  = 1 when the sex part is 'Male', else 0
#   'is fixed' = 1 when the prefix is 'Spayed' or 'Neutered', else 0
#                (0 implies the animal is not spayed or neutered)
# The pattern is a raw string so that \w and \s are regex escapes rather than
# invalid Python string escapes. Group 1 is the optional status word and
# group 2 is the sex itself.
sexRegex = re.compile(r'(\w*)\s*(Male|Female|Unknown)')

# One vectorized extraction yields both groups (column 0 = status, column 1 = sex).
# A value that does not match produces NaN in both columns and therefore 0 in
# both features, instead of raising on a failed search.
sexParts = shelterDf['Sex upon Outcome'].astype(str).str.extract(sexRegex.pattern)

shelterDf['is male'] = sexParts[1].eq('Male').astype('int64')
shelterDf['is fixed'] = sexParts[0].isin(['Spayed', 'Neutered']).astype('int64')


*Drop any columns that are no longer needed*

In [None]:
# Columns that are no longer needed:
#   Name, Outcome Subtype          - too many NaN
#   MonthYear                      - redundant
#   Color, Animal ID               - not used as features
#   DateTime, Date of Birth        - already turned into Month/Year/age
#   Sex upon Outcome, Outcome Type - already encoded as binary columns
obsoleteColumns = [
    'Date of Birth',
    'DateTime',
    'Name',
    'Outcome Subtype',
    'Outcome Type',
    'MonthYear',
    'Color',
    'Animal ID',
    'Sex upon Outcome',
]
# After removing them, discard any row that still contains a missing value.
shelterDf = shelterDf.drop(columns=obsoleteColumns).dropna()

In [None]:
shelterDf.isnull().sum()

In [None]:
# Renumber the rows 0..n-1 (discarding the old index, which has gaps after
# all the filtering) and preview the first three.
shelterDf = shelterDf.reset_index(drop=True)
shelterDf.head(3)

In [None]:
shelterDf.info()

In [None]:
# 2x2 grid of count plots.
#   top row:    animals per year / per month, split by adoption outcome
#   bottom row: total animals per year / per month
# The column names are passed as x=/hue= keywords together with data=.
# seaborn's countplot takes `data` as its first positional parameter and the
# rest keyword-only, so passing a Series positionally no longer plots that
# column on the x axis.
fig, axes = plt.subplots(figsize=(15, 10), ncols=2, nrows=2)
sns.countplot(data=shelterDf, x='Year', hue='Was Adopted', ax=axes[0, 0]).set_title('Adoptions per Year')
sns.countplot(data=shelterDf, x='Month', hue='Was Adopted', ax=axes[0, 1]).set_title('Adoptions per Month')
sns.countplot(data=shelterDf, x='Year', ax=axes[1, 0], color='cornflowerblue').set_title('Total Animals at Shelter per Year')
sns.countplot(data=shelterDf, x='Month', ax=axes[1, 1], color='cornflowerblue').set_title('Total Animals at Shelter per Month')

# Only the left-hand column carries a y-axis label; the right-hand one is blanked.
axes[0, 0].set_ylabel('Number of Animals')
axes[1, 0].set_ylabel('Number of Animals')
axes[0, 1].set_ylabel('')
axes[1, 1].set_ylabel('')
plt.show()

There appears to be a slight increase in adoptions from 2014 to 2018 while the total number of animals remains roughly the same. 

Adoptions in early months are significantly lower than in later months, but there are also fewer animals in the shelter in total. This makes sense since the increase in adoptions at the end of the year will result in people not needing a pet and fewer animals in the shelter. The late-year influx in adoptions is likely due to the holiday season and new pets being given as gifts.

In [None]:
# Animals per breed, split by whether they were adopted.
plt.figure(figsize=(12, 5))
sns.countplot(x='Breed', data=shelterDf, hue='Was Adopted')
plt.xticks(rotation=90)
plt.ylabel('Number of Animals')
# tight_layout must run after the rotated tick labels and the axis label
# exist; called earlier it sizes the margins for the unrotated labels.
plt.tight_layout()
plt.show()

It wouldn't seem that breed has much impact on whether an animal gets adopted either, so it should be dropped during pre-processing. 'Domestic Shorthair Mix' is the most common breed. I assume that is because it's difficult to classify cats any further than 'Short hair', 'Long hair', etc.

In [None]:
# Named vs. unnamed animals, split by whether they were adopted.
# The column names are passed as x=/hue= keywords with data=, because
# seaborn's countplot treats its first positional argument as `data`.
# The unused 1% random sample that used to be drawn here has been removed,
# since nothing ever read it.
plt.figure(figsize=(10, 5))
nameAxes = sns.countplot(data=shelterDf, x='Has Name', hue='Was Adopted')
nameAxes.set_ylabel('Number of Animals')
plt.show()

It is clear that animals with names were much more likely to get adopted. This could be because a named animal is more likely to be well behaved and healthy, since it would likely have come from a home rather than the streets. However, there could also be cases where the shelter names found animals itself.

In [None]:
# Two side-by-side count plots, each split by adoption outcome:
# left = sex of the animal, right = whether it was spayed/neutered.
f, axes = plt.subplots(ncols=2, figsize=(17, 5))

# (column to plot, x-axis label, tick labels for the values 0 and 1)
panelSpecs = (
    ('is male', 'Sex', ['Female', 'Male']),
    ('is fixed', 'Fixed', ['Was not Fixed', 'Was Fixed']),
)
for panel, (column, axisLabel, tickNames) in zip(axes, panelSpecs):
    sns.countplot(data=shelterDf, x=column, hue='Was Adopted', ax=panel)
    panel.set_xlabel(axisLabel)
    panel.set_ylabel('Number of Animals')
    panel.set_xticklabels(tickNames)

plt.show()

The sex of the animal doesn't seem to matter much in terms of adoption. In the 'Fixed' graph you can clearly see that very few animals that were adopted were not fixed. 'is fixed' will likely be a very strong predictor of animal adoption.

In [None]:
# Cats vs. dogs, split by whether they were adopted.
plt.figure(figsize=(10, 5))
typeAxes = sns.countplot(x='Animal Type', hue='Was Adopted', data=shelterDf)
typeAxes.set_ylabel('Number of Animals')
plt.show()

Adoption rates across Animal Type seem to be about the same, with a little less than half of both cats and dogs being adopted.

In [None]:
# Density of age at outcome, drawn as one curve per adoption outcome.
f = sns.FacetGrid(data=shelterDf, hue="Was Adopted", height=7)
f.map(sns.kdeplot, "Age upon Outcome").add_legend()
plt.show()

It is clear that the majority of adoptions, and outcomes in general, happen when the animals are young. There is a peak at about the 2-year mark (730 days). This could be due to the shelter having an age limit by which an animal needs to be adopted, transferred, etc. Age will likely be a strong predictor for the later models.

**Preprocessing for machine learning**

Based on the previous graphs, 'Breed', 'Animal Type', and 'is male' don't seem to have a significant impact on whether an animal will be adopted, so they will be dropped.

In [None]:
# Remove the three features the plots showed to have little bearing on adoption.
weakPredictors = ['Animal Type', 'Breed', 'is male']
shelterDf = shelterDf.drop(columns=weakPredictors)

'Sex upon Outcome' has already been encoded into binary features above (of which only 'is fixed' is kept), so every remaining column is numeric and can be used by a machine learning model.

In [None]:
shelterDf.head(3)

In [None]:
shelterDf.info()

Standardize values of 'Age upon Outcome', 'Month' and 'Year'

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# One scaler instance is reused (and re-fitted) for every column standardized below,
# so after the last fit_transform call it only holds the statistics of the final column.
scaler = StandardScaler()

In [None]:
# Standardize 'Age upon Outcome' (zero mean, unit variance).
# The scaler expects a 2-D (n_samples, 1) array, so the column is reshaped on
# the way in and flattened again on the way out.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before the
# train/test split further down -- consider fitting on the training rows only.
ageColumn = shelterDf['Age upon Outcome'].to_numpy().reshape(-1, 1)
shelterDf['Age upon Outcome'] = scaler.fit_transform(ageColumn)[:, 0]

In [None]:
# Standardize 'Month'; the scaler is re-fitted on this column alone.
monthColumn = shelterDf['Month'].to_numpy().reshape(-1, 1)
shelterDf['Month'] = scaler.fit_transform(monthColumn)[:, 0]

In [None]:
# Standardize 'Year'; the scaler is re-fitted on this column alone.
yearColumn = shelterDf['Year'].to_numpy().reshape(-1, 1)
shelterDf['Year'] = scaler.fit_transform(yearColumn)[:, 0]

In [None]:
shelterDf.head()

**Logistic Regression**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector: the adoption flag. Feature matrix: every other column.
y = shelterDf['Was Adopted']
X = shelterDf.drop(columns=['Was Adopted'])

In [None]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the metrics quoted in the write-up can be
# reproduced on a fresh kernel; stratify=y keeps the adopted/not-adopted
# ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier using the L-BFGS solver; no other hyperparameter is set explicitly.
logModel = LogisticRegression(solver='lbfgs')

In [None]:
# Fit the classifier on the training partition only.
logModel.fit(X_train, y_train)

In [None]:
# Predicted adoption labels for the held-out test rows; compared against y_test in the evaluation cell.
predictions = logModel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Evaluate the test-set predictions: the confusion matrix and the
# per-class precision/recall/f1 report.
logConf = confusion_matrix(y_test, predictions)
logClass = classification_report(y_test, predictions)

# Print the text report first, then the matrix.
for report in (logClass, logConf):
    print(report)

Logistic regression yielded fairly accurate results with an f1-score of 0.76. Let's perform cross-validation to confirm that this performance holds across different splits of the data and that logistic regression was a reasonable choice of model.

**Cross Validation**

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the same estimator over the full feature
# matrix; `scores` holds one score per fold.
scores = cross_val_score(estimator=logModel, X=X, y=y, cv=5)

In [None]:
# Summarise the folds as mean score +/- two standard deviations.
meanScore = scores.mean()
twoSigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (meanScore, twoSigma))

About the same accuracy across the several folds

**Conclusion**

Given how subjective some of the data in this data set is, Logistic Regression was still fairly accurate, with an accuracy of ~0.75. One improvement that could be made on this project would be feature engineering the 'breed' and 'color' features. For example, you could combine 'Brown/Black', 'Brown', 'Brown/White/Orange' all into one binary feature 'Is brown' and see if that has any relation with adoptions, but I doubt an animal's color is too important when someone is considering adoption.

Furthermore, if you wanted to predict multiple outcomes rather than just adoption, such as Adoption, Transfer, and Other, then this project could easily be modified to do so by not combining all non-adoption outcomes. However, based on the results of only trying to predict adoption, I find it likely the predictions will be very inaccurate.