In [169]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn import cross_validation, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score,log_loss, f1_score
from sklearn.grid_search import GridSearchCV 
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [170]:
def animalDF(file):
    df = pd.read_csv(file)
    print(df.info())
    return df

# Data Exploration

Importing in the training data. Here we can see that some of the features have null values in them. Lets see how exactly how much from each column.

In [171]:
animals = animalDF('files/train.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.0+ MB
None


Here we see that we have four features that have missing values "Name", "OutcomeSubtype", "SexuponOutcome", and "AgeuponOutcome." Both "Name" and "OutcomeSubtype" are missing a significant amount of data within them while "SexuponOutcome" and "AgeuponOutcome" aren't missing that many. We'll fix these in a moment.

In [172]:
animals.isnull().sum()

AnimalID              0
Name               7691
DateTime              0
OutcomeType           0
OutcomeSubtype    13612
AnimalType            0
SexuponOutcome        1
AgeuponOutcome       18
Breed                 0
Color                 0
dtype: int64

Lets look at the first five rows of this data set.

In [173]:
animals.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


Above I can see several things that I want to change. I see values such as "Neutered Male" and "Spayed Female" in the "SexuponOutcome" feature. This can be separated into two different features itself, the sex of the animal and whether or not the animal is spayed, neutered, or intact. Also the feature "AgeuponOutcome" I would like to get into one catergory of age such as years, months, weeks, or days. "Breed" and "Color" are also two features that I may want to change.

In [174]:
# Splitting SexuponOutcome feature into sex and neutered.
def sexSplit(data):
    sex = str(data)
    if sex.find('Male') >= 0:
        return 'Male'
    elif sex.find('Female') >= 0:
        return 'Female'
    else:
        return 'Unknown'
    
def intactSplit(data):
    intact = str(data)
    if intact.find('Neutered') >= 0 or intact.find('Spayed') >= 0:
        return 'Neutered'
    else:
        return "Unknown"
    
animals['Sex'] = animals.SexuponOutcome.apply(sexSplit)
animals['Neutered'] = animals.SexuponOutcome.apply(intactSplit)

In [175]:
animals = animals.drop(['SexuponOutcome', 'OutcomeSubtype'], axis=1)

In [176]:
from breedSize import smallDog, medDog, largeDog

In [177]:
smallBreed = smallDog()
medBreed = medDog()
largeBreed = largeDog()

In [178]:
def breedSize(data):
    size = str(data)
    for small in smallBreed:
        if size.find(small) >= 0:
            return 'Small'
        else:
            return 'Unknown'
animals['Size'] = animals.Breed.apply(breedSize)

In [179]:
animals['Size'].value_counts()

Unknown    26719
Small         10
Name: Size, dtype: int64

In [180]:
animals

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,AgeuponOutcome,Breed,Color,Sex,Neutered,Size
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,1 year,Shetland Sheepdog Mix,Brown/White,Male,Neutered,Unknown
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Cat,1 year,Domestic Shorthair Mix,Cream Tabby,Female,Neutered,Unknown
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Dog,2 years,Pit Bull Mix,Blue/White,Male,Neutered,Unknown
3,A683430,,2014-07-11 19:09:00,Transfer,Cat,3 weeks,Domestic Shorthair Mix,Blue Cream,Male,Unknown,Unknown
4,A667013,,2013-11-15 12:52:00,Transfer,Dog,2 years,Lhasa Apso/Miniature Poodle,Tan,Male,Neutered,Unknown
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Dog,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,Female,Unknown,Unknown
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Cat,3 weeks,Domestic Shorthair Mix,Blue Tabby,Male,Unknown,Unknown
7,A701489,,2015-04-30 17:02:00,Transfer,Cat,3 weeks,Domestic Shorthair Mix,Brown Tabby,Unknown,Unknown,Unknown
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,Dog,5 months,American Pit Bull Terrier Mix,Red/White,Female,Neutered,Unknown
9,A677747,,2014-05-03 07:48:00,Adoption,Dog,1 year,Cairn Terrier,White,Female,Neutered,Unknown
