In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,log_loss

### Split Given Training Data into Training and Test Sets

In [2]:
train = pd.read_csv("data/train.csv")

# Separate out neutered status from sex: SexuponOutcome is split on whitespace
# and its first token is kept. Rows with a missing SexuponOutcome are dropped
# first because they cannot be split. .copy() detaches the filtered frame so
# that adding a column below does not raise SettingWithCopyWarning.
train = train.dropna(subset=["SexuponOutcome"]).copy()
train["neutered_status"] = train.SexuponOutcome.str.split().str[0]

# Split into test and train subsets.
frac = .6 # use 60% data for training
# Dedicated seeded generator so the split is the same on every
# Restart & Run All; the seed value itself is arbitrary.
split_rng = np.random.RandomState(0)
# NOTE: despite its name, test_rows holds the index labels sampled for the
# TRAINING subset; every remaining row goes to the test subset.
test_rows = split_rng.choice(train.index.values
            ,int(round( len(train)*frac )),replace=False )
# .loc is the label-based replacement for .ix, which was removed in pandas 1.0.
mytrain = train.loc[test_rows]
mytest  = train.drop(test_rows)

### Choose Features and Fit a Logistic Regression (Logit) Classifier

In [3]:
# prepare training data
features = ["AnimalType",'neutered_status']

# One-hot encode the selected feature columns of the training subset.
mytrain_features = pd.get_dummies(mytrain[features])
mytrain_outcomes = mytrain.OutcomeType
# Encoding the test subset on its own yields only the categories present in
# that subset. Reindex onto the training columns so both matrices have the
# same columns in the same order: a category missing from the test subset
# becomes an all-zero column, and one never seen in training is dropped.
mytest_features = pd.get_dummies(mytest[features]).reindex(
    columns=mytrain_features.columns, fill_value=0)
mytest_outcomes = mytest.OutcomeType

# train (trailing ';' suppresses the estimator repr in the cell output)
classifier = LogisticRegression()
classifier.fit(mytrain_features,mytrain_outcomes);

### Measure Success/Failure

For reference, the 50th-percentile entry on the Kaggle leaderboard has log-loss = 0.84434.

In [4]:
# Hard class predictions and per-class probabilities for the held-out rows.
predictions = classifier.predict(mytest_features)
prediction_probabilities = classifier.predict_proba(mytest_features)

# check performance
acc = accuracy_score(mytest_outcomes, predictions)
# predict_proba columns follow classifier.classes_ (the classes seen while
# fitting). Pass the raw labels together with labels=classifier.classes_ so
# log_loss uses that same class set and order, even if a rare class is
# missing from the test subset.
los = log_loss(mytest_outcomes, prediction_probabilities,
               labels=classifier.classes_)
print("accuracy = {0:1.2f} \nlog-loss = {1:1.3f}".format(acc,los) )

accuracy = 0.61 
log-loss = 0.996


### Speculate on the Reasons for Success/Failure

In [5]:
# Number of rows per OutcomeType in the training subset.
mytrain_outcomes.value_counts()

Adoption           6467
Transfer           5595
Return_to_owner    2900
Euthanasia          949
Died                126
Name: OutcomeType, dtype: int64

In [6]:
# Number of rows per predicted class; classes the classifier never predicts do not appear.
pd.Series(predictions).value_counts()

Adoption    7461
Transfer    3230
dtype: int64

In [7]:
# Number of rows per true OutcomeType in the test subset, for comparison with the predicted counts.
mytest_outcomes.value_counts()

Adoption           4302
Transfer           3827
Return_to_owner    1885
Euthanasia          606
Died                 71
Name: OutcomeType, dtype: int64

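Observation: the classifier only ever predicts the two most frequent classes (Adoption and Transfer), although all five outcomes occur in the test subset. Hypothesis: with only these two categorical features, logistic regression captures only the most dominant input/output correlation?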