### Predict Blood Donations (hosted by DrivenData.org)
#### Entry-level supervised multivariate classification problem
#### Matthew Grant 

In [1]:
import numpy as np
import pandas as pd

# import our graphics tools
%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns  

# Keep seaborn's 'Set2' colour palette in `set2`.
# NOTE(review): `set2` is not referenced again in the visible cells — confirm it is still needed.
set2 = sns.color_palette('Set2')

# Use matplotlib's 'ggplot' style sheet for all later plots
# (a setting borrowed from 'Bayesian Methods for Hackers', per the original note).
plt.style.use('ggplot')

In [2]:
# Load the labelled training rows into `df` and the unlabelled test rows into `Test`.
# Both files are read from the notebook's working directory.
train_path, test_path = 'Train.csv', 'Test.csv'
df = pd.read_csv(train_path)
Test = pd.read_csv(test_path)

In [3]:
# Preview the first five rows of the raw training data (original column names).
df.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


In [4]:
# Preview the first five rows of the raw test data (same columns, minus the label).
Test.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,659,2,12,3000,52
1,276,21,7,1750,38
2,263,4,1,250,4
3,303,11,11,2750,38
4,83,4,12,3000,34


In [5]:
# Very small dataset: the recorded output reports 576 rows x 6 columns for train
# and 200 rows x 5 columns for test (the test file has no label column).
df.shape, Test.shape

((576, 6), (200, 5))

In [8]:
# Keep a handle on the original (verbose) column labels before replacing them.
original_col_names = df.columns

# Short names, assigned by position. The test frame has the same leading
# columns as the training frame; the training frame adds the label last.
namesTest = ['ID', 'recency', 'frequency', 'cc', 'time']
names = namesTest + ['donated']

Test.columns = namesTest
df.columns = names

df.head()

Unnamed: 0,ID,recency,frequency,cc,time,donated
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


In [54]:
# Keep the test-set IDs aside; they are written into the submission file later.
Ids = Test['ID']
Ids.shape

(200,)

In [75]:
# Count missing (null) values per column of the training frame.
# NOTE(review): the recorded output lists 'donated_Time' and 'last_first', so this
# cell was last executed after the feature-engineering cells that appear further down.
df.isnull().sum()

ID              0
recency         0
frequency       0
donated_Time    0
last_first      0
time            0
donated         0
dtype: int64

In [77]:
# Feature columns = every training column except the last one (the label).
# NOTE(review): the recorded output shows 'ID' in this list, so the row identifier
# is passed to the models as a feature — confirm that is intended.
feature_column_names = df.columns[:-1]
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(feature_column_names)

Index([u'ID', u'recency', u'frequency', u'donated_Time', u'last_first',
       u'time'],
      dtype='object')


In [78]:
# The test frame has no label column, so all of its columns are features.
feature_column_names_test = Test.columns[:]
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(feature_column_names_test)

Index([u'ID', u'recency', u'frequency', u'donated_Time', u'last_first',
       u'time'],
      dtype='object')


In [79]:
# The label is the last column of the training frame.
label_column_name = df.columns[-1]
# Print the stored name (not a recomputed value); form is valid on Python 2 and 3.
print(label_column_name)

donated


In [65]:
# Engineered ratio features, computed the same way for train and test.

# Donation rate: number of donations per month since the first donation (frequency / time).
donate_time = df['frequency'].div(df['time'])
donate_timetest = Test['frequency'].div(Test['time'])

# Recency ratio: months since last donation over months since first donation (recency / time).
last_first = df['recency'].div(df['time'])
last_firstTest = Test['recency'].div(Test['time'])

In [67]:
# Attach the engineered ratios as new columns, one frame at a time.
df['donated_Time'] = donate_time
df['last_first'] = last_first

Test['donated_Time'] = donate_timetest
Test['last_first'] = last_firstTest

In [71]:
# Rearrange the training columns so the label stays last.
# 'cc' is not in the list, so it is dropped from the frame here.
train_columns = ['ID', 'recency', 'frequency', 'donated_Time', 'last_first', 'time', 'donated']
df = df[train_columns]

In [73]:
# Rearrange the test columns into the same order as the training features.
# 'cc' is not in the list, so it is dropped from the frame here.
test_columns = ['ID', 'recency', 'frequency', 'donated_Time', 'last_first', 'time']
Test = Test[test_columns]

In [74]:
# Check the test frame after adding the ratio features and reordering the columns.
Test.head()

Unnamed: 0,ID,recency,frequency,donated_Time,last_first,time
0,659,2,12,0.230769,0.038462,52
1,276,21,7,0.184211,0.552632,38
2,263,4,1,0.25,1.0,4
3,303,11,11,0.289474,0.289474,38
4,83,4,12,0.352941,0.117647,34


In [80]:
# Set the training features: every column of df except the trailing label column.
# The columns are taken from df as it is now, not from the feature_column_names
# list captured in an earlier cell. Run top to bottom, that list would still
# contain the dropped 'cc' column and lack the engineered ones, raising a KeyError.
# NOTE(review): 'ID' is still among the features here — confirm that is intended.
x_train = df[df.columns[:-1]].astype(float)
x_train.shape

(576, 6)

In [81]:
# Set the training target as a 1-D NumPy array of the 'donated' labels.
# .values is used instead of Series.ravel(): the Series is already 1-D, and
# Series.ravel() is deprecated in recent pandas releases.
y_train = df['donated'].values
y_train.shape

(576,)

In [82]:
# Set the test features using exactly the columns (and column order) of x_train,
# so the models are always scored on the same feature set they were fitted on.
# This also avoids a column list captured before the feature-engineering cells ran.
x_test = Test[x_train.columns].astype(float)
x_test.shape

(200, 6)

In [83]:
# Logistic regression on all supplied features
def logReg(x_train, y_train, test_features=None):
    """Fit an L2-penalised logistic regression and score the test rows.

    Parameters
    ----------
    x_train : training feature matrix.
    y_train : training labels.
    test_features : optional feature matrix to score. When omitted, the
        notebook-level ``x_test`` is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    1-D array holding the second column of ``predict_proba`` for each scored
    row. For 0/1 labels this is presumably P(label == 1), since scikit-learn
    orders the columns by ``classes_`` — verify if the labels ever change.
    """
    from sklearn.linear_model import LogisticRegression

    if test_features is None:
        # Backward-compatible fallback to the notebook global.
        test_features = x_test

    clf = LogisticRegression(penalty='l2', fit_intercept=True)
    clf.fit(x_train, y_train)
    return clf.predict_proba(test_features)[:, 1]

In [89]:
def Rf(x_train, y_train, test_features=None):
    """Fit a depth-3 decision tree and score the test rows.

    Despite the name, this fits a single ``DecisionTreeClassifier``, not a
    random forest. The name is kept so existing calls keep working.

    Parameters
    ----------
    x_train : training feature matrix.
    y_train : training labels.
    test_features : optional feature matrix to score. When omitted, the
        notebook-level ``x_test`` is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    1-D array holding the second column of ``predict_proba`` for each scored
    row. For 0/1 labels this is presumably P(label == 1), since scikit-learn
    orders the columns by ``classes_`` — verify if the labels ever change.
    """
    from sklearn.tree import DecisionTreeClassifier

    if test_features is None:
        # Backward-compatible fallback to the notebook global.
        test_features = x_test

    clf_tree = DecisionTreeClassifier(max_depth=3)
    clf_tree.fit(x_train, y_train)
    return clf_tree.predict_proba(test_features)[:, 1]

In [84]:
# Fit the logistic regression and get one predicted probability per test row
# (the recorded shape is (200,), matching the 200 test records).
output = logReg(x_train,y_train)
output.shape

(200,)

In [90]:
# Same for the tree model (Rf fits a DecisionTreeClassifier, not a random forest).
# NOTE(review): outputRf is not passed to submission() in the visible cells.
outputRf = Rf(x_train,y_train)

In [93]:
def submission(output, name, ids=None):
    """Write predictions to ``'submission' + name + '.csv'``.

    The file has two columns: an unnamed ID column followed by
    'Made Donation in March 2007'. No index column is written.

    Parameters
    ----------
    output : predicted values, one per ID.
    name : suffix inserted between 'submission' and '.csv'.
    ids : optional IDs to write in the first column. When omitted, the
        notebook-level ``Ids`` is used, which is what this function always
        did before the parameter existed.
    """
    if ids is None:
        # Backward-compatible fallback to the notebook global.
        ids = Ids

    label = "Made Donation in March 2007"
    # Explicit `columns` fixes the column order on every Python/pandas version;
    # without it the order depends on how the dict is ordered.
    frame = pd.DataFrame({"": ids, label: output}, columns=["", label])
    frame.to_csv('submission' + name + '.csv', index=False)

In [95]:
# Write the logistic-regression predictions; the empty suffix yields 'submission.csv'.
submission(output,'')