In [1]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model, metrics
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from matplotlib import rcParams
#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (3, 2)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
rcParams['agg.path.chunksize'] = 10000



In [2]:
# Reading datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submit = pd.read_csv("Sample_Submission.csv")

In [3]:
## Cleaning Data
train['Self_Employed'].fillna('No',inplace=True)
test['Self_Employed'].fillna('No',inplace=True)
table = train.pivot_table(values='LoanAmount', index='Self_Employed' ,columns='Education', aggfunc=np.median)
def get_train_loan_amount(x):
    return table.loc[x['Self_Employed'],x['Education']]
train['LoanAmount'].fillna(train[train['LoanAmount'].isnull()].apply(get_train_loan_amount, axis=1), inplace=True)
## Repeating this for test data
test_table = test.pivot_table(values='LoanAmount', index='Self_Employed' ,columns='Education', aggfunc=np.median)
test_table

def get_test_loan_amount(x):
    return test_table.loc[x['Self_Employed'],x['Education']]

# Replace missing values
test['LoanAmount'].fillna(test[test['LoanAmount'].isnull()].apply(get_test_loan_amount, axis=1), inplace=True)
train['LoanAmount_log'] = np.log(train['LoanAmount'])
test['LoanAmount_log'] = np.log(test['LoanAmount'])
train['TotalIncome'] = train['ApplicantIncome'] + train['CoapplicantIncome']
train['TotalIncome_log'] = np.log(train['TotalIncome'])
test['TotalIncome'] = test['ApplicantIncome'] + test['CoapplicantIncome']
test['TotalIncome_log'] = np.log(test['TotalIncome'])
train['Gender'].fillna('Male', inplace=True)
test['Gender'].fillna('Male', inplace=True)

train['Married'].fillna('Yes', inplace=True)
test['Married'].fillna('Yes', inplace=True)

train['Dependents'].fillna('0', inplace=True)
test['Dependents'].fillna('0', inplace=True)

train['Loan_Amount_Term'].fillna(360, inplace=True)
test['Loan_Amount_Term'].fillna(360, inplace=True)

train['Credit_History'].fillna(0, inplace=True)
test['Credit_History'].fillna(0, inplace=True)

In [4]:
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
le = LabelEncoder()
for i in var_mod:
    train[i] = le.fit_transform(train[i].astype('str'))
test_copy = test.copy()
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area']
le = LabelEncoder()
for i in var_mod:
    test[i] = le.fit_transform(test[i].astype('str'))

In [5]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LoanAmount_log,TotalIncome,TotalIncome_log
0,LP001002,1,0,0,0,0,5849,0.0,130.0,360.0,1.0,2,1,4.867534,5849.0,8.674026
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0,4.85203,6091.0,8.714568
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1,4.189655,3000.0,8.006368
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1,4.787492,4941.0,8.505323
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1,4.94876,6000.0,8.699515


In [6]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    print("BEST", gs.best_params_, gs.best_score_, gs.grid_scores_)
    best = gs.best_estimator_
    return best
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, mask=None, reuse_split=None, score_func=None, n_folds=5, n_jobs=1):
    subdf=indf[featurenames]
    X=subdf.values
    y=(indf[targetname].values==target1val)*1
    if mask !=None:
        print("using mask")
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print("using reuse split")
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print("############# based on standard predict ################")
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data:     %0.2f" % (test_accuracy))
    print(confusion_matrix(ytest, clf.predict(Xtest)))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest

In [7]:
itrain, itest = train_test_split(range(train.shape[0]), train_size=0.6)
mask=np.ones(train.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
mask[:10]

array([ True, False, False, False, False, False, False, False, False, False], dtype=bool)

In [8]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
parameters = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }
itrain, itest = train_test_split(range(train.shape[0]), train_size=0.6)
predictors = ['Credit_History', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'TotalIncome_log', 'LoanAmount_log']
clfsvm, Xtrain, ytrain, Xtest, ytest=do_classify(model, parameters, train, predictors,'Loan_Status', 1, mask=mask)
Xtr=np.concatenate((Xtrain, Xtest))



using mask
BEST {'criterion': 'entropy', 'max_depth': 2, 'max_leaf_nodes': 10, 'min_samples_leaf': 1, 'min_samples_split': 2} 0.7554347826086957 [mean: 0.59511, std: 0.02381, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}, mean: 0.62772, std: 0.02072, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 10}, mean: 0.64674, std: 0.01753, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 20}, mean: 0.62500, std: 0.03620, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'min_samples_split': 2}, mean: 0.62500, std: 0.03620, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'min_samples_split': 10}, mean: 0.67935, std: 0.02474, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None,

In [100]:
y= train['Loan_Status']
y = y.values
subdf=train.ix[:, 1:]
subdf = subdf.ix[:, subdf.columns != 'Loan_Status']

In [81]:
subdf.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,LoanAmount_log,TotalIncome,TotalIncome_log
0,1,0,0,0,0,5849,0.0,130.0,360.0,1.0,2,4.867534,5849.0,8.674026
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,4.85203,6091.0,8.714568
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,4.189655,3000.0,8.006368
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,4.787492,4941.0,8.505323
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,4.94876,6000.0,8.699515


In [84]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth =  10, n_estimators = 40)
model.fit(subdf, y)

test = test.ix[:, 1:]
predictions = model.predict(test)
# parameters = {"criterion": ["gini", "entropy"],
#               "min_samples_split": [2, 10, 20],
#               "max_depth": [None, 2, 5, 10],
#               "min_samples_leaf": [1, 5, 10],
#               "max_leaf_nodes": [None, 5, 10, 20],
#               "n_estimators": [20, 40, 60]
#               }
#itrain, itest = train_test_split(range(train.shape[0]), train_size=0.6)
#clfsvm, Xtrain, ytrain, Xtest, ytest=do_classify(model, parameters, train, predictors,'Loan_Status', 1, mask=mask)
#Xtr=np.concatenate((Xtrain, Xtest))

In [108]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth =  10, n_estimators = 40)
parameters = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              "n_estimators": [20, 40, 60]
              }
itrain, itest = train_test_split(range(train.shape[0]), train_size=0.6)
clfsvm, Xtrain, ytrain, Xtest, ytest=do_classify(model, parameters, train, predictors,'Loan_Status', 1, mask=mask)
Xtr=np.concatenate((Xtrain, Xtest))



using mask
BEST {'criterion': 'entropy', 'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 20} 0.7336956521739131 [mean: 0.70652, std: 0.03284, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}, mean: 0.70109, std: 0.04339, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}, mean: 0.70380, std: 0.04603, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 60}, mean: 0.73098, std: 0.05003, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 20}, mean: 0.71739, std: 0.03425, params: {'criterion': 'gini', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 10, '

In [85]:
final = test_copy[['Loan_ID']]

In [86]:
final['Loan_Status'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [87]:
final = final.apply(lambda x : x.map({1:'Y', 0: 'N'}))

In [88]:
final['Loan_ID'] = test_copy[['Loan_ID']]

In [89]:
final.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,N
4,LP001051,Y


In [90]:
final.to_csv("Sample_Submission.csv", index=False)