In [1]:
import pandas as pd
import numpy as np
import time
from copy import deepcopy
from pprint import pprint

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, \
GradientBoostingClassifier, AdaBoostClassifier

from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from evaluate_metrics import plot_learning_curve

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Notebook-wide default figure size, given as (width, height).
rcParams['figure.figsize'] = (12, 4)

### Prepare data

In [2]:
# Read the training and test CSV files, then show both shapes as the cell output.
dftrain, dftest = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))
(dftrain.shape, dftest.shape)

((10500, 15), (38342, 15))

In [3]:
# Column roles: `index` is the row identifier, `target` is the label column,
# and every other column of the training frame is used as a predictor.
index = 'Id'
target = 'earn_over_4k_euros_per_year'
predictors = [x for x in dftrain.columns if x not in [target, index]]

# Base random forest classifier (default settings) to compare missing value methods.
rfc = RandomForestClassifier()

# Predictors listed here are treated as continuous; all remaining predictors
# are treated as categorical.
continuous_predictors = ['age',
                         'income_from_investment_sources',
                         'losses_from_investment_sources',
                         'number_of_years_of_education',
                         'working_hours_per_week']
categorical_predictors = [col for col in predictors if col not in continuous_predictors]

# Cast the categorical predictors to the pandas 'category' dtype.
# Each frame is converted from its OWN column: assigning the training column
# into the test frame would overwrite the test data with index-aligned
# training values (and NaN for every test row beyond the training length).
for col in categorical_predictors:
    dftrain[col] = dftrain[col].astype('category')
    dftest[col] = dftest[col].astype('category')

### Fill in missing values

In [4]:
# Forward fill: every missing entry takes the most recent non-missing value
# above it in the same column.
dftrain_ffill = dftrain.ffill()
dftest_ffill = dftest.ffill()

# Forward fill cannot reach missing values at the very top of a column
# (there is no earlier row to copy from), so whatever is still missing is
# filled with the value from the last row of the SAME frame.
dftrain_ffill = dftrain_ffill.fillna(value=dftrain_ffill.iloc[-1])
dftest_ffill = dftest_ffill.fillna(value=dftest_ffill.iloc[-1])

### Helper functions

In [5]:
def modelfit(estimator, model, dftrain, predictors, useTrainCV=True, cv_folds=10):
    """Cross-validate an estimator, print a short report, then fit it on all rows.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``cross_validation.cross_val_score`` and then
        fitted with ``estimator.fit(X, y)``.
    model : str
        Human-readable label printed in the report header.
    dftrain : pandas.DataFrame
        Frame holding the ``predictors`` columns and the column named by the
        module-level ``target`` variable.
    predictors : list
        Names of the columns used as features.
    useTrainCV : bool, optional
        Currently unused; kept so existing callers keep working.
    cv_folds : int, optional
        Number of cross-validation folds. Also the divisor for the reported
        average time per fold.

    Returns
    -------
    object
        Whatever ``estimator.fit`` returns after fitting on the full frame.
    """
    features = dftrain[predictors].values
    labels = dftrain[target].values

    # time.clock does not exist on newer Python 3 releases; prefer
    # perf_counter when the interpreter offers it and fall back to clock
    # otherwise, so older interpreters behave exactly as before.
    timer = getattr(time, 'perf_counter', None) or time.clock

    # Cross-validation. The fold count must be the same cv_folds used as the
    # divisor below, otherwise both the scores and the timing are misreported.
    tic = timer()
    cv_scores = cross_validation.cross_val_score(
        estimator=estimator,
        X=features,
        y=labels,
        scoring='accuracy',
        cv=cv_folds)
    toc = timer()

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Model report ({})'.format(model))
    print('Accuracy (CV): {}'.format(np.mean(cv_scores)))
    print('Time (Avg.): {}'.format((toc - tic) / cv_folds))

    # Final fit on every available row.
    predictor = estimator.fit(features, labels)

    return predictor
    

### LDA

In [7]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with default settings, evaluated and fitted on
# the forward-filled training frame.
ldc = LinearDiscriminantAnalysis()
model = 'Linear Discriminant Analysis'
ldc_predictor = modelfit(estimator=ldc,
                         model=model,
                         dftrain=dftrain_ffill,
                         predictors=predictors)

Model report (LDA)
Accuracy (CV): 0.821523809524
Time (Avg.): 0.0544422


### NMC

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 1, evaluated and fitted on the
# forward-filled training frame.
# NOTE(review): the report label below is misspelled ("Neirghbor"); it is kept
# unchanged here so the printed report stays identical.
nmc = KNeighborsClassifier(n_neighbors=1)
model = 'Nearest Neirghbor'
nmc_predictor = modelfit(estimator=nmc,
                         model=model,
                         dftrain=dftrain_ffill,
                         predictors=predictors)

Model report (Nearest Neirghbor)
Accuracy (CV): 0.81980952381
Time (Avg.): 0.1672859


### Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, evaluated and fitted on the
# forward-filled training frame.
nbc = GaussianNB()
model = 'Gaussian Naive Bayes'
nbc_predictor = modelfit(estimator=nbc,
                         model=model,
                         dftrain=dftrain_ffill,
                         predictors=predictors)

Model report (Gaussian Naive Bayes)
Accuracy (CV): 0.80780952381
Time (Avg.): 0.0121213


### SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, evaluated and fitted on the
# forward-filled training frame.
svc = SVC(kernel='linear')
model = 'SVM'
svc_predictor = modelfit(estimator=svc,
                         model=model,
                         dftrain=dftrain_ffill,
                         predictors=predictors)