In [3]:
import pandas as pd
from sklearn.cross_validation import KFold
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
import sklearn

N_ESTIMATORS = 100
# No feature selection above basics. I'll just let the Random Forest do its thing.
IGNORE_LIST = ['id', 'profit', 'responded', 'pmonths']
CUTOFF = 0.5

def k_split(df, k):
    """
    Split the training.csv set into k-folds, stored as a train, test dictionaries
    """
    train = {}
    test = {}
    kf = KFold(len(df), k, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        train[i] = df.ix[train_index]
        test[i] = df.ix[test_index]
        i += 1
    return train, test


def refactor(df):
    """
    Make non-numerical data fields numerical with int factors
    Save the factor definitions in a dictionary for later
    """
    factors = {}
    for c in df.columns:
        if df[c].dtype not in [float, int]:
            factors[c] = {}
            i = 0
            for f in df[c].dropna().unique():
                factors[c][f] = i
                i += 1
    for c, d in factors.iteritems():
        for k, v in d.iteritems():
            df.ix[df[c] == k, c] = int(v)
    return df, factors


def rf_predict(train, test, col, n=100, prob=False, ignore_list=['id', 'profit', 'responded', 'pmonths'], classifier='AdaBoostClassifier'):
    """
    If you just want it to predict the field, leave prob=False. If you want the probability of 1 vs. 0, make it True.
    Don't fit on fields that still have nulls in them (or that are chosen to be ignored)
    Output the estimates for col in the test dataframe.
    classifier: RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
    """
    if prob:
        not_fit = ignore_list
        not_fit.extend(train.columns[train.isnull().sum() > 0])
        not_fit.extend(test.columns[test.isnull().sum() > 0])
        not_fit.append(col)
        not_fit = set(not_fit)
#         forest = RandomForestClassifier(n_estimators=n)
        forest = getattr(sklearn.ensemble, classifier)(n_estimators=n)
        forest = forest.fit(train[[c for c in train.columns if c not in not_fit]], train[col])
        output = [x[1] for x in forest.predict_proba(test[[c for c in test.columns if c not in not_fit]])]
    else:
        not_fit = ignore_list
        not_fit.extend(train.columns[train.isnull().sum() > 0])
        not_fit.extend(test.columns[test.isnull().sum() > 0])
        not_fit.append(col)
        not_fit = set(not_fit)
        forest = getattr(sklearn.ensemble, classifier)(n_estimators=n)
#         forest = RandomForestClassifier(n_estimators=n)
        forest = forest.fit(train[[c for c in train.columns if c not in not_fit]], train[col].values.astype(np.int32))
        output = forest.predict(test[[c for c in test.columns if c not in not_fit]])

    return output, forest


def fill_in_nan(df, col, n=100, classifier='AdaBoostClassifier'):
    """
    Use the rf_predict with prob=False to guess what missing values are.
    """
    output, forest = rf_predict(df[df[col].notnull()], df[df[col].isnull()], col, n, False, classifier = classifier)
    df.ix[df[col].isnull(), col] = output
    return df


def calc_profit(test, prediction, cutoff):
    """
    Use the profit function given to calculate profits for a certain cutoff on forest.predict_proba.
    This is what is to be optimized vs. cutoff choice.
    """
    test['prediction'] = prediction
    test.ix[test['profit'].isnull(), 'profit'] = -30
    return sum(test.ix[test['prediction'] >= cutoff, 'profit'])

In [32]:
def main():
    train = pd.read_csv('data/DataTraining.csv')
    train.loc[train.default == 'unknown', 'unknown'] = 'unknownnnn'
    train = train.replace("unknown", np.nan)
    test_original = pd.read_csv('data/DataPredict.csv')
    
#     print test_original.shape
#     print train.shape
    test_original.columns = train.columns[:-4]
    test = test_original.copy()
    test.loc[test.default == 'unknown', 'unknown'] = 'unknownnnn'
    test = test.replace("unknown", np.nan)
    
    model = 'AdaBoostClassifier'

    train, train_factors = refactor(train)
    test, test_factors = refactor(test)

    train = fill_in_nan(train, 'housing', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'day_of_week', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'schooling', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'custAge', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'day_of_week', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'schooling', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'custAge', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'housing', N_ESTIMATORS, model)
    
    prediction, forest = rf_predict(train, test, 'responded', N_ESTIMATORS, prob=True, ignore_list=IGNORE_LIST, classifier=model)

    test_original['market_to'] = [1 if p > CUTOFF else 0 for p in prediction]

    test_original.to_csv('data/testingCandidate_output.csv', index=False)

if __name__=="__main__":
    main()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [4]:
models = ['RandomForestClassifier', 
         'AdaBoostClassifier', 
         'BaggingClassifier', 
         'ExtraTreesClassifier', 
         'GradientBoostingClassifier']
for model in models:
    print "###################   {}   ##########".format(model)
    N_ESTIMATORS = 100
    # df_original = pd.read_csv('data/new_train.csv')
    df_original = pd.read_csv('data/DataTraining.csv')
    df = df_original.copy()
    df.loc[df.default == 'unknown', 'unknown'] = 'unknownnnn'
    df = df.replace("unknown", np.nan)
    train_, test = k_split(df, 5)
    train_ = train_[0]
    test = test[0]
    print train_.shape, test.shape
    train_N = train_.loc[train_.responded == 'yes']
    train_N_e = pd.concat([train_N]*((len(train_)-len(train_N))/len(train_N)), ignore_index=True)
    train = pd.concat([train_, train_N_e], ignore_index=True)
    train, train_factors = refactor(train)
    test, test_factors = refactor(test)
    print train.shape

    train = fill_in_nan(train, 'housing', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'day_of_week', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'schooling', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'custAge', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'day_of_week', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'schooling', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'custAge', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'housing', N_ESTIMATORS, model)

    prediction, forest = rf_predict(train, test, 
                                    'responded', 
                                    N_ESTIMATORS, 
                                    prob=True, 
                                    ignore_list=['id', 'profit', 'responded', 'pmonths'], 
                                    classifier = model)

    ground_truth = test['responded'].values.astype(np.int32)


    from sklearn.metrics import roc_auc_score
    print "roc auc score original: ",roc_auc_score(ground_truth, prediction)

    max_profit = -1000
    cutoff = -1
    for i in np.linspace(0, 1, 101):
        profit = calc_profit(test, prediction, i)
        if profit > max_profit:
            max_profit = profit
            cutoff = i
#         print '{} : {}'.format(i, profit)
    print "Best cutoff: ", cutoff
    print "Best profit: ", max_profit
    pred = (np.asarray(prediction)>cutoff)*1
    print "Accuracy: ",np.mean(pred == ground_truth)
    print "roc auc score: ",roc_auc_score(ground_truth, pred)

###################   RandomForestClassifier   ##########
(6509, 25) (1628, 25)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


(12029, 25)
roc auc score original:  0.7403961481786094
Best cutoff:  0.29
Best profit:  7256.0
Accuracy:  0.8716216216216216
roc auc score:  0.6879696671513263
###################   AdaBoostClassifier   ##########
(6509, 25) (1628, 25)
(11837, 25)
roc auc score original:  0.7863747793062277
Best cutoff:  0.5
Best profit:  10819.0
Accuracy:  0.8071253071253072
roc auc score:  0.7215638455969211
###################   BaggingClassifier   ##########
(6509, 25) (1628, 25)
(11909, 25)
roc auc score original:  0.6664438025959207
Best cutoff:  0.73
Best profit:  4900.0
Accuracy:  0.9035626535626535
roc auc score:  0.5779756810726001
###################   ExtraTreesClassifier   ##########
(6509, 25) (1628, 25)
(11813, 25)
roc auc score original:  0.7413472111155538
Best cutoff:  0.22
Best profit:  8794.0
Accuracy:  0.831081081081081
roc auc score:  0.6678411968545914
###################   GradientBoostingClassifier   ##########
(6509, 25) (1628, 25)
(11853, 25)
roc auc score original:  0.73404