In [1]:
import pandas as pd
from sklearn.cross_validation import KFold
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


N_ESTIMATORS = 100
# No feature selection above basics. I'll just let the Random Forest do its thing.
IGNORE_LIST = ['id', 'profit', 'responded', 'pmonths']
CUTOFF = 0.22

def k_split(df, k):
    """
    Split the training.csv set into k-folds, stored as a train, test dictionaries
    """
    train = {}
    test = {}
    kf = KFold(len(df), k, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        train[i] = df.ix[train_index]
        test[i] = df.ix[test_index]
        i += 1
    return train, test


def refactor(df):
    """
    Make non-numerical data fields numerical with int factors
    Save the factor definitions in a dictionary for later
    """
    factors = {}
    for c in df.columns:
        if df[c].dtype not in [float, int]:
            factors[c] = {}
            i = 0
            for f in df[c].dropna().unique():
                factors[c][f] = i
                i += 1
    for c, d in factors.iteritems():
        for k, v in d.iteritems():
            df.ix[df[c] == k, c] = int(v)
    return df, factors


def rf_predict(train, test, col, n=100, prob=False, ignore_list=['id', 'profit', 'responded', 'pmonths']):
    """
    If you just want it to predict the field, leave prob=False. If you want the probability of 1 vs. 0, make it True.
    Don't fit on fields that still have nulls in them (or that are chosen to be ignored)
    Output the estimates for col in the test dataframe.
    """
    if prob:
        not_fit = ignore_list
        not_fit.extend(train.columns[train.isnull().sum() > 0])
        not_fit.extend(test.columns[test.isnull().sum() > 0])
        not_fit.append(col)
        not_fit = set(not_fit)
        forest = RandomForestClassifier(n_estimators=n)
        forest = forest.fit(train[[c for c in train.columns if c not in not_fit]], train[col])
        output = [x[1] for x in forest.predict_proba(test[[c for c in test.columns if c not in not_fit]])]
    else:
        not_fit = ignore_list
        not_fit.extend(train.columns[train.isnull().sum() > 0])
        not_fit.extend(test.columns[test.isnull().sum() > 0])
        not_fit.append(col)
        not_fit = set(not_fit)
        forest = RandomForestClassifier(n_estimators=n)
        forest = forest.fit(train[[c for c in train.columns if c not in not_fit]], train[col].values.astype(np.int32))
        output = forest.predict(test[[c for c in test.columns if c not in not_fit]])

    return output, forest


def fill_in_nan(df, col, n=100):
    """
    Use the rf_predict with prob=False to guess what missing values are.
    """
    output, forest = rf_predict(df[df[col].notnull()], df[df[col].isnull()], col, n, False)
    df.ix[df[col].isnull(), col] = output
    return df


def calc_profit(test, prediction, cutoff):
    """
    Use the profit function given to calculate profits for a certain cutoff on forest.predict_proba.
    This is what is to be optimized vs. cutoff choice.
    """
    test['prediction'] = prediction
    test.ix[test['profit'].isnull(), 'profit'] = -30
    return sum(test.ix[test['prediction'] >= cutoff, 'profit'])



In [2]:
def main():
    train = pd.read_csv('data/DataTraining.csv')
#     train = train.replace("unknown", np.nan)
    test_original = pd.read_csv('data/DataPredict.csv')
#     test_original = test_original.replace("unknown", np.nan)
    test_original.columns = train.columns[:-3]
    
    test = test_original.copy()

    train, train_factors = refactor(train)
    test, test_factors = refactor(test)

    train = fill_in_nan(train, 'day_of_week', N_ESTIMATORS)
    train = fill_in_nan(train, 'schooling', N_ESTIMATORS)
    train = fill_in_nan(train, 'custAge', N_ESTIMATORS)
    test = fill_in_nan(test, 'day_of_week', N_ESTIMATORS)
    test = fill_in_nan(test, 'schooling', N_ESTIMATORS)
    test = fill_in_nan(test, 'custAge', N_ESTIMATORS)

    prediction, forest = rf_predict(train, test, 'responded', N_ESTIMATORS, prob=True, ignore_list=IGNORE_LIST)

    test_original['market_to'] = [1 if p > CUTOFF else 0 for p in prediction]

    test_original.to_csv('data/testingCandidate_output.csv', index=False)

if __name__=="__main__":
    main()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [22]:
IGNORE_LIST = ['id', 'profit', 'responded', 'pmonths']

choice = {}

df_original = pd.read_csv('data/DataTraining.csv')
df = df_original.copy()
df, factors = refactor(df)
df = fill_in_nan(df, 'day_of_week', 100)
df = fill_in_nan(df, 'schooling', 100)
df = fill_in_nan(df, 'custAge', 100)

for iterations in range(100):
    train, test = k_split(df, 10)

    prediction = {}
    forest = {}
    profit = {}
    profit['cutoff'] = np.linspace(0, 1, 101)

    for i in train:
        prediction[i], forest[i] = rf_predict(train[i], test[i], 'responded', 100, prob=True, ignore_list=IGNORE_LIST)
        profit[i] = []
        for c in profit['cutoff']:
            profit[i].append(calc_profit(test[i], prediction[i], c))

    df_profit = pd.DataFrame(profit)
    df_profit.set_index('cutoff', inplace=True)

    maxc = 0
    for c in enumerate(df_profit.sum(axis=1)):
        maxc = max(maxc, c[1])
        if maxc == c[1]:
            cutoff_choice = df_profit.index[c[0]]
            highest_profit = c[1] / len(train)
#     print 'Best choice for cutoff : {}'.format(cutoff_choice)
#     print '  with an expected profit of ${} per fold'.format(highest_profit)

#     df_profit.plot()

    try:
        choice[cutoff_choice] += 1
    except:
        choice[cutoff_choice] = 1

print choice

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


{0.25: 2, 0.18: 8, 0.2: 17, 0.22: 18, 0.24: 2, 0.17: 10, 0.19: 18, 0.21: 16, 0.23: 7, 0.16: 2}


In [19]:
df_original = pd.read_csv('data/DataTraining.csv')
df = df_original.copy()
train, test = k_split(df, 2)
train = train[0]
test = test[0]

train, train_factors = refactor(train)
test, test_factors = refactor(test)

train = fill_in_nan(train, 'day_of_week', 100)
train = fill_in_nan(train, 'schooling', 100)
train = fill_in_nan(train, 'custAge', 100)
test = fill_in_nan(test, 'day_of_week', 100)
test = fill_in_nan(test, 'schooling', 100)
test = fill_in_nan(test, 'custAge', 100)

prediction, forest = rf_predict(train, test, 'responded', 100, prob=True, ignore_list=['id', 'profit', 'responded', 'pmonths', 'nr.employed', 'cons.conf.idx', 'cons.price.idx', 'emp.var.rate', 'month', 'contact'])
pred = (np.asarray(prediction)>CUTOFF)*1
ground_truth = test['responded'].values.astype(np.int32)
print "Accuracy: ",np.mean(pred == ground_truth)

from sklearn.metrics import roc_auc_score
print "roc auc score original: ",roc_auc_score(ground_truth, prediction)
print "roc auc score: ",roc_auc_score(ground_truth, pred)
for i in np.linspace(0, 1, 101):
    profit = calc_profit(test, prediction, i)
    print '{} : {}'.format(i, profit)
    

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Accuracy:  0.87048414843942
roc auc score original:  0.7621166747639185
roc auc score:  0.6931891491513815
0.0 : -39764.0
0.01 : -35979.0
0.02 : -29721.0
0.03 : -21252.0
0.04 : -12349.0
0.05 : -4763.0
0.06 : 2488.0
0.07 : 9119.0
0.08 : 13604.0
0.09 : 16103.0
0.1 : 19698.0
0.11 : 23390.0
0.12 : 25916.0
0.13 : 26922.0
0.14 : 28559.0
0.15 : 28353.0
0.16 : 28665.0
0.17 : 29222.0
0.18 : 29591.0
0.19 : 29922.0
0.2 : 30434.0
0.21 : 30046.0
0.22 : 29707.0
0.23 : 27817.0
0.24 : 27808.0
0.25 : 27540.0
0.26 : 27442.0
0.27 : 27843.0
0.28 : 26819.0
0.29 : 26478.0
0.3 : 25797.0
0.31 : 25543.0
0.32 : 25311.0
0.33 : 23244.0
0.34 : 22846.0
0.35 : 21550.0
0.36 : 21547.0
0.37 : 21179.0
0.38 : 19553.0
0.39 : 18553.0
0.4 : 18373.0
0.41 : 17377.0
0.42 : 17407.0
0.43 : 17588.0
0.44 : 17738.0
0.45 : 16498.0
0.46 : 16348.0
0.47 : 15594.0
0.48 : 15594.0
0.49 : 14817.0
0.5 : 14237.0
0.51 : 13316.0
0.52 : 12735.0
0.53 : 12581.0
0.54 : 12190.0
0.55 : 11912.0
0.56 : 10819.0
0.57 : 9630.0
0.58 : 9660.0
0.59 : 9212.0

In [9]:
a = np.array(range(5))
print 1*(a>2)

[0 0 0 1 1]


In [24]:
import itertools

df_original = pd.read_csv('data/DataTraining.csv')
df = df_original.copy()
df, factors = refactor(df)
df = fill_in_nan(df, 'day_of_week', 100)
df = fill_in_nan(df, 'schooling', 100)
df = fill_in_nan(df, 'custAge', 100)
train, test = k_split(df, 2)

ignore_list = ['id', 'profit', 'responded']

selector = {}
# combo = itertools.combinations([c for c in df.columns if c not in ignore_list], 2)
for j in range(1, 7):
    combo = itertools.combinations(['cons.conf.idx', 'cons.price.idx', 'default', 'month', 'schooling', 'previous'], j)
    for c in combo:
        il = ignore_list[:]
        il.extend(c)
        print 'ignore_list : {}'.format(il)

        prediction = {}
        forest = {}
        profit = {}
        profit['cutoff'] = np.linspace(0, 1, 101)

        for t in train:
            prediction[t], forest[t] = rf_predict(train[t], test[t], 'responded', 100, prob=True, ignore_list=il)
            profit[t] = []
            for co in profit['cutoff']:
                profit[t].append(calc_profit(test[t], prediction[t], co))

        df_profit = pd.DataFrame(profit)
        df_profit.set_index('cutoff', inplace=True)

        maxc = 0
        for x in enumerate(df_profit.sum(axis=1)):
            maxc = max(maxc, x[1])
            if maxc == x[1]:
                cutoff_choice = df_profit.index[x[0]]
                highest_profit = x[1] / len(train)

        for i in c:
            try:
                selector[i] += highest_profit
            except:
                selector[i] = highest_profit
        print 'Best choice for cutoff : {}'.format(cutoff_choice)
        print '  with an expected profit of ${} per fold'.format(highest_profit)  
        print ''

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


ignore_list : ['id', 'profit', 'responded', 'cons.conf.idx']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Best choice for cutoff : 0.17
  with an expected profit of $32773.0 per fold

ignore_list : ['id', 'profit', 'responded', 'cons.price.idx']


ValueError: Number of features of the model must match the input. Model n_features is 20 and input n_features is 21 

In [19]:
train = pd.read_csv('data/DataTraining.csv')
train = train.replace("unknown", np.nan)
test_original = pd.read_csv('data/DataPredict.csv')
test_original = test_original.replace("unknown", np.nan)
test = test_original.copy()

train, train_factors = refactor(train)
test, test_factors = refactor(test)
# print train, train_factors

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [9]:
train = fill_in_nan(train, 'day_of_week', N_ESTIMATORS)

Index([u'custAge', u'profession', u'marital', u'schooling', u'default',
       u'housing', u'loan', u'contact', u'month', u'day_of_week', u'campaign',
       u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx',
       u'cons.conf.idx', u'euribor3m', u'nr.employed', u'pmonths',
       u'pastEmail', u'responded', u'profit', u'id'],
      dtype='object')
[0 1 2 ... 2 2 2]
