In [1]:
import pandas as pd
from sklearn.cross_validation import KFold
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
import sklearn.ensemble

N_ESTIMATORS = 100

IGNORE_LIST = ['id', 'profit', 'responded', 'pmonths']
CUTOFF = 0.5

def k_split(df, k):
    """
    Split the training.csv set into k-folds, stored as a train, test dictionaries
    """
    train = {}
    test = {}
    kf = KFold(len(df), k, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        train[i] = df.ix[train_index]
        test[i] = df.ix[test_index]
        i += 1
    return train, test


def refactor(df):
    """
    Make non-numerical data fields numerical with int factors
    Save the factor definitions in a dictionary for later
    """
    factors = {}
    for c in df.columns:
        if df[c].dtype not in [float, int]:
            factors[c] = {}
            i = 0
            for f in df[c].dropna().unique():
                factors[c][f] = i
                i += 1
    for c, d in factors.iteritems():
        for k, v in d.iteritems():
            df.ix[df[c] == k, c] = int(v)
    return df, factors


def rf_predict(train, test, col, n=100, prob=False, ignore_list=['id', 'profit', 'responded', 'pmonths'], classifier='AdaBoostClassifier'):
    """
    If you just want it to predict the field, leave prob=False. If you want the probability of 1 vs. 0, make it True.
    Don't fit on fields that still have nulls in them (or that are chosen to be ignored)
    Output the estimates for col in the test dataframe.
    classifier: RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
    """
    if prob:
        not_fit = ignore_list
        not_fit.extend(train.columns[train.isnull().sum() > 0])
        not_fit.extend(test.columns[test.isnull().sum() > 0])
        not_fit.append(col)
        not_fit = set(not_fit)
        forest = getattr(sklearn.ensemble, classifier)(n_estimators=n)
        forest = forest.fit(train[[c for c in train.columns if c not in not_fit]], train[col])
        output = [x[1] for x in forest.predict_proba(test[[c for c in test.columns if c not in not_fit]])]
    else:
        not_fit = ignore_list
        not_fit.extend(train.columns[train.isnull().sum() > 0])
        not_fit.extend(test.columns[test.isnull().sum() > 0])
        not_fit.append(col)
        not_fit = set(not_fit)
        forest = getattr(sklearn.ensemble, classifier)(n_estimators=n)
        forest = forest.fit(train[[c for c in train.columns if c not in not_fit]], train[col].values.astype(np.int32))
        output = forest.predict(test[[c for c in test.columns if c not in not_fit]])

    return output, forest


def fill_in_nan(df, col, n=100, classifier='AdaBoostClassifier'):
    """
    Use the rf_predict with prob=False to guess what missing values are.
    """
    if df[col].isnull().any() == False:
        return df
    else:
        output, forest = rf_predict(df[df[col].notnull()], df[df[col].isnull()], col, n, False, classifier = classifier)
        df.ix[df[col].isnull(), col] = output
        return df


def calc_profit(test, prediction, cutoff):
    """
    Use the profit function given to calculate profits for a certain cutoff on forest.predict_proba.
    This is what is to be optimized vs. cutoff choice.
    """
    test['prediction'] = prediction
    test.ix[test['profit'].isnull(), 'profit'] = -30
    return sum(test.ix[test['prediction'] >= cutoff, 'profit'])



### Choose model

In [2]:
models = ['RandomForestClassifier', 
         'AdaBoostClassifier', 
         'BaggingClassifier',
         'ExtraTreesClassifier', 
         'GradientBoostingClassifier']

for model in models:
    print "###################   {}   ##########".format(model)
    N_ESTIMATORS = 100
    
    df_original = pd.read_csv('data/DataTraining.csv')
    df = df_original.copy()
    df = df.replace("unknown", np.nan)
    # split the dataset, use 20% data to validate
    train_, test = k_split(df, 5)
    train_ = train_[0]
    test = test[0]
    print 'The shape of training and test dataset ',train_.shape, test.shape
    # resample the training data where responded is 'yes'.
    train_N = train_.loc[train_.responded == 'yes']
    train_N_e = pd.concat([train_N]*((len(train_)-len(train_N))/len(train_N)), ignore_index=True)
    train = pd.concat([train_, train_N_e], ignore_index=True)
    train, train_factors = refactor(train)
    test, test_factors = refactor(test)
    print 'Deal with imbalanced sample, resample the minority class ', train.shape
    
    # Using the existance data to predict the missing data except the 'default' column.
    train = fill_in_nan(train, 'profession', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'marital', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'loan', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'housing', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'day_of_week', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'schooling', N_ESTIMATORS, model)
    train = fill_in_nan(train, 'custAge', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'day_of_week', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'schooling', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'custAge', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'housing', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'profession', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'marital', N_ESTIMATORS, model)
    test = fill_in_nan(test, 'loan', N_ESTIMATORS, model)
    train.default[train.default.isna()] = 1
    test.default[test.default.isna()] = 1
    
    prediction, forest = rf_predict(train, test, 
                                    'responded', 
                                    N_ESTIMATORS, 
                                    prob=True, 
                                    ignore_list=['id', 'profit', 'responded', 'pmonths'], 
                                    classifier = model)

    ground_truth = test['responded'].values.astype(np.int32)


    from sklearn.metrics import roc_auc_score
    print "roc auc score original: ",roc_auc_score(ground_truth, prediction)

    max_profit = -1000
    cutoff = -1
    for i in np.linspace(0, 1, 101):
        profit = calc_profit(test, prediction, i)
        if profit > max_profit:
            max_profit = profit
            cutoff = i
#         print '{} : {}'.format(i, profit)
    print "Best cutoff: ", cutoff
    print "Best profit: ", max_profit
    pred = (np.asarray(prediction)>cutoff)*1
    print "Accuracy: ",np.mean(pred == ground_truth)
    print "roc auc score: ",roc_auc_score(ground_truth, pred)

###################   RandomForestClassifier   ##########
The shape of training and test dataset  (6509, 24) (1628, 24)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Deal with imbalanced sample, resample the minority class  (12359, 24)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


roc auc score original:  0.7566474708656022
Best cutoff:  0.23
Best profit:  14991.0
Accuracy:  0.8433660933660934
roc auc score:  0.733550210842318
###################   AdaBoostClassifier   ##########
The shape of training and test dataset  (6509, 24) (1628, 24)
Deal with imbalanced sample, resample the minority class  (11797, 24)
roc auc score original:  0.7885364989369242
Best cutoff:  0.5
Best profit:  13341.0
Accuracy:  0.7794840294840295
roc auc score:  0.7330402320636856
###################   BaggingClassifier   ##########
The shape of training and test dataset  (6509, 24) (1628, 24)
Deal with imbalanced sample, resample the minority class  (11901, 24)
roc auc score original:  0.7544920793176028
Best cutoff:  0.37
Best profit:  11940.0
Accuracy:  0.8931203931203932
roc auc score:  0.7008397031128837
###################   ExtraTreesClassifier   ##########
The shape of training and test dataset  (6509, 24) (1628, 24)
Deal with imbalanced sample, resample the minority class  (1229

## Best model is GradientBoostingClassifier
#### If use AdaBoostClassifier, the responded probability doesn't reasonable, they almost close to 0.5, even though it has highest performance.   And we want to maximize the profit, so we chose GradientBoostingClassifier.  It has more reasonable probability distribution.
##### see more details in below

In [3]:
model = 'GradientBoostingClassifier'
# In experiment N_ESTIMATORS equal to 100 or 500 or 1000, have the similar performance
N_ESTIMATORS = 100
TRAIN_PATH = 'data/DataTraining.csv'
TEST_PATH = 'data/DataPredict.csv'
CUTOFF = cutoff

### Data clean
###### Using the existance data to predict the missing data except the 'default' column.  
###### In there, we suppose 'unknown' in 'default' is a variable, because the 'default' column just has 'no' and 'unknown'.

In [5]:
train = pd.read_csv(TRAIN_PATH)
train = train.replace("unknown", np.nan)
train, train_factors = refactor(train)
train = fill_in_nan(train, 'profession', N_ESTIMATORS, model)
train = fill_in_nan(train, 'marital', N_ESTIMATORS, model)
train = fill_in_nan(train, 'loan', N_ESTIMATORS, model)
train = fill_in_nan(train, 'housing', N_ESTIMATORS, model)
train = fill_in_nan(train, 'day_of_week', N_ESTIMATORS, model)
train = fill_in_nan(train, 'schooling', N_ESTIMATORS, model)
train = fill_in_nan(train, 'custAge', N_ESTIMATORS, model)
# train = fill_in_nan(train, 'default', N_ESTIMATORS, model)
train.default[train.default.isna()] = 1
    
train[train.profit.isna()==False].head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,custAge,profession,marital,schooling,default,housing,loan,contact,month,day_of_week,...,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,pmonths,pastEmail,responded,profit,id
7310,37.0,4,2,2,0,1,1,0,4,3,...,1.4,93.444,-36.1,4.964,5228.1,999.0,0,1,154.0,7311
7311,56.0,10,1,0,0,0,0,0,6,3,...,-3.4,92.649,-30.1,0.716,5017.5,0.1,1,1,464.0,7312
7312,57.0,2,2,1,1,0,1,1,2,0,...,1.4,94.465,-41.8,4.959,5228.1,999.0,0,1,-205.0,7313
7313,27.0,0,2,1,0,1,1,0,3,0,...,-1.1,94.601,-49.5,0.972,4963.6,999.0,3,1,146.0,7314
7314,53.0,4,2,1,0,1,1,0,7,2,...,-1.8,93.369,-34.8,0.655,5008.7,999.0,0,1,151.0,7315


### Creating a linear regression function to predict the profit for the responded people.
##### Getting training set

In [6]:
train_ls = train[train.profit.isna()==False]
X_train = train_ls.values.astype(np.float)[:, :-3]
y_train = train_ls.values.astype(np.float)[:, -2]
print X_train.shape, y_train.shape

(827, 21) (827,)


##### Checking all entries are not equal to np.nan 

In [7]:
np.argwhere(np.isnan(X_train)==True)

array([], shape=(0, 2), dtype=int64)

In [8]:
regr = LinearRegression()
regr.fit(X_train, y_train)

y_p = regr.predict(X_train)
from sklearn.metrics import mean_squared_error, r2_score
r2 = r2_score(y_p, y_train)
print r2

0.771199153853492


### Predict responded

In [9]:
test_original = pd.read_csv('data/DataPredict.csv')
test_original.columns = train.columns[:-3]
test_original.head()

Unnamed: 0,custAge,profession,marital,schooling,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,pmonths,pastEmail
0,31.0,services,single,high.school,no,no,no,cellular,jul,thu,...,999,0,nonexistent,1.4,93.918,-42.7,4.968,5228.1,999.0,0
1,32.0,services,married,basic.4y,no,no,no,cellular,nov,mon,...,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,999.0,0
2,39.0,blue-collar,married,basic.9y,no,yes,no,cellular,apr,wed,...,999,1,failure,-1.8,93.075,-47.1,1.445,5099.1,999.0,5
3,32.0,admin.,single,,no,yes,no,cellular,jul,thu,...,15,1,success,-1.7,94.215,-40.3,0.846,4991.6,0.5,1
4,,services,married,unknown,no,yes,no,telephone,jun,,...,999,0,nonexistent,1.4,94.465,-41.8,4.864,5228.1,999.0,0


#### deal with test set

In [10]:
test = test_original.copy()
test = test.replace("unknown", np.nan)
test, test_factors = refactor(test)
test = fill_in_nan(test, 'day_of_week', N_ESTIMATORS, model)
test = fill_in_nan(test, 'schooling', N_ESTIMATORS, model)
test = fill_in_nan(test, 'custAge', N_ESTIMATORS, model)
test = fill_in_nan(test, 'housing', N_ESTIMATORS, model)
test = fill_in_nan(test, 'profession', N_ESTIMATORS, model)
test = fill_in_nan(test, 'marital', N_ESTIMATORS, model)
test = fill_in_nan(test, 'loan', N_ESTIMATORS, model)
test.default[test.default.isna()] = 1

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [11]:
prediction, forest = rf_predict(train, test, 
                                'responded', 
                                N_ESTIMATORS, 
                                prob=True, 
                                ignore_list=IGNORE_LIST, 
                                classifier=model)

In [12]:
CUTOFF = 0.48
test_original['responded'] = prediction
a = np.asarray(prediction)
print a[:20]

[0.04312325 0.24771598 0.09010783 0.80000462 0.04384085 0.0461067
 0.22276779 0.05572345 0.33770916 0.19911338 0.03218517 0.04224419
 0.03824021 0.04802755 0.11776579 0.09190339 0.04294944 0.05509091
 0.05156615 0.03906581]


### Use Linear Regression model we get above to predict the profit in test set

In [13]:
X_test = test.values.astype(np.float)
print X_test.shape

(929, 21)


In [14]:
y_test_pred = regr.predict(X_test)
test_original['profit'] = y_test_pred
print y_test_pred[:20]

[371.91840679 393.08206816 408.22015996 425.49038038 389.61983914
 383.51565271 135.79500067 406.63475948 439.54427195 169.29444556
 365.27850153 439.10806842 440.97709716 429.94907913 177.85139189
 394.29286822 371.78189383 108.91289203 182.4252078  403.03748792]


#### $$profit = Pr[responded]\times pred\_profit + (-30)\times(1-Pr[responded])$$

In [15]:
Profit = a*y_test_pred + (-30)*(1-a)
test_original['market_to'] = 1*(Profit>0)
print 'Total profit prediction: ', np.sum(Profit[Profit>0])

Total profit prediction:  23204.6980913792


In [126]:
test_original.to_csv('data/testingCandidate_output.csv', index=False)