In [1]:
from sklearn import cross_validation, preprocessing
from sklearn import tree, metrics, neighbors, linear_model, svm, ensemble
import xgboost
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/wells_fargo_data.csv")

In [20]:
def data_label_split(df):
    label = df.normal_tot_bal.copy()
    label[label >= 0] = 1
    label[label < 0] = 0
    data = df.drop("normal_tot_bal", axis=1)
    return data, label
    
def folds_to_split(data,targets,train,test):
    data_tr = pd.DataFrame(data).iloc[train]
    data_te = pd.DataFrame(data).iloc[test]
    labels_tr = pd.DataFrame(targets).iloc[train]
    labels_te = pd.DataFrame(targets).iloc[test]
    return [data_tr, data_te, labels_tr, labels_te]

def run_cross_validation(x_train, y_train, classifiers):
    result_df = pd.DataFrame()
    foldnum = 0
    for train, val in cross_validation.KFold(len(x_train), shuffle=True, n_folds=5, random_state=0):
        foldnum += 1
        [tr_data, val_data, tr_targets, val_targets] = folds_to_split(x_train, y_train, train, val)
        tr_targets = tr_targets.as_matrix().ravel()
        val_targets = val_targets.as_matrix().ravel()

        for classfier_name, clf in classifiers.iteritems():
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            accuracy = metrics.roc_auc_score(prediction, val_targets)
            result_df.loc[foldnum, classfier_name] = accuracy
    return result_df

In [21]:
data, label = data_label_split(df)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    data, label, test_size=0.1, random_state=0)

In [22]:
Classifiers = {"Decision tree": tree.DecisionTreeClassifier(),
               "Random Forest": ensemble.RandomForestClassifier(n_estimators=20),
               "KNN": neighbors.KNeighborsClassifier(),
               "Logistic regression": linear_model.LogisticRegression(),
               "Ada Boost": ensemble.AdaBoostClassifier(),
               "XGBoost": xgboost.XGBClassifier(n_estimators=20),
              }
print run_cross_validation(x_train, y_train, Classifiers)

        KNN  Decision tree   XGBoost  Logistic regression  Ada Boost  \
1  0.919199       0.960603  0.943619             0.900240   0.943823   
2  0.919711       0.957811  0.948497             0.895487   0.948755   
3  0.924496       0.957693  0.948577             0.907414   0.949126   
4  0.923235       0.956330  0.946406             0.909724   0.946338   
5  0.928655       0.962299  0.944673             0.902916   0.945131   

   Random Forest  
1       0.966207  
2       0.970797  
3       0.971510  
4       0.968790  
5       0.968524  
