In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../data/heloc_dataset_v1.csv')

In [5]:
df=pd.DataFrame(df)

data_cutoffs = {}

cols=['ExternalRiskEstimate', 'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'NumInqLast6M', 'NumInqLast6Mexcl7days']
for i in cols:
    percentiles = pd.Series(df[i])
    x=percentiles.quantile([.4,.6])
    vals = x.values                            
    if i==cols[0] or i==cols[6]:
         data_cutoffs[i]=[vals[0],vals[1]]

    else:
        data_cutoffs[i]=[vals[1],vals[0]]

print(data_cutoffs)

{'ExternalRiskEstimate': [68.0, 74.0], 'NumTrades60Ever2DerogPubRec': [0.0, 0.0], 'NumTrades90Ever2DerogPubRec': [0.0, 0.0], 'MaxDelq2PublicRecLast12M': [7.0, 6.0], 'MaxDelqEver': [8.0, 6.0], 'NumInqLast6M': [1.0, 0.0], 'NumInqLast6Mexcl7days': [0.0, 1.0]}


In [8]:
def label_y(row):
    if row['RiskPerformance'] == 'Good':
        return 1
    else:
        return 0

In [9]:
df['y'] = df.apply(lambda row: label_y(row),axis=1)

In [10]:
del df['RiskPerformance']

In [11]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    
# The function `init_classifiers` returns a list of classifiers to be trained on the datasets
def init_classifiers():
    return([(SVC(), model_names[0], param_grid_svc), 
            (LogisticRegression(), model_names[1], param_grid_logistic),
            (KNeighborsClassifier(), model_names[2], param_grid_knn),
            (GaussianNB(), model_names[3], param_grid_nb),
            (DecisionTreeClassifier(), model_names[4], param_grid_tree),
            (RandomForestClassifier(), model_names[6], param_grid_rf),
            (AdaBoostClassifier(), model_names[7], param_grid_boost)
           ])

# 'model_names' contains the names  that we will use for the above classifiers
model_names = ['SVM','LR','KNN','NB','Tree','QDA','RF','Boosting']

# the training parameters of each model
param_grid_svc = [{'C':[0.1,1],'kernel':['rbf','linear'], 'max_iter':[-1],'random_state':[1]}]
param_grid_logistic = [{'C':[0.1,1], 'penalty':['l1','l2'],'random_state':[1]}]
param_grid_knn = [{},{'n_neighbors':[1,2,3,4]}]
param_grid_nb = [{}]
param_grid_tree = [{'random_state':[1]},{'criterion':['gini'], 'max_depth':[2,3], 'min_samples_split':[3,5],'random_state':[1]}]
param_grid_rf = [{'random_state':[1]},{'n_estimators':[10,30],'max_features':[0.2, 0.3], 'bootstrap':[True],'random_state':[1]}]
param_grid_boost = [{'random_state':[1]},{'n_estimators':[10,20],'learning_rate':[0.1,1],'random_state':[1]}]

In [21]:
X = df.drop('y', axis= 1)
X = X[:1000]
Y = df['y']
Y = Y[:1000]


In [22]:
models = init_classifiers()

In [23]:
for model in models:
    
    X_train, X_test, y_train, y_test = train_test_split(X,Y)

    clf = GridSearchCV(model[0],model[2],cv = 2)

    clf.fit(X_train,y_train)

    y_pred = clf.predict(X_test)

    print('Model ', model)
    acc = accuracy_score(y_test,y_pred)
    print('Accuracy ', acc)

Model  (SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'SVM', [{'C': [0.1, 1], 'kernel': ['rbf', 'linear'], 'max_iter': [-1], 'random_state': [1]}])
Accuracy  0.74
Model  (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'LR', [{'C': [0.1, 1], 'penalty': ['l1', 'l2'], 'random_state': [1]}])
Accuracy  0.728
Model  (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'), 'KNN', [{}, {'n_neighbors': [1, 2, 3, 4]}])
Accuracy  0.66
Model  (GaussianNB(priors=None), 'NB', [{}])
Accuracy  0.688
Mod