# Feature Engineering: Draft 1

1. Data Cleaning / EDA
2. Oversampling (SMOTE + ENN)
3. Feature Engineering/Creation
    1. Timeseries Clustering
        1. `PAY`, `PAY_AMT`
    2. Encoding Domain Expertise
        1. X
        2. X
        3. X
4. Outlier Removal
5. Feature Selection
    1. Weight-based
    2. PCA

# Preliminaries

* **Imports**
* **Data Loading**

In [1]:
# general tools
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#sns.set_style("dark")
#sns.set_context("paper")

# preprocessing
import scipy.cluster.hierarchy as sch

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm, metrics, preprocessing

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours

from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

import get_variable_name

In [2]:
# data imports

### EDIT FILEPATH IF NECESSARY
root = '.'
data_dir = '/DataFiles/'

# build the folder path, then the path of each CSV inside it
data_path = root + data_dir
train_file, test_file = (data_path + csv_name
                         for csv_name in ('CreditCard_train.csv', 'CreditCard_test.csv'))

# load both splits with identical parsing options and column renames
_df_train, _df_test = (
    pd.read_csv(csv_path, index_col=0, header=1).rename(
        columns={'PAY_0':'PAY_1', 'default payment next month':'DEFAULT'})
    for csv_path in (train_file, test_file)
)

# Data Cleaning & EDA


## Data Cleaning

To be cleaned:
* `AGE`: Creation of 'bins'
* `EDUCATION`: Grouping categories `4,5,6,0`
* `PAY_n`: Categorical/One-Hot Encoding
* `SEX`: Categorical/One-Hot Encoding
* `MARRIAGE`: Grouping categories `0` and `3`

In [3]:
# work on copies so the raw frames (_df_train / _df_test) stay untouched
df_train, df_test = _df_train.copy(), _df_test.copy()

In [4]:
# this dataset can be changed for whatever after feature selection

# split X, y: the target is the last column, every other column is a feature
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# standardise the features with statistics learned from the training split
# only; scaling the test split with its own mean/variance would put the two
# splits on different scales
_split_scaler = StandardScaler().fit(X_train)
X_train = _split_scaler.transform(X_train)
X_test = _split_scaler.transform(X_test)

# flatten the labels into 1-D numpy arrays for the SVM
# (selecting a single column yields a Series, so no np.reshape is needed)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

## Pipelining Datasets

The Feature Engineering section has cleaned, augmented, and reduced the original dataset into **three sets of features**, each for a corresponding Feature Selection method. Each method and their corresponding DataFrame (excl. target var.) is listed below:

1. Weighted Approach: `df_weighted_features`
2. Heuristic Approach: `df_nbr_features`
3. PCA Approach: `df_pca_25`

In [5]:
# feat eng pipeline
def feat_eng_pipeline(df):
    '''
    Returns engineered dataset.

    The input frame is left untouched; a new frame is returned in which:

    * 'AGE' is replaced by the one-hot encoded bins (20,40], (40,60], (60,100]
    * 'EDUCATION' codes 0/4/5/6 are merged into 4 and 'MARRIAGE' code 3 is
      merged into 0, then 'SEX', 'EDUCATION', 'MARRIAGE' and 'AGE_BIN' are
      one-hot encoded and the originals dropped
    * one-hot 'PAY_CLUSTER_*' / 'PAY_AMT_CLUSTER_*' columns are added from the
      notebook-level models `_kmeans_pay[3]` and `_kmeans_pay_amt[3]`
    * 'AVG_PAY', 'AVG_BILL_AMT', 'AVG_PAY_AMT' are added via `get_avg`
    * 'SUFF' is 1 where AVG_BILL_AMT <= AVG_PAY_AMT, else 0
    * 'AVG_PAY_DELTA' is the mean of the five PAY_(n+1) - PAY_n differences
    * 'PAY_DELAY_FREQ' counts PAY_n >= 3, 'PAY_TIMELY_FREQ' counts PAY_n <= 0,
      'REPAY_FREQ' counts PAY_AMTn > 0

    df : DataFrame holding the AGE, EDUCATION, MARRIAGE, SEX, PAY_1..6,
         BILL_AMT1..6 and PAY_AMT1..6 columns.
    '''
    # Work on a copy: the previous in-place edits changed the caller's frame
    # (e.g. dropped 'AGE'), so a second call on the same frame raised KeyError.
    df = df.copy()

    # create bins for 'AGE', then drop the raw column
    df['AGE_BIN'] = pd.cut(df['AGE'], bins=[20, 40, 60, 100], labels=[1, 2, 3])
    df = df.drop(['AGE'], axis=1)

    # Group 4,5,6,0 categories for 'EDUCATION'
    ed_map = {1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 0: 4}
    df['EDUCATION'] = df['EDUCATION'].map(ed_map)
    # Group 0, 3 categories for 'MARRIAGE'
    marr_map = {0: 0, 1: 1, 2: 2, 3: 0}
    df['MARRIAGE'] = df['MARRIAGE'].map(marr_map)

    # one-hot encode the categoricals and replace the originals
    categoricals = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE_BIN']
    cat_df = pd.get_dummies(df[categoricals].astype('category'))
    df = df.join(cat_df).drop(categoricals, axis=1)

    # define feature groups
    pay_features = ['PAY_{}'.format(i) for i in range(1, 7)]
    pay_amt_features = ['PAY_AMT{}'.format(i) for i in range(1, 7)]
    bill_features = ['BILL_AMT{}'.format(i) for i in range(1, 7)]

    # cluster memberships from the pre-fitted models
    # NOTE(review): a cluster that never occurs in `df` gets no dummy column,
    # so train and test frames may end up with different columns -- confirm.
    df = df.join(_cluster_dummies(df, pay_features, _kmeans_pay[3], 'PAY_CLUSTER'))
    df = df.join(_cluster_dummies(df, pay_amt_features, _kmeans_pay_amt[3],
                                  'PAY_AMT_CLUSTER'))

    # average repayment status
    df['AVG_PAY'] = get_avg(df, pay_features)

    # 'sufficiency': average payment covers the average bill
    df['AVG_BILL_AMT'] = get_avg(df, bill_features)
    df['AVG_PAY_AMT'] = get_avg(df, pay_amt_features)
    df['SUFF'] = np.where(df['AVG_BILL_AMT'] <= df['AVG_PAY_AMT'], 1, 0)

    # average change in MoM repayment status: diff along the columns gives
    # PAY_2-PAY_1 ... PAY_6-PAY_5; the first column has no predecessor and is
    # dropped before averaging
    df['AVG_PAY_DELTA'] = df[pay_features].diff(axis=1).iloc[:, 1:].mean(axis=1)

    # frequency variables, counted directly instead of via scratch columns;
    # missing values compare False and so contribute 0, as they did before
    df['PAY_DELAY_FREQ'] = (df[pay_features] >= 3).sum(axis=1)
    df['PAY_TIMELY_FREQ'] = (df[pay_features] <= 0).sum(axis=1)
    df['REPAY_FREQ'] = (df[pay_amt_features] > 0).sum(axis=1)

    return df


def _cluster_dummies(df, features, model, name):
    '''One-hot encode `model.predict(df[features])` as `name`_<k> columns indexed like `df`.'''
    clusters = pd.DataFrame({name: model.predict(df[features])})
    clusters[name] = clusters[name].astype('category')
    dummies = pd.get_dummies(clusters)
    # the predictions are positional, so re-attach the caller's index
    dummies.index = df.index
    return dummies

### Making Datasets

HA features:['AVG_BILL_AMT',
 'AVG_PAY_AMT',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'LIMIT_BAL',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6',
 'PAY_AMT_CLUSTER_0',
 'PAY_AMT_CLUSTER_1',
 'PAY_AMT_CLUSTER_2']

PA features:
array(['AGE_BIN_1', 'AGE_BIN_2', 'AGE_BIN_3', 'AVG_BILL_AMT', 'AVG_PAY',
       'AVG_PAY_AMT', 'AVG_PAY_DELTA', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'EDUCATION_1',
       'EDUCATION_2', 'EDUCATION_3', 'EDUCATION_4', 'LIMIT_BAL',
       'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'PAY_AMT_CLUSTER_0', 'PAY_AMT_CLUSTER_1', 'PAY_AMT_CLUSTER_2',
       'PAY_CLUSTER_0', 'PAY_CLUSTER_1', 'PAY_CLUSTER_2', 'PAY_CLUSTER_3',
       'PAY_DELAY_FREQ', 'PAY_TIMELY_FREQ', 'REPAY_FREQ', 'SEX_1',
       'SEX_2', 'SUFF'], dtype=object)

PCA features: the same data from feature engineered

In [6]:
# run the feature-engineering pipeline on both splits
df_test_eng = feat_eng_pipeline(df_test) #preprocessing
df_train_eng = feat_eng_pipeline(df_train)

## SCALING
# NOTE(review): this single scaler is re-fitted by every fit_transform call
# below, so after this cell it only holds the statistics of the test frame.
scaler = StandardScaler()

# scaling unscaled train data
# NOTE(review): df_weighted_features / df_nbr_features are not defined in the
# visible part of the notebook -- presumably produced by the feature-selection
# section; confirm they exist before this cell runs.
df_weighted_features_scaled = pd.DataFrame(scaler.fit_transform(df_weighted_features.values), 
                        index=df_weighted_features.index, columns=df_weighted_features.columns)
df_nbr_features_scaled = pd.DataFrame(scaler.fit_transform(df_nbr_features.values), 
                        index=df_nbr_features.index, columns=df_nbr_features.columns)

# scaling test data
# NOTE(review): fit_transform standardises the test frame with its own
# mean/variance rather than the training statistics -- likely unintended;
# confirm whether a train-fitted scaler's transform() should be used here.
df_test_eng_scaled = pd.DataFrame(scaler.fit_transform(df_test_eng.values), 
                        index=df_test_eng.index, columns=df_test_eng.columns)


# test-side counterparts of the three training feature sets
# (get_df_features is defined elsewhere; presumably it selects the listed columns -- verify)
df_test_weighted_features = get_df_features(df_test_eng_scaled.copy(), df_weighted_features.columns.values)
df_test_nbr_features = get_df_features(df_test_eng_scaled.copy(), df_nbr_features.columns.values)
df_test_pca_features = df_test_eng_scaled.copy()


NameError: name '_kmeans_pay' is not defined

In [None]:
# training matrices, one per feature-selection approach
X_train_WA = df_weighted_features_scaled
X_train_HA = df_nbr_features_scaled
X_train_PA = df_pca_25

# matching test matrices
X_test_WA = df_test_weighted_features
X_test_HA = df_test_nbr_features
# Project the *scaled* engineered test frame. The unscaled df_test_eng was
# passed here before, which left df_test_pca_features unused and put the PCA
# test matrix on a different scale from the other two test matrices.
# NOTE(review): assumes pca_25 was fitted on standardised features -- confirm.
X_test_PA = pca_25.transform(df_test_pca_features)

# each dataset is a [train, test] pair ...
dataset_WA = [X_train_WA, X_test_WA]
dataset_HA = [X_train_HA, X_test_HA]
dataset_PA = [X_train_PA, X_test_PA]

# ... stored together with a human-readable description used in reports
datasets = [
    [dataset_WA, 'Weighted Approach Dataset'],
    [dataset_HA, 'Heuristic Approach Dataset'],
    [dataset_PA, 'PCA Approach Dataset'],
]


# target var => y values



In [None]:
# sanity check: shapes of the Heuristic Approach test and train matrices
print(X_test_HA.shape, X_train_HA.shape)

In [None]:
# summary statistics of the Weighted Approach training features
X_train_WA.describe()

# Motivation

In the larger scope of ML algorithms, we've now determined that the most appropriate model for our situation is the SVM.

However, these models have many intricacies that vastly change their performance and usability; we will therefore extend our model selection to pick the most appropriate parameters for our model.

For this, we will take a "Darwinist" approach. By testing each of these parameters separately against a "baseline" SVM model (as defined by sklearn) and picking the best-performing value of each, we should, hypothetically, be able to create the "best" model for our dataset. However, there are some ramifications of not testing certain parameters together, which we will detail and discuss.

The SVM parameters we are evaluating are as follows:

* The type of kernel
* The degree for polynomial kernels
* The gamma kernel coefficient
* The shrinking heuristic
* The strength of the regularization parameter


However, it is also worth noting that we won't experiment with some parameters. For example, we will use the same tolerance for the stopping criterion (1e-3) throughout, as we want to keep the bound on the relative error of our models comparable.

In [None]:
# summary statistics of the PCA-approach training set:
# datasets[2] is the PCA entry, [0] its [train, test] pair, [0] the training half
datasets[2][0][0].describe()

In [None]:
# function used to evaluate different SVM models
def evalModel(model, model_name, dataset, verbose = False):
    """
    Fit ``model`` on one dataset and score it on that dataset's test split.

    Labels are taken from the notebook-level ``y_train`` / ``y_test`` arrays.

    model      : estimator exposing ``fit`` and ``predict``
    model_name : label stored in the result and shown in the report
    dataset    : ``[[X_train, X_test], description]`` entry, as in ``datasets``
    verbose    : when True, print the metrics

    Returns ``[model_name, accuracy, recall, precision, roc_auc, f1]``.
    """
    # every model is evaluated against the same test dataset
    X_train, X_test = dataset[0][0], dataset[0][1]

    # fit model
    model.fit(X_train, y_train)

    # Rounding turns regressor outputs into hard labels. Clipping keeps rounded
    # values such as -1 or 2 inside {0, 1}, so the positive-class metrics below
    # stay defined; classifier outputs are unchanged.
    # NOTE(review): assumes the labels are encoded as 0/1 -- confirm.
    predicted = np.clip(model.predict(X_test).round(), 0, 1)

    # metrics used here are: Accuracy, Recall, Precision, ROC/AUC and F1.
    # Recall/precision/F1 use sklearn's default binary averaging (positive
    # class only), matching evalMetrics. With average='micro' on a two-class
    # problem all three collapse to the accuracy and carry no information.
    accuracy = metrics.accuracy_score(y_test, predicted)
    recall = metrics.recall_score(y_test, predicted)
    precision = metrics.precision_score(y_test, predicted)
    # NOTE(review): computed from hard labels, not decision scores
    roc_auc = metrics.roc_auc_score(y_test, predicted)
    f1 = metrics.f1_score(y_test, predicted)

    if verbose:
        # str() works for both NumPy scalars and plain Python floats,
        # whereas .astype(str) only exists on NumPy scalars
        print("Metrics for model name: " + model_name + " on " + str(dataset[1]))
        print("Accuracy score: " + str(accuracy))
        print("Recall score: " + str(recall))
        print("Precision_score: " + str(precision))
        print("ROC/AUC score: " + str(roc_auc))
        print("F1 score: " + str(f1))
        print("\n")

    # return data for sorting later
    return [model_name, accuracy, recall, precision, roc_auc, f1]


def evalMetrics(prediction, y_test):
    """
    Print accuracy, recall, precision, ROC/AUC and F1 of ``prediction``.

    prediction : predicted labels
    y_test     : true labels the prediction is compared against

    Returns None; the metrics are only printed.
    """
    scores = [
        ("Accuracy score: ", metrics.accuracy_score(y_test, prediction)),
        ("Recall score: ", metrics.recall_score(y_test, prediction)),
        ("Precision_score: ", metrics.precision_score(y_test, prediction)),
        ("ROC/AUC score: ", metrics.roc_auc_score(y_test, prediction)),
        ("F1 score: ", metrics.f1_score(y_test, prediction)),
    ]

    for label, value in scores:
        # str() works for both NumPy scalars and plain Python floats,
        # whereas .astype(str) only exists on NumPy scalars
        print(label + str(value))
    print("\n")
    return


In [None]:
# here we have our "baseline" SVM, as defined by sklearn.
# let's get its performance on each dataset to compare it to our other parameters:


In [None]:
# fit and report a default-parameter SVC on each of the datasets
for dataset in datasets:
    baseline_SVM = svm.SVC()
    evalModel(baseline_SVM, 'baseline_SVM', dataset, verbose=True)

In [None]:
## training classifiers
# one polynomial-kernel SVC per feature set (fit() returns the fitted estimator)
SVM_WA = svm.SVC(kernel='poly', degree=1, C=10).fit(X_train_WA, y_train)
SVM_HA = svm.SVC(kernel='poly', degree=1, C=10).fit(X_train_HA, y_train)
SVM_PA = svm.SVC(kernel='poly', degree=3, C=10).fit(X_train_PA, y_train)

# training-set predictions of each sub-classifier
y_train_WA, y_train_HA, y_train_PA = (
    SVM_WA.predict(X_train_WA),
    SVM_HA.predict(X_train_HA),
    SVM_PA.predict(X_train_PA),
)
# majority vote: the rounded mean of the three votes
y_train_predicted = np.round((y_train_WA + y_train_HA + y_train_PA) / 3)
evalMetrics(y_train_predicted, y_train)

# test-set predictions of each sub-classifier
y_test_WA, y_test_HA, y_test_PA = (
    SVM_WA.predict(X_test_WA),
    SVM_HA.predict(X_test_HA),
    SVM_PA.predict(X_test_PA),
)

#hard voting classifier consisting of the three subclassifiers
y_predicted = np.round((y_test_WA + y_test_HA + y_test_PA) / 3)
evalMetrics(y_predicted, y_test)

In [None]:
# (re-)import sklearn's metrics module so this cell can run on its own
from sklearn import metrics
# report the ensemble's test-set metrics; `predicted` is kept as a notebook-level alias
predicted = y_predicted 
evalMetrics(predicted, y_test)

In [None]:
# training-set predictions of the three fitted sub-classifiers
y_train_WA, y_train_HA, y_train_PA = (
    SVM_WA.predict(X_train_WA),
    SVM_HA.predict(X_train_HA),
    SVM_PA.predict(X_train_PA),
)

# majority vote: the rounded mean of the three votes
y_train_predicted = np.round((y_train_WA + y_train_HA + y_train_PA) / 3)
evalMetrics(y_train_predicted, y_train)

In [None]:
# report the ensemble's training-set metrics
predicted = y_train_predicted
evalMetrics(predicted,y_train)

In [None]:
# majority vote of the three sub-classifiers on the test set (rounded mean of the votes)
y_predicted = ((y_test_WA+y_test_HA+y_test_PA)/3).round()
# brings accuracy_score into the notebook namespace; it is not used in this cell
from sklearn.metrics import accuracy_score
evalMetrics(y_predicted, y_test)

In [None]:
# training-set metrics of the PCA-approach sub-classifier alone
evalMetrics(y_train_PA, y_train)

In [None]:
# sklearn metrics expect (y_true, y_pred / y_score) in that order. ROC/AUC is
# not symmetric in its arguments, so passing the predictions first reported
# the wrong value.
print(metrics.roc_auc_score(y_train, y_train_predicted))
print(metrics.accuracy_score(y_train, y_train_predicted))



In [None]:
# test-set predictions of each sub-classifier
y_test_WA = SVM_WA.predict(X_test_WA)
y_test_HA = SVM_HA.predict(X_test_HA)
y_test_PA = SVM_PA.predict(X_test_PA)

# Use the `metrics` module imported with the preliminaries instead of a bare
# accuracy_score that only exists if a different cell ran first. Arguments are
# in sklearn's (y_true, y_pred) order.
print(metrics.accuracy_score(y_test, y_test_WA))
print(metrics.accuracy_score(y_test, y_test_HA))
print(metrics.accuracy_score(y_test, y_test_PA))

#### Comparing the different kernels

In [None]:
# the baseline SVM uses a radial basis function kernel, so here we try the
# other kernels offered by the algorithm

# a linear kernel is skipped: our data isn't linearly separable, so it isn't
# feasible and therefore not worth testing

# polynomial kernel (default degree = 3)
for dataset in datasets:
    poly_SVM = svm.SVC(kernel="poly", degree=3)
    evalModel(poly_SVM, "Polynomial SVM", dataset, verbose=True)


# sigmoid kernel
for dataset in datasets:
    sig_SVM = svm.SVC(kernel="sigmoid")
    evalModel(sig_SVM, "Sigmoid SVM", dataset, verbose=True)

Observation: 

The polynomial kernel SVM seems to perform better than the baseline radial basis function one.

The sigmoid SVM, on the other hand, seems to perform worse than the two others in every metric.

# Degrees for polynomial kernel
The polynomial kernel seemed to perform well, so let's see if we can further increase the performance of the model by fine-tuning the degree of the polynomial kernel.

In [None]:
# our original testing with a polynomial kernel had degree 3
# choosing and testing this is crucial, as too small a degree will lead to underfitting and too big a degree will lead to overfitting

# here, we will test different degrees of polynomial kernels, from 2 to 4.
# this range has been selected as 2 is the minimum for a polynomial function, and with a degree above 4,
# not only do training times get too long, but we risk overfitting

# store the metrics of our test for sorting and concluding.
# One flat list of result rows is kept (not one sub-list per dataset) so that
# row[1] really is the accuracy when the rows are sorted and printed later.
history_result_polynomial_kernel = []

# loop to test our polynomial kernel with different degrees
for dataset in datasets:
    for k in range(2, 5):
        poly_SVM = svm.SVC(kernel="poly", degree=k)
        result = evalModel(poly_SVM, "Polynomial SVM with degree:" + str(k), dataset, True)
        # tag the row with its dataset so rows stay distinguishable once flattened
        result[0] = result[0] + " on " + str(dataset[1])
        history_result_polynomial_kernel.append(result)


# helper function to sort our results by accuracy
def Sort(array):
    """Sort ``array`` in place, largest second element first, and return the same list."""
    def second_item(row):
        # each row is [name, accuracy, ...]; the accuracy sits at index 1
        return row[1]

    array.sort(reverse=True, key=second_item)
    return array

# sort our results
sorted_results = Sort(history_result_polynomial_kernel)

# print our degrees in sorted order
print("Sorted order of polynomial SVMs by accuracy")
_metric_labels = ["Accuracy score: ", "Recall score: ", "Precision_score: ",
                  "ROC/AUC score: ", "F1 score: "]
for result in sorted_results:
    print(result[0])
    # str() works for both NumPy scalars and plain Python floats,
    # whereas .astype(str) only exists on NumPy scalars
    for _label, _value in zip(_metric_labels, result[1:6]):
        print(_label + str(_value))

    print("\n")



Observation: 

Increasing the degree of the polynomial kernel seems to improve the metrics of this type of model.

However, we know that increasing the degree of the polynomial kernel makes our model more prone to overfitting, and this should be taken into account if we were to move forward with this model.

In [None]:
# the default kernel coefficient (gamma) for our SVM is "scale" (1 / (n_features * X.var()))
# here, we are testing "auto", which uses 1 / n_features

# gamma is the kernel coefficient for the rbf, poly and sigmoid kernels
# the first tests used gamma = "scale"; here we test gamma = "auto"

# iterate over `datasets` (the list of all datasets), not over `dataset`,
# the variable left over from a previous loop
for dataset in datasets:
    # baseline SVM
    auto_baseline_SVM = svm.SVC(gamma="auto")
    evalModel(auto_baseline_SVM, "Baseline SVM with auto gamma", dataset, True)

    # sigmoid kernel SVM
    auto_sig_SVM = svm.SVC(kernel="sigmoid", gamma="auto")
    evalModel(auto_sig_SVM, "sigmoid SVM with auto gamma", dataset, True)

# "auto" gamma and a polynomial kernel is nearly impossible to train
# auto_poly_SVM = svm.SVC(kernel="poly",gamma = "auto")
# auto_poly_SVM.fit(X_train, y_train)
# evalModel(auto_poly_SVM,"poly SVM  with auto gamma ", True)


Observation:

The results with the radial basis function are incredible in every metric. 

We could further hypothesise that this method would work even better with a polynomial kernel of degree 4, however the training times are too long to consider this. 

On the other hand, this method seems to perform very poorly with a sigmoid kernel.

# Shrinking parameter
This parameter is used to shorten the training time by solving the optimization problem a bit more loosely.

We are testing it to see whether it impacts the performance of our models significantly.

In [None]:
# test shrinking parameter, default = true

# evalModel expects (model, model_name, dataset, verbose) and fits the model
# itself. The earlier calls passed True in the `dataset` position, so each
# model is now evaluated on every dataset explicitly.
for dataset in datasets:
    # baseline rbf kernel SVM
    non_shrink_baseline_SVM = svm.SVC(shrinking=False)
    evalModel(non_shrink_baseline_SVM, "Baseline SVM", dataset, True)

    # polynomial kernel SVM
    non_shrink_poly_SVM = svm.SVC(kernel="poly", shrinking=False)
    evalModel(non_shrink_poly_SVM, "poly SVM", dataset, True)

    # sigmoid kernel SVM
    non_shrink_sig_SVM = svm.SVC(kernel="sigmoid", shrinking=False)
    evalModel(non_shrink_sig_SVM, "sigmoid SVM", dataset, True)

Observation:

The shrinking parameter doesn't change the performance of the models significantly.

As this parameter has no significant impact on performance, we will keep it at its default (True) for our final model.

# Regularization parameter
The regularization parameter is very important to avoid overfitting the model to our dataset. The strength of the regularization is inversely proportional to C, and C must be strictly positive. The penalty is a squared l2 penalty.

We will test different values for this parameter and see its impact on the model performance.

In [None]:
# all our previous models had the default regularization parameter C = 1.
# as C must be strictly positive, we test our baseline model with C = 1 ... 7
# and expect the accuracy to stop increasing after a point.

# store the metrics of our test for sorting and concluding
history_result_reg_param = []

# evalModel needs the dataset to train and score on (it fits the model itself);
# the earlier call omitted it, so every C value is now evaluated on each dataset
for dataset in datasets:
    for n in range(1, 8):
        SVM = svm.SVC(C=n)
        result = evalModel(SVM, "baseline SVM with regularization parameter C = " + str(n), dataset)
        # tag the row with its dataset so rows stay distinguishable once sorted
        result[0] = result[0] + " on " + str(dataset[1])
        history_result_reg_param.append(result)


# sort our results
sorted_results_reg = Sort(history_result_reg_param)

# print our C values in sorted order
print("Sorted order of baseline SVMs by accuracy")
for result in sorted_results_reg:
    print(result[0])
    # str() works for both NumPy scalars and plain Python floats,
    # whereas .astype(str) only exists on NumPy scalars
    print("Accuracy score: " + str(result[1]))
    print("Recall score: " + str(result[2]))
    print("Precision_score: " + str(result[3]))
    print("ROC/AUC score: " + str(result[4]))
    print("F1 score: " + str(result[5]))

    print("\n")



Observation: 

As expected, the metrics for the model improve as C is increased, but only up to a point: increasing C further decreases the quality of the model. Since the strength of the regularization is inversely proportional to C, this tells us that beyond that point the regularization has become too weak.




# Conclusion

Let's take our parameters and evaluate them in the context of the broader SVM model:

* The type of kernel:
  * Best performance by polynomial
  * Decent by baseline
  * Below average for sigmoid

* The degree for polynomial kernels:
  * Best performance by polynomial
  * Decent by baseline
  * Below average for sigmoid

* The gamma kernel coefficient:
  * Best performance by polynomial
  * Decent by baseline
  * Below average for sigmoid

* The shrinking heuristic:
  * Best performance by polynomial
  * Decent by baseline
  * Below average for sigmoid

* The strength of the regularization parameter:
  * Best performance by polynomial
  * Decent by baseline
  * Below average for sigmoid


In [None]:
# evalModel expects (model, model_name, dataset, verbose) and fits the model
# itself. The earlier calls passed True in the `dataset` position, so both
# candidates are now evaluated on every dataset explicitly.
for dataset in datasets:
    # baseline SVM with "auto" gamma
    auto_baseline_SVM = svm.SVC(gamma="auto")
    evalModel(auto_baseline_SVM, "Baseline SVM with auto gamma", dataset, True)

    # baseline SVM with "auto" gamma and regularization parameter C = 4
    auto_baseline_SVM = svm.SVC(gamma="auto", C=4)
    evalModel(auto_baseline_SVM, "Baseline SVM with auto gamma and reg param 4", dataset, True)


### Additional models

In [None]:
import xgboost
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


#initializing models
xgb_reg = xgboost.XGBRegressor()
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=20, learning_rate=1)
rnd_clf = RandomForestClassifier()
log_clf = LogisticRegression(max_iter=10000)

# (estimator, report label) pairs, in the order they are evaluated
_extra_models = [
    (xgb_reg, 'xgboost'),
    (ada_clf, 'AdaBoostClassifier'),
    (gbrt, 'GradientBoostingRegressor'),
    (rnd_clf, 'RandomForestClassifier'),
    (log_clf, 'LogisticRegression'),
]

# evaluate every additional model on every dataset
for dataset in datasets:
    for _model, _label in _extra_models:
        evalModel(_model, _label, dataset, True)
