In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
test_data = pd.read_csv(os.path.join('../data', 'cleanLoanDataValidationAllIncome.csv'))
pd.set_option('display.max_columns', None)
test_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome
0,0,1,0,1,0,5720.0,0.0,110000.0,360.0,1,2,5720.0
1,0,1,1,1,0,3076.0,1500.0,126000.0,360.0,1,2,4576.0
2,0,1,2,1,0,5000.0,1800.0,208000.0,360.0,1,2,6800.0
3,0,1,2,1,0,2340.0,2546.0,100000.0,360.0,0,2,4886.0
4,0,0,0,0,0,3276.0,0.0,78000.0,360.0,1,2,3276.0
...,...,...,...,...,...,...,...,...,...,...,...,...
340,0,1,3,0,1,4009.0,1777.0,113000.0,360.0,1,2,5786.0
341,0,1,0,1,0,4158.0,709.0,115000.0,360.0,1,2,4867.0
342,0,0,0,1,0,3250.0,1993.0,126000.0,360.0,0,1,5243.0
343,0,1,0,1,0,5000.0,2393.0,158000.0,360.0,1,0,7393.0


In [3]:
train_data = pd.read_csv(os.path.join('../data', 'cleanLoanDataTrainAllIncome.csv'))
pd.set_option('display.max_columns', None)
train_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
0,0,1,1,1,0,4583.0,1508.0,128000.0,360.0,1,0,0,6091.0
1,0,1,0,1,1,3000.0,0.0,66000.0,360.0,1,2,1,3000.0
2,0,1,0,0,0,2583.0,2358.0,120000.0,360.0,1,2,1,4941.0
3,0,0,0,1,0,6000.0,0.0,141000.0,360.0,1,2,1,6000.0
4,0,1,2,1,1,5417.0,4196.0,267000.0,360.0,1,2,1,9613.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,1,0,0,1,0,2900.0,0.0,71000.0,360.0,1,0,1,2900.0
559,0,1,3,1,0,4106.0,0.0,40000.0,180.0,1,0,1,4106.0
560,0,1,1,1,0,8072.0,240.0,253000.0,360.0,1,2,1,8312.0
561,0,1,2,1,0,7583.0,0.0,187000.0,360.0,1,2,1,7583.0


## Grid Search

### 1a param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]}, random state 33 (Combined income)

In [None]:
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]
data = train_data.drop(["Loan_Status", "ApplicantIncome", "CoapplicantIncome"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=33)
model = SVC(kernel='linear', cache_size=1000)

In [None]:
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3, n_jobs=-1)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

In [None]:
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=["Denied", "Approved"]))

Run 1a (Combined incomes) summary

GridSearchCV(estimator=SVC(cache_size=1000, kernel='linear'), n_jobs=-1,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)
             
Best parameters: {'C': 1, 'gamma': 0.0001}

              precision    recall  f1-score   support

      Denied       0.00      0.00      0.00        40
    Approved       0.71      0.98      0.83       101

    accuracy                           0.70       141
   macro avg       0.36      0.49      0.41       141
weighted avg       0.51      0.70      0.59       141

### 1b param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]}, random state 33 (Applicant and Co-applicant income separate)

In [None]:
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]
data = train_data.drop(["Loan_Status", "TotalIncome"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=33)
model = SVC(kernel='linear', cache_size=1000)

In [None]:
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3, n_jobs=-1)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

In [None]:
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=["Denied", "Approved"]))

Run 1b (Applicant and Coapplicant incomes separate) summary
GridSearchCV(estimator=SVC(cache_size=1000, kernel='linear'), n_jobs=-1,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)
             
Best parameters: {'C': 10, 'gamma': 0.0001}

              precision    recall  f1-score   support

      Denied       0.00      0.00      0.00        40
    Approved       0.72      1.00      0.83       101

    accuracy                           0.72       141
   macro avg       0.36      0.50      0.42       141
weighted avg       0.51      0.72      0.60       141

### 2a param_grid={'C': [0.1, 1, 5], 'gamma': [0.00001, 0.0001, 0.001]}, random state 33 (Combined income)

In [None]:
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]
data = train_data.drop(["Loan_Status", "ApplicantIncome", "CoapplicantIncome"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=33)
model = SVC(kernel='linear', cache_size=1000)

In [None]:
param_grid = {'C': [0.1, 1, 5],
              'gamma': [0.00001, 0.0001, 0.001]}
grid = GridSearchCV(model, param_grid, verbose=3, n_jobs=-1)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

In [None]:
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=["Denied", "Approved"]))

Run 2a summary

GridSearchCV(estimator=SVC(cache_size=1000, kernel='linear'), n_jobs=-1,
             param_grid={'C': [0.1, 1, 5], 'gamma': [1e-05, 0.0001, 0.001]},
             verbose=3)
Best parameters: {'C': 1, 'gamma': 1e-05}

              precision    recall  f1-score   support

      Denied       0.00      0.00      0.00        40
    Approved       0.71      0.98      0.83       101

    accuracy                           0.70       141
   macro avg       0.36      0.49      0.41       141
weighted avg       0.51      0.70      0.59       141


### 2b param_grid={'C': [5, 10, 100], 'gamma': [0.0001, 0.001, 0.01]}, random state 33 (Applicant and Co-applicant income separate)

In [4]:
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]
data = train_data.drop(["Loan_Status", "TotalIncome"], axis=1)
feature_names = data.columns

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=33)
model = SVC(kernel='linear', cache_size=1000)

In [6]:
param_grid = {'C': [5, 10, 100],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3, n_jobs=-1)

In [None]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed: 10.4min remaining:  5.2min


In [None]:
print(grid.best_params_)

In [None]:
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=["Denied", "Approved"]))

Run 2b summary

### 3 (less dummies). From https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py: nested vs. non-nested cross-validation -  param_grid={'C': [1, 5, 10], 'gamma': [0.00001, 0.0001, 0.001]} . Non-nested slightly higher, scores a little higher than with data_more_dummies

In [None]:
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np

In [None]:
NUM_TRIALS = 30

In [None]:
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]

In [None]:
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID"], axis=1)
feature_names = data.columns

In [None]:
p_grid = {"C": [1, 5, 10],
          "gamma": [0.00001, 0.0001, 0.001]}

In [None]:
svm = SVC(kernel="rbf", cache_size=1000)

In [None]:
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores

In [None]:
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

Run 3 summary

NUM_TRIALS = 30
Average difference of 0.006944 with std. dev. of 0.006896.

### 4 (less dummies). Nested vs. non-nested - param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]}. Non-nested slightly higher, scores a little higher than with data_more_dummies

In [None]:
NUM_TRIALS = 30

In [None]:
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID"], axis=1)
feature_names = data.columns

In [None]:
p_grid = {"C": [1, 5, 10],
          "gamma": [0.0001, 0.001, 0.01]}

In [None]:
svm = SVC(kernel="rbf", cache_size=1000)

In [None]:
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores

In [None]:
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

Run 4 summary

NUM_TRIALS = 30
Average difference of 0.008819 with std. dev. of 0.004862.

### 5 (limit_dummies). Nested vs. non-nested - param_grid={'C': [1, 5, 10], 'gamma':  [0.00001, 0.0001, 0.001]}: As in #3, but with SVC(kernel="linear") rather than rbf -

In [None]:
# Data and parameters
NUM_TRIALS = 30
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID"], axis=1)
feature_names = data.columns
p_grid = {"C": [1, 5, 10],
          "gamma": [0.00001, 0.0001, 0.001]}
svm = SVC(kernel="linear", cache_size=1000)
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

In [None]:
print(grid.best_params_)

Run 5 summary

### 6 (limited dummies). As #4 but with scaling and larger cache size. Nested vs. non-nested - param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]}, kernel = rbf: No nested/non-nested difference

https://scikit-learn.org/stable/modules/svm.html#svm-classification Kernel cache size: For SVC, SVR, NuSVC and NuSVR, the size of the kernel cache has a strong impact on run times for larger problems. If you have enough RAM available, it is recommended to set cache_size to a higher value than the default of 200(MB), such as 500(MB) or 1000(MB). 

With 64 GB, I'd say I could go higher on the next run.

Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data. REDO AT LEAST FOR MODELS THAT RUN In REASONABLE TIME

In [None]:
NUM_TRIALS = 30

In [None]:
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID"], axis=1)
feature_names = data.columns

In [None]:
data_scaler = MinMaxScaler().fit(data)
data_scaled = data_scaler.transform(data)

In [None]:
p_grid = {"C": [1, 5, 10],
          "gamma": [0.0001, 0.001, 0.01]}

In [None]:
svm = SVC(kernel="rbf", cache_size=1000)

In [None]:
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data_scaled, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data_scaled, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores

In [None]:
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

#Run 6 summary
NUM_TRIALS = 30
Average difference of 0.000208 with std. dev. of 0.001122.

### 7 (limited dummies). As #6 (limited dummies) but kernel="linear" 

In [None]:
NUM_TRIALS = 30

In [None]:
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID"], axis=1)
feature_names = data.columns

In [None]:
data_scaler = MinMaxScaler().fit(data)
data_scaled = data_scaler.transform(data)

In [None]:
p_grid = {"C": [1, 5, 10],
          "gamma": [0.0001, 0.001, 0.01]}

In [None]:
svm = SVC(kernel="linear", cache_size=1000)

In [None]:
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data_scaled, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data_scaled, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores

In [None]:
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

#Run 7 summary
NUM_TRIALS = 30
Average difference of 0.000000 with std. dev. of 0.000000.
Guess nested vs. not doesn't matter for linear as opposed to rbf, or anyway not here?

### 8 (limited dummies). As #7 (limited dummies) but starting with data edits: take out Loan_Amount_Term column. Scores overall a bit lower than in 7. No nested/non-nested difference

In [None]:
NUM_TRIALS = 30

In [None]:
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID", 'Loan_Amount_Term'], axis=1)
feature_names = data.columns

In [None]:
data_scaler = MinMaxScaler().fit(data)
data_scaled = data_scaler.transform(data)

In [None]:
p_grid = {"C": [1, 5, 10],
          "gamma": [0.0001, 0.001, 0.01]}

In [None]:
svm = SVC(kernel="linear", cache_size=1000)

In [None]:
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data_scaled, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data_scaled, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores

In [None]:
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

#Run 8 summary: Scores overall are a bit lower than in 7. Where nested/non-nested differ, non-nested is slightly higher.
NUM_TRIALS = 30
Average difference of 0.000625 with std. dev. of 0.001096.

### 9 (limited dummies). As #8 (limited dummies) but further data edits: take out Loan_Amount_Term and LoanAmount columns, since these could be considered results. No nested/non-nested difference.

In [None]:
NUM_TRIALS = 30

In [None]:
target = train_data_limit_dummies["Loan_Status_Y"]
target_names = ["Denied", "Approved"]
data = train_data_limit_dummies.drop(["Loan_Status_Y", "Loan_ID", 'LoanAmount', 'Loan_Amount_Term'], axis=1)
feature_names = data.columns

In [None]:
data_scaler = MinMaxScaler().fit(data)
data_scaled = data_scaler.transform(data)

In [None]:
p_grid = {"C": [1, 5, 10],
          "gamma": [0.0001, 0.001, 0.01]}

In [None]:
svm = SVC(kernel="linear", cache_size=1000)

In [None]:
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

In [None]:
# Loop for each trial
for i in range(NUM_TRIALS):

    # Choose cross-validation techniques for the inner and outer loops,
    # independently of the dataset.
    # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv, verbose=2, n_jobs=-1)
    clf.fit(data_scaled, target)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=data_scaled, y=target, cv=outer_cv, verbose=2, n_jobs=-1)
    nested_scores[i] = nested_score.mean()

In [None]:
score_difference = non_nested_scores - nested_scores

In [None]:
print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))

In [None]:
# Plot scores on each trial for nested and non-nested CV
plt.figure()
plt.subplot(211)
non_nested_scores_line, = plt.plot(non_nested_scores, color='r')
nested_line, = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
plt.legend([non_nested_scores_line, nested_line],
           ["Non-Nested CV", "Nested CV"],
           bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested Cross Validation on train_data_limit_dummies",
          x=.5, y=1.1, fontsize="15")

In [None]:
# Plot bar chart of the difference.
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
           ["Non-Nested CV - Nested CV Score"],
           bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")

plt.show()

### Run 9 summary: 
NUM_TRIALS = 30
Scores overall a tiny bit lower than for models with more columns, no difference between nested and non-nested