In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix, SCORERS, classification_report
import numpy as np

In [None]:
# Load the engineered feature table and discard the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved — confirm).
featuresCD = pd.read_csv(
    "/content/drive/MyDrive/NYC DSA Capstone Project/Data/featuresCD.csv",
    low_memory=False,
).drop(columns=["Unnamed: 0"])
featuresCD

In [None]:
# Inspect the column names of the loaded feature table.
featuresCD.columns

In [None]:
# Positional split of the table: the first column is used as the label,
# every remaining column as a predictor.
target, data = featuresCD.iloc[:, 0], featuresCD.iloc[:, 1:]

In [None]:
# Display the predictor frame (every column of featuresCD except the first).
data

In [None]:
# Display the label series (the first column of featuresCD).
target

In [None]:
# Stratified 70/30 split of the full table. random_state pins the shuffle so the
# scores and confusion matrices computed from this split can be reproduced after
# a kernel restart (the tree fitted on it is already seeded with random_state = 0).
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.3, stratify = target, shuffle = True, random_state = 0)

In [None]:
%%time

rule = "-" * 60
print(rule)

# Entropy-criterion decision tree with class 0 weighted at 2e-8 relative to class 1.
# TODO: a min_samples_* setting was sketched in the original cell but never configured.
tree_model = tree.DecisionTreeClassifier(
    criterion="entropy",
    class_weight={0: 2e-8, 1: 1},
    random_state=0,
)
tree_model.fit(X_train, y_train)

# Mean accuracy on each split.
print(f"Training score: {tree_model.score(X_train, y_train)}")
print(f"Test score:     {tree_model.score(X_test, y_test)}")
print(rule)

# One confusion matrix per split, each followed by a separator line.
for split_title, split_X, split_y in (
    ("Training Confusion Matrix", X_train, y_train),
    ("Testing Confusion Matrix", X_test, y_test),
):
    print(split_title)
    print(confusion_matrix(y_true=split_y, y_pred=tree_model.predict(split_X)))
    print(rule)

In [None]:
# Rank the tree's features by importance.
# The names must come from `data` (the frame the tree was trained on), not from
# `featuresCD`: featuresCD still carries the label in column 0, so pairing its
# columns with feature_importances_ shifts every name by one position and lists
# the label itself as a feature. Building the frame from a dict also raises if
# the two lengths ever disagree instead of silently truncating like zip().
pd.DataFrame(
    {"feature": data.columns, "importance": tree_model.feature_importances_}
).sort_values(by = "importance", ascending = False)

Revisit gradient boosting: grid-search a `GradientBoostingClassifier` on a 1% stratified subsample of the data, scoring on recall.

In [None]:
# Stratified subsample: 1% of the rows for training and another 1% for testing.
# random_state pins the draw so the grid-search results below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.01, test_size = 0.01, stratify = target, shuffle = True, random_state = 0)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.model_selection as ms

In [None]:
# Search space for the gradient-boosting grid search: only max_depth varies,
# the other hyper-parameters are held at a single value each.
param_grid = dict(
    learning_rate=[0.01],
    max_depth=[5, 10, 15],
    subsample=[0.9],
    n_estimators=[300],
)

In [None]:
%%time
# Seed the booster: the grid uses subsample = 0.9, so every boosting stage draws
# a random subset of rows and an unseeded model gives different scores on each run.
gbm = GradientBoostingClassifier(random_state = 0)

# 3-fold cross-validated search over param_grid, selecting on recall.
grid_search_gbm = ms.GridSearchCV(gbm, param_grid, scoring = 'recall', cv = 3, n_jobs = -1)
grid_search_gbm.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the gradient-boosting grid search.
grid_search_gbm.best_params_

In [None]:
# Cross-validated score of the selected combination (the search was scored on recall).
grid_search_gbm.best_score_

In [None]:
# Confusion matrices of the best gradient-boosting model on both splits,
# each block framed by separator lines.
rule = "-" * 60
print(rule)

for split_title, split_X, split_y in (
    ("Training Confusion Matrix", X_train, y_train),
    ("Testing Confusion Matrix", X_test, y_test),
):
    print(split_title)
    print(confusion_matrix(y_true=split_y,
                           y_pred=grid_search_gbm.best_estimator_.predict(split_X)))
    print(rule)

In [None]:
# Full per-candidate cross-validation results of the gradient-boosting search.
grid_search_gbm.cv_results_

In [None]:
# Can't use class_weight for Gradient Boosting
# Weight for class 1 = (number of class-0 rows) / (number of class-1 rows) over
# the whole table; class 0 keeps weight 1.
# NOTE(review): no later cell visible here reads this dict — confirm it is still needed.
class_weight = {
    0: 1,
    1: int(featuresCD["loan_status_trim"].eq(0).sum())
       / int(featuresCD["loan_status_trim"].eq(1).sum()),
}

In [None]:
from sklearn.svm import SVC

In [None]:
%%time 

# Stratified 1% / 1% train/test subsample. random_state pins the draw so the
# numbers printed below are reproducible and comparable with the other SVM cells.
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.01, test_size = 0.01, stratify = target, shuffle = True, random_state = 0)

svm = SVC()

# Sweep the relative class weighting: class 0 gets x, class 1 gets 3 - x.
# NOTE(review): the lower end is nudged off zero (2e-16) but the upper end is
# exactly 3, so the last candidate gives class 1 a weight of exactly 0 — confirm
# that this degenerate grid point is intended.
weights = np.linspace(2e-16, 3, 11)

param_grid_svm = {
    "kernel": ["sigmoid"],
    "class_weight" : [{0:x, 1:3-x} for x in weights]
    }

# 2-fold cross-validated search, selecting on macro-averaged F1.
grid_search_svm = ms.GridSearchCV(svm, 
                                  param_grid_svm, 
                                  scoring = "f1_macro", 
                                  cv = 2, 
                                  n_jobs = -1)

grid_search_svm.fit(X_train, y_train)

# Predict once per split and reuse the result; the test predictions feed both
# the confusion matrix and the classification report.
best_svm = grid_search_svm.best_estimator_
train_pred = best_svm.predict(X_train)
test_pred = best_svm.predict(X_test)

print(best_svm.score(X_train, y_train))
print(best_svm.score(X_test, y_test))
print(confusion_matrix(y_true = y_train, y_pred = train_pred))
print(confusion_matrix(y_true = y_test, y_pred = test_pred))
print(classification_report(y_test, y_pred = test_pred))
print(grid_search_svm.best_params_)

In [None]:
%%time 

# Stratified 1% / 1% train/test subsample. random_state pins the draw so the
# numbers printed below are reproducible and comparable with the other SVM cells.
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.01, test_size = 0.01, stratify = target, shuffle = True, random_state = 0)

# Sigmoid-kernel SVC with fixed class weights and C.
svm = SVC(class_weight = {0:1.2, 1:1.8}, kernel = "sigmoid", C = 33)

# Single-candidate grid: the search is used here only to obtain 2-fold
# cross-validated f1_macro scores for this one configuration.
param_grid_svm = {
    "C":[33]
    }

grid_search_svm = ms.GridSearchCV(svm, 
                                  param_grid_svm, 
                                  scoring = "f1_macro", 
                                  cv = 2, 
                                  n_jobs = -1)

grid_search_svm.fit(X_train, y_train)

# Predict once per split and reuse the result; the test predictions feed both
# the confusion matrix and the classification report.
best_svm = grid_search_svm.best_estimator_
train_pred = best_svm.predict(X_train)
test_pred = best_svm.predict(X_test)

print(best_svm.score(X_train, y_train))
print(best_svm.score(X_test, y_test))
print(confusion_matrix(y_true = y_train, y_pred = train_pred))
print(confusion_matrix(y_true = y_test, y_pred = test_pred))
print(classification_report(y_test, y_pred = test_pred))
print(grid_search_svm.best_params_)
print(grid_search_svm.cv_results_)

In [None]:
import imblearn
from imblearn.over_sampling import SVMSMOTE
from imblearn.pipeline import make_pipeline

In [None]:
%%time 

# Stratified 1% / 1% train/test subsample. random_state pins the draw so the
# numbers printed below are reproducible and comparable with the other SVM cells.
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size = 0.01, test_size = 0.01, stratify = target, shuffle = True, random_state = 0)

# Sigmoid-kernel SVC without class weights (presumably because the over-sampler
# rebalances the classes instead — confirm).
# Options tried earlier and left disabled: class_weight = {0:1.2, 1:1.8},
# tol = 2e-16, shrinking = False, break_ties = True.
svm = SVC(kernel = "sigmoid", 
          C = 33)

# Over-sample with SVM-SMOTE, then fit the SVC on the resampled data.
# random_state seeds the synthetic-sample generation so the run is reproducible.
# NOTE(review): the target counts (4369 per class) are hard-coded; over-samplers
# are expected to reject a target below a class's existing count, so these
# presumably depend on the size of the sampled training folds — confirm.
pipe = make_pipeline(
    SVMSMOTE(svm_estimator = svm,
             sampling_strategy = {0:4369, 1:4369},
             n_jobs = -1,
             m_neighbors = 1,
             k_neighbors = 2,
             random_state = 0),
    svm
)

# Empty grid: a single candidate (the pipeline as configured), so the search is
# used only for its 2-fold cross-validated f1_macro score.
param_grid_svm = {
    
    }

pipe_grid_search_svm = ms.GridSearchCV(pipe, 
                                  param_grid_svm, 
                                  scoring = "f1_macro", 
                                  cv = 2, 
                                  n_jobs = -1)

pipe_grid_search_svm.fit(X_train, y_train)

# Predict once per split and reuse the result; the test predictions feed both
# the confusion matrix and the classification report.
best_pipe = pipe_grid_search_svm.best_estimator_
train_pred = best_pipe.predict(X_train)
test_pred = best_pipe.predict(X_test)

print(best_pipe.score(X_train, y_train))
print(best_pipe.score(X_test, y_test))
print(confusion_matrix(y_true = y_train, y_pred = train_pred))
print(confusion_matrix(y_true = y_test, y_pred = test_pred))
print(classification_report(y_test, y_pred = test_pred))
#print(pipe_grid_search_svm.best_params_)
print(pipe_grid_search_svm.cv_results_)

In [None]:
# Record the installed imbalanced-learn version alongside the results.
print(f"Imbalanced-Learn {imblearn.__version__}")

In [None]:
# Alphabetical list of the scorer names accepted by `scoring=`.
# NOTE(review): SCORERS appears to have been removed from newer scikit-learn
# releases in favour of get_scorer_names() — confirm against the pinned version.
sorted(SCORERS)

In [None]:
# List the parameter names exposed by the SVM grid-search object.
grid_search_svm.get_params().keys()

In [None]:
# Exploration aid: list every attribute available on the SVM grid-search object.
dir(grid_search_svm)

In [None]:
# Per-candidate cross-validation results of whichever SVM search was last bound
# to grid_search_svm (the name is assigned in two separate cells above).
grid_search_svm.cv_results_