In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
import pandas as pd
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold

In [3]:
new_train = pd.read_csv("../new_train.csv")
new_test = pd.read_csv("../new_test.csv")

In [4]:
new_train.drop(new_train.filter(regex="Unname"),axis=1, inplace=True)
new_test.drop(new_test.filter(regex="Unname"),axis=1, inplace=True)

In [5]:
X = new_train.drop(columns=['bidder_id', 'payment_account', 'address', 'outcome','merchandise'])
y = new_train['outcome']

In [6]:
len(X.columns)

52

In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.29839867, -0.34020786, -0.13219046, ..., -0.36967066,
        -0.37052414,  2.16467791],
       [-0.41452291, -0.40452956, -0.13438925, ..., -0.36967066,
        -0.37052414,  0.73029739],
       [-0.39403039, -0.40452956, -0.13428454, ..., -0.36967066,
        -0.37052414,  0.73029739],
       ...,
       [-0.41452291, -0.40452956, -0.13449395, ..., -0.36967066,
        -0.37052414, -0.70408314],
       [-0.41452291, -0.4098897 , -0.13459866, ..., -0.36967066,
        -0.37052414, -0.70408314],
       [-0.41452291, -0.4098897 , -0.13449395, ..., -0.36967066,
        -0.37052414, -0.70408314]])

In [8]:
sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.7, random_state=42)
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

In [9]:
from sklearn.model_selection import RandomizedSearchCV

#criterion for splitting
criterion = ['gini', 'entropy']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1,2,5,8]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,11,21,31]
# Number of features to consider at every split
max_features = [5,10,15,25]
#Node splitting criteria
min_impurity_decrease = [0.00005,0.0005,0.005,0.05]
# Create the random grid
random_grid = {'criterion': criterion,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf, 
               'min_impurity_decrease': min_impurity_decrease}
# params = np.random.seed(0)

In [10]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict_proba(test_features)
    pred = model.predict(test_features)
    
    accuracy = accuracy_score(test_labels, pred)
    auc_roc_score = roc_auc_score(test_labels,predictions[:,1])
    
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('AUC ROC = {:0.2f}%.'.format(auc_roc_score))
    
    return accuracy, auc_roc_score


In [11]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
dt = DecisionTreeClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation, 
# search across 1000 different combinations, and use all available cores
kfold = StratifiedKFold(n_splits=10, shuffle = False)
np.random.seed(0)
dt_random = RandomizedSearchCV(estimator = dt, param_distributions = random_grid, n_iter = 100, cv = kfold, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
dt_random.fit(X_train_oversampled,y_train_oversampled)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


310 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
310 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 937, in fit
    super().fit(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in 

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
                   estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': [5, 10, 15, 25],
                                        'min_impurity_decrease': [5e-05, 0.0005,
                                                                  0.005, 0.05],
                                        'min_samples_leaf': [1, 11, 21, 31],
                                        'min_samples_split': [1, 2, 5, 8]},
                   random_state=42, verbose=2)

In [12]:
dt_random.best_params_

{'min_samples_split': 8,
 'min_samples_leaf': 1,
 'min_impurity_decrease': 0.0005,
 'max_features': 25,
 'max_depth': 50,
 'criterion': 'gini'}

In [13]:
base_model = DecisionTreeClassifier(random_state = 42)
base_model.fit(X_train_oversampled, y_train_oversampled)
base_accuracy = evaluate(base_model, X_test, y_test)

best_dt = dt_random.best_estimator_
random_accuracy = evaluate(best_dt, X_test, y_test)

Model Performance
Accuracy = 0.92%.
AUC ROC = 0.68%.
Model Performance
Accuracy = 0.94%.
AUC ROC = 0.70%.


In [30]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {'criterion': ['gini'],
              'max_depth': [47, 50, 53],
              'max_features': [23, 25, 27],
              'min_impurity_decrease': [0.001, 0.004, 0.007],
              'min_samples_leaf': [1, 2, 3],
              'min_samples_split': [6, 8, 10]
}
dt = DecisionTreeClassifier()
# Instantiate the grid search model
kfold1 = StratifiedKFold(n_splits=10, shuffle = False)
grid_search = GridSearchCV(estimator = dt, param_grid = param_grid, cv = kfold1, n_jobs = -1, verbose = 2)

In [31]:
grid_search.fit(X_train_oversampled,y_train_oversampled)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [47, 50, 53],
                         'max_features': [23, 25, 27],
                         'min_impurity_decrease': [0.001, 0.004, 0.007],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [6, 8, 10]},
             verbose=2)

In [32]:
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 50,
 'max_features': 23,
 'min_impurity_decrease': 0.001,
 'min_samples_leaf': 1,
 'min_samples_split': 8}

In [33]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Model Performance
Accuracy = 0.91%.
AUC ROC = 0.71%.


In [35]:
data = {'Models': ['Base Decision Tree Model', 'Decision Tree with Random Search CV', 'Decision Tree with Grid Search'], 
        'Accuracy': [0.92, 0.94, 0.91], 'AUC ROC': [0.68, 0.70, 0.71]}
results_df = pd.DataFrame(data = data)

In [36]:
results_df

Unnamed: 0,Models,Accuracy,AUC ROC
0,Base Decision Tree Model,0.92,0.68
1,Decision Tree with Random Search CV,0.94,0.7
2,Decision Tree with Grid Search,0.91,0.71
