In [1]:
from sklearn.ensemble import RandomForestClassifier

In [2]:
import pandas as pd
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold

In [3]:
new_train = pd.read_csv("../new_train.csv")
new_test = pd.read_csv("../new_test.csv")

In [4]:
new_train.drop(new_train.filter(regex="Unname"),axis=1, inplace=True)
new_test.drop(new_test.filter(regex="Unname"),axis=1, inplace=True)

In [5]:
X = new_train.drop(columns=['bidder_id', 'payment_account', 'address', 'outcome','merchandise'])
y = new_train['outcome']

In [6]:
len(X.columns)

52

In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.29839867, -0.34020786, -0.13219046, ..., -0.36967066,
        -0.37052414,  2.16467791],
       [-0.41452291, -0.40452956, -0.13438925, ..., -0.36967066,
        -0.37052414,  0.73029739],
       [-0.39403039, -0.40452956, -0.13428454, ..., -0.36967066,
        -0.37052414,  0.73029739],
       ...,
       [-0.41452291, -0.40452956, -0.13449395, ..., -0.36967066,
        -0.37052414, -0.70408314],
       [-0.41452291, -0.4098897 , -0.13459866, ..., -0.36967066,
        -0.37052414, -0.70408314],
       [-0.41452291, -0.4098897 , -0.13449395, ..., -0.36967066,
        -0.37052414, -0.70408314]])

In [None]:
selected_features = ['num_bids_per_device', 'percentage_of_auctions_above_threshold', 'time_to_bid', 'num_bids_per_ip', 'on_url_that_has_a_bot_mean', 'num_bids_per_country', 'auction', 'max_devices_per_auction', 'num_last_bids', 'mobile']

X = X[selected_features]
X_test = X_test[selected_features]

In [8]:
sm = SMOTE()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.7, random_state=42)
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt',5,10,25]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,8,10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [10]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict_proba(test_features)
    pred = model.predict(test_features)
    
    accuracy = accuracy_score(test_labels, pred)
    auc_roc_score = roc_auc_score(test_labels,predictions[:,1])
    
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('AUC ROC = {:0.2f}%.'.format(auc_roc_score))
    
    return accuracy, auc_roc_score


In [11]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_oversampled,y_train_oversampled)



Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 5, 10,
                                                         25],
                                        'min_samples_leaf': [1, 2, 4, 8, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [12]:
rf_random.best_params_

{'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 5,
 'max_depth': 30,
 'bootstrap': False}

In [13]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train_oversampled, y_train_oversampled)
base_accuracy = evaluate(base_model, X_test, y_test)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Accuracy = 0.94%.
AUC ROC = 0.88%.
Model Performance
Accuracy = 0.95%.
AUC ROC = 0.90%.


In [18]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [20,30, 40],
    'max_features': [5,8,10],
    'min_samples_leaf': [1,2,3, 4],
    'min_samples_split': [1,3, 5],
    'n_estimators': [1200,1300,1500,2000]
}

rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [19]:
grid_search.fit(X_train_oversampled,y_train_oversampled)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
[CV] END bootstrap=False, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.8s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   2.4s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   2.0s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.8s
[CV] END bootstrap=True, max_depth=None, max_features=5, min_samples_leaf=8, min_samples_split=10, n_estima

[CV] END bootstrap=False, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.9s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=1200; total time=   1.8s
[CV] END bootstrap=True, max_depth=60, max_features=25, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   5.9s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=90, max_features=5, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=90, max_features=5, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=90, max_featu

[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=40, max_features=25, min_samples_leaf=8, min_samples_split=10, n_estimators=1000; total time=   6.1s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=800; total time=   1.6s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   1.0s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200; total time=   2.3s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.5s
[CV] END bootstrap=True, max_depth=90, max_features=5, min_samples_leaf=10, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=None, m

[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=40, max_features=25, min_samples_leaf=8, min_samples_split=10, n_estimators=1000; total time=   5.5s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   2.0s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   1.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200; total time=   2.3s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=70, max_features=10, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=90, max

[CV] END bootstrap=False, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   2.9s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=1200; total time=   1.9s
[CV] END bootstrap=True, max_depth=60, max_features=25, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   6.1s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.4s
[CV] END bootstrap=True, max_depth=70, max_features=10, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=False, max_depth=None, max_features=5, min_samples_leaf=1, min_samples_split=5, n_estimators=1800; total time=   3.4s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=True, max_depth=20, max_f

[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   2.6s
[CV] END bootstrap=False, max_depth=110, max_features=5, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=110, max_features=5, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=110, max_features=5, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=60, max_features=25, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   5.3s
[CV] END bootstrap=True, max_depth=20, max_f

[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time=   2.5s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=8, min_samples_split=10, n_estimators=1200; total time=   2.0s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.9s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=800; total time=   1.6s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1200; total time=   2.3s
[CV] END bootstrap=True, max_depth=

[CV] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=10, min_samples_split=10, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, max_depth=40, max_features=25, min_samples_leaf=8, min_samples_split=10, n_estimators=1000; total time=   6.0s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=8, min_samples_split=10, n_estimators=800; total time=   1.6s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800; total time=   1.6s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.8s
[CV] END bootstrap=True, max_depth=None, max_features=5, min_samples_leaf=8, min_samples_split=10, n_estimators=1400; total time=   1.9s
[CV] END bootstrap=True, max_depth=40, max_features=10, min_samples_leaf=2, min_samples_split=5, n_estimators=1600; total time=   3.6s
[CV] END bootstrap=True, max_depth=30, 

[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=3, min_samples_split=10, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_estimators=200; total time=   0.5s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=40, max_feat

[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=3, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=3, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=3, n_estimators=400; total time=   1.0s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=50, max_fea

[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=500; total time=   1.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=200; total time=   0.5s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=40, max_feat

[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   0.5s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=3, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=40, max_fe

[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=5, min_samples_split=10, n_estimators=500; total time=   1.1s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.5s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.0s
[CV] END bootstrap=True, max_depth=50, max_feat

432 fits failed out of a total of 1296.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/.venv/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 441, in fit
    trees = Parallel(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/.venv/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/.v

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'max_depth': [20, 30, 40],
                         'max_features': [5, 8, 10],
                         'min_samples_leaf': [1, 2, 3, 4],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [1200, 1300, 1500, 2000]},
             verbose=2)

In [20]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 30,
 'max_features': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1300}

In [23]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)


Model Performance
Accuracy = 0.95%.
AUC ROC = 0.90%.


In [24]:
#np.random.seed(0)

In [None]:
# # best randomforestclassifier
# rf = RandomForestClassifier(bootstrap= False,
#  max_depth = 30,
#  max_features= 8,
#  min_samples_leaf = 1,
#  min_samples_split = 5,
#  n_estimators = 1300)