In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statistics
import sklearn.metrics as metrics
import tensorflow as tf
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import fbeta_score
from sklearn.feature_selection import RFECV
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from sklearn.ensemble import RandomForestClassifier
# from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the training and test tables from the CSV files in the parent directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../new_train.csv", "../new_test.csv")
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
# Drop every column whose name matches "Unname".
# NOTE(review): presumably these are the "Unnamed: N" index columns pandas
# creates when a CSV was saved together with its index - confirm.
# Rebinding (instead of inplace=True) and passing explicit column labels keeps
# the cell safe to re-run and avoids handing a whole DataFrame to `drop`.
train = train.drop(columns=train.filter(regex="Unname").columns)
test = test.drop(columns=test.filter(regex="Unname").columns)

In [None]:
# Columns excluded from the model inputs in both the train and the test table.
non_feature_columns = ['bidder_id', 'payment_account', 'address', 'merchandise']

# 'outcome' is the label: keep it as y and remove it from the training features.
y = train['outcome']
X = train.drop(columns=non_feature_columns + ['outcome'])
X_test_original = test.drop(columns=non_feature_columns)

### Scaling the training and test datasets

In [None]:
# Sanity check: (rows, columns) of the training feature frame.
X.shape

In [None]:
# Show the column names of the training feature frame.
X.columns

In [None]:
# Columns that get standardised with StandardScaler.
col_names = ['auction', 'device', 'time', 'country', 'ip', 'url', 'num_bids',
       'num_first_bids', 'num_last_bids', 'time_to_bid', 'inst_resp',
       'perc_inst_resp', 'num_bids_per_auction',
       'num_bids_per_device', 'num_bids_per_country', 'num_bids_per_ip',
       'on_ip_that_has_a_bot_mean', 'ip_entropy', 'url_entropy',
       'mean_country_per_auction', 'max_country_per_auction',
       'min_country_per_auction', 'std_country_per_auction',
       'mean_devices_per_auction', 'max_devices_per_auction',
       'min_devices_per_auction', 'std_devices_per_auction',
       'mean_ip_per_auction', 'max_ip_per_auction', 'min_ip_per_auction',
       'std_ip_per_auction', 'mean_url_per_auction', 'max_url_per_auction',
       'min_url_per_auction', 'std_url_per_auction',
       'total_no_of_participated_auctions', 'no_of_auction_exceeds_threshold',
       'percentage_of_auctions_above_threshold', 'total_no_of_bidded_category',
       'no_of_merchandise_exceeds_threshold',
       'percentage_of_merchandise_above_threshold',
       'on_url_that_has_a_bot_mean']

# Fit the scaler on the TRAINING features only, then apply that same fitted
# transform to both sets. Fitting a separate scaler on the test data would map
# train and test onto different scales, so a model trained on X would receive
# inconsistently scaled inputs at prediction time.
scaler = StandardScaler().fit(X[col_names].values)

train_features = scaler.transform(X[col_names].values)
X[col_names] = train_features

test_features = scaler.transform(X_test_original[col_names].values)
X_test_original[col_names] = test_features

In [None]:
# Subset of feature columns kept for modelling; the same subset (and column
# order) is applied to the training frame and to the test frame.
selected_features = [
    'num_bids_per_ip',
    'num_bids_per_auction',
    'num_bids_per_country',
    'mean_ip_per_auction',
    'perc_inst_resp',
    'time',
    'mean_url_per_auction',
    'inst_resp',
    'num_bids',
    'time_to_bid',
    'num_last_bids',
    'on_url_that_has_a_bot_mean',
    'device',
    'percentage_of_auctions_above_threshold',
    'num_bids_per_device',
    'ip_entropy',
    'ip',
    'max_country_per_auction',
    'max_ip_per_auction',
    'mean_devices_per_auction',
    'url_entropy',
    'std_ip_per_auction',
    'url',
    'max_devices_per_auction',
    'no_of_auction_exceeds_threshold',
]

X = X[selected_features]
X_test_original = X_test_original[selected_features]

### Helper Functions

In [None]:
def _report_split(model, X, y, split_name):
    """Print a metric report for one data split and return its headline metrics.

    Returns:
        list: [accuracy, ROC AUC, F-beta (beta=2)] for the given split.
    """
    print(split_name)
    probabilities = model.predict_proba(X)
    predictions = model.predict(X)

    accuracy = accuracy_score(y, predictions)
    # Column 1 of predict_proba is used as the positive-class score.
    auc_roc = roc_auc_score(y, probabilities[:, 1])
    fbeta = fbeta_score(y, predictions, average='binary', beta=2.0)

    print("Classification report")
    print(classification_report(y, predictions, digits=4))

    print("FBeta Score")
    print(fbeta)

    print('Model Performance')
    # The scores are fractions in [0, 1], so they are printed without a '%' sign.
    print('Accuracy = {:0.4f}.'.format(accuracy))
    print('AUC ROC = {:0.4f}.'.format(auc_roc))
    print("*" * 100)

    return [accuracy, auc_roc, fbeta]


def evaluate(model, X_train, y_train, X_test, y_test):
    """Report a fitted classifier's performance on a train and a test split.

    For each split this prints the classification report, the F-beta score
    (beta=2), accuracy and ROC AUC.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train, y_train: features and labels the model was trained on.
        X_test, y_test: held-out features and labels.

    Returns:
        list: [train_accuracy, train_auc_roc, train_fbeta,
               test_accuracy, test_auc_roc, test_fbeta].
    """
    train_metrics = _report_split(model, X_train, y_train, "TRAIN")
    test_metrics = _report_split(model, X_test, y_test, "TEST")
    return train_metrics + test_metrics

In [None]:
def process_results(arr):
    """Print the mean of each metric across a collection of per-fold results.

    Args:
        arr: iterable of sequences laid out as
            [train_accuracy, train_auc, train_fbeta,
             test_accuracy, test_auc, test_fbeta].

    Returns:
        None. The six averaged metrics are printed, one per line.
    """
    metric_labels = (
        "train accuracy",
        "train AUC",
        "train fbeta",
        "test accuracy",
        "test AUC",
        "test fbeta",
    )
    n_metrics = len(metric_labels)

    # Single pass over the input, reading the first six entries of every row.
    rows = [[row[idx] for idx in range(n_metrics)] for row in arr]

    # Average each metric column separately before anything is printed.
    averages = [
        np.array([row[idx] for row in rows]).mean()
        for idx in range(n_metrics)
    ]

    for label, value in zip(metric_labels, averages):
        print("final " + label + ": " + str(value))

### SKFold + SMOTE (Base Model)

In [None]:
def create_model(learn_rate=0.01, momentum=0):
    """Build and compile the binary-classification network.

    Architecture: two hidden Dense layers of 64 ReLU units followed by a
    single sigmoid output unit, trained with binary cross-entropy.

    Args:
        learn_rate: learning rate passed to the SGD optimizer.
        momentum: momentum passed to the SGD optimizer.

    Returns:
        A compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Build the optimizer from the arguments. With a hard-coded optimizer both
    # parameters would be silently ignored, and any hyper-parameter search over
    # learn_rate / momentum would compare identical models.
    optimizer = tf.keras.optimizers.SGD(learning_rate=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# 5-fold stratified cross-validation with a fixed shuffle seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Resampling pipeline: SMOTE over-sampling followed by random under-sampling.
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.4, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

ann_result = []
counter = 1

for train_index, test_index in skf.split(X, y):
    print(f"Fold number {counter}")
    counter += 1

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    # Only the training fold is resampled; the held-out fold is left untouched.
    X_train, y_train = pipeline.fit_resample(X_train, y_train)

    # ann
    print("ANN")
    ann = KerasClassifier(build_fn=create_model, verbose=0)
    ann.fit(X_train, y_train)

    fold_metrics = evaluate(ann, X_train, y_train, X_test, y_test)
    ann_result.append(fold_metrics)

In [None]:
# Average the per-fold metrics collected in ann_result and print them.
process_results(ann_result)

### SKFold + SMOTE (Random Search)

In [None]:
def random_search_ann(X, y):
    """Run a randomized hyper-parameter search for the Keras classifier.

    The data is split 70/30 into a training and a hold-out part, the training
    part is resampled (SMOTE, then random under-sampling), and 100 random
    parameter combinations are scored by ROC AUC with 5-fold stratified CV.
    The best estimator is then evaluated on the resampled training data and on
    the hold-out data, and the best parameters are printed.

    Args:
        X: feature frame.
        y: binary labels aligned with ``X``.

    Returns:
        None. Results are printed.
    """
    print("RANDOM SEARCH ANN EXPERIMENT")
    ann_random_grid = dict(
        learn_rate=[0.001, 0.01, 0.1, 0.2, 0.3],
        momentum=[0.0, 0.2, 0.4, 0.6, 0.8, 0.9],
        batch_size=[10, 20, 40, 60, 80, 100],
        epochs=[10, 50, 100],
    )

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    # Stratify the hold-out split so both parts keep the class ratio of y;
    # a purely random split of an imbalanced target can leave the hold-out
    # set with very few positive samples.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    over = SMOTE(sampling_strategy=0.2, random_state=42)
    under = RandomUnderSampler(sampling_strategy=0.4, random_state=42)
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    # NOTE(review): resampling happens before the cross-validated search, so
    # the CV validation folds contain synthetic samples and the CV scores are
    # likely optimistic. Moving the samplers into the searched pipeline would
    # avoid this - left unchanged here because it alters the reported
    # parameter names and the training-set evaluation.
    X_train, y_train = pipeline.fit_resample(X_train, y_train)

    ann = KerasClassifier(build_fn=create_model, verbose=0)
    ann_random = RandomizedSearchCV(
        estimator=ann,
        param_distributions=ann_random_grid,
        n_iter=100,
        cv=skf,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='roc_auc',
    )
    ann_random.fit(X_train, y_train)

    ann_best_random = ann_random.best_estimator_
    evaluate(ann_best_random, X_train, y_train, X_test, y_test)
    print("Best Random Search Param for ANN")
    print(ann_random.best_params_)

In [None]:
# Run the randomized search on the scaled, feature-selected training data.
random_search_ann(X, y)


*****************************************************************************************
## Best Random Search Param for ANN
{}

### SKFold + SMOTE (Grid Search)

In [None]:
def grid_search_ann(X, y):
    """Run an exhaustive grid search for the Keras classifier.

    The data is split 70/30 into a training and a hold-out part, the training
    part is resampled (SMOTE, then random under-sampling), and every parameter
    combination in the grid is scored by ROC AUC with 5-fold stratified CV.
    The best estimator is evaluated on the resampled training data and on the
    hold-out data, and the best parameters are printed.

    Args:
        X: feature frame.
        y: binary labels aligned with ``X``.

    Returns:
        The best estimator found by the grid search.
    """
    # 5 * 6 * 6 * 3 = 540 combinations, each fitted once per CV fold.
    param_grid = dict(
        learn_rate=[0.001, 0.01, 0.1, 0.2, 0.3],
        momentum=[0.0, 0.2, 0.4, 0.6, 0.8, 0.9],
        batch_size=[10, 20, 40, 60, 80, 100],
        epochs=[10, 50, 100],
    )

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    # Stratify the hold-out split so both parts keep the class ratio of y;
    # a purely random split of an imbalanced target can leave the hold-out
    # set with very few positive samples.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    over = SMOTE(sampling_strategy=0.2, random_state=42)
    under = RandomUnderSampler(sampling_strategy=0.4, random_state=42)
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    # NOTE(review): resampling happens before the cross-validated search, so
    # the CV validation folds contain synthetic samples and the CV scores are
    # likely optimistic. Moving the samplers into the searched pipeline would
    # avoid this - left unchanged here because it alters the reported
    # parameter names and the training-set evaluation.
    X_train, y_train = pipeline.fit_resample(X_train, y_train)

    ann = KerasClassifier(build_fn=create_model, verbose=0)
    # Separate names for the parameter dict and the search object (previously
    # one name was rebound from the dict to the GridSearchCV instance).
    grid_search = GridSearchCV(
        estimator=ann,
        param_grid=param_grid,
        n_jobs=-1,
        cv=skf,
        scoring='roc_auc',
    )
    grid_search.fit(X_train, y_train)

    ann_best_grid = grid_search.best_estimator_
    evaluate(ann_best_grid, X_train, y_train, X_test, y_test)
    print("Best Params Grid Search for ANN")
    print(grid_search.best_params_)
    return ann_best_grid

In [None]:
# Run the grid search and keep the best estimator it returns.
ann_best_grid = grid_search_ann(X, y)