In [64]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
import lightgbm

In [65]:

print(lightgbm)

<module 'lightgbm' from 'c:\\ProgramData\\miniconda3\\envs\\tf3.8\\lib\\site-packages\\lightgbm\\__init__.py'>


In [66]:
df = pd.read_csv('final_project.csv')


In [67]:
df.dtypes

x0     float64
x1     float64
x2     float64
x3     float64
x4     float64
x5     float64
x6     float64
x7     float64
x8     float64
x9     float64
x10    float64
x11    float64
x12    float64
x13    float64
x14    float64
x15    float64
x16    float64
x17    float64
x18    float64
x19    float64
x20    float64
x21    float64
x22    float64
x23    float64
x24     object
x25    float64
x26    float64
x27    float64
x28    float64
x29     object
x30     object
x31    float64
x32     object
x33    float64
x34    float64
x35    float64
x36    float64
x37     object
x38    float64
x39    float64
x40    float64
x41    float64
x42    float64
x43    float64
x44    float64
x45    float64
x46    float64
x47    float64
x48    float64
x49    float64
y        int64
dtype: object

Check data quality

In [68]:
count_na = df.isna().sum()
print(count_na)
class_counts = df['y'].value_counts()
print(class_counts)

x0     26
x1     25
x2     38
x3     37
x4     26
x5     37
x6     26
x7     27
x8     21
x9     30
x10    43
x11    30
x12    36
x13    31
x14    34
x15    35
x16    26
x17    27
x18    40
x19    35
x20    38
x21    29
x22    27
x23    47
x24    28
x25    22
x26    36
x27    30
x28    35
x29    30
x30    30
x31    39
x32    31
x33    41
x34    41
x35    30
x36    27
x37    23
x38    31
x39    23
x40    36
x41    40
x42    26
x43    37
x44    40
x45    29
x46    31
x47    37
x48    32
x49    32
y       0
dtype: int64
0    95803
1    64197
Name: y, dtype: int64


Handling the % in 'x32' by stripping the % and converting to float, then divide by 100

In [69]:
df['x32'] = df['x32'].str.replace('%', '')
df['x32'] = pd.to_numeric(df['x32']) / 100
df['x32']

0         0.0000
1        -0.0002
2        -0.0001
3         0.0001
4         0.0001
           ...  
159995    0.0000
159996   -0.0001
159997   -0.0000
159998   -0.0002
159999    0.0002
Name: x32, Length: 160000, dtype: float64

Handling column 'x37' by stripping the $ and converting it to a float64

In [70]:
df['x37'] = df['x37'].str.replace('$', '')
df['x37'] = pd.to_numeric(df['x37'])
df['x37']


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



0         1313.96
1         1962.78
2          430.47
3        -2366.29
4         -620.66
           ...   
159995    -891.96
159996    1588.65
159997     687.46
159998     439.21
159999   -1229.34
Name: x37, Length: 160000, dtype: float64

Imputing all of the missing data with either mean imputation for numerical or most frequent for category imputation.

In [71]:
num_cols = [f"x{i}" for i in range(50) if i not in [24, 29, 30]]
cat_cols = ["x24", "x29", "x30"]
#copy the target before doing the transform since it gets dropped
y = df['y'].values

num_imputer = SimpleImputer(strategy='mean')  
cat_imputer = SimpleImputer(strategy='most_frequent')

transformer = ColumnTransformer(
    transformers=[
        ('num_imputer', num_imputer, num_cols),
        ('cat_imputer', cat_imputer, cat_cols)
    ])

df_imputed = pd.DataFrame(transformer.fit_transform(df), columns=num_cols+cat_cols)
df_imputed.index = df.index

In [72]:
count_na = df_imputed.isna().sum()
print(count_na)

x0     0
x1     0
x2     0
x3     0
x4     0
x5     0
x6     0
x7     0
x8     0
x9     0
x10    0
x11    0
x12    0
x13    0
x14    0
x15    0
x16    0
x17    0
x18    0
x19    0
x20    0
x21    0
x22    0
x23    0
x25    0
x26    0
x27    0
x28    0
x31    0
x32    0
x33    0
x34    0
x35    0
x36    0
x37    0
x38    0
x39    0
x40    0
x41    0
x42    0
x43    0
x44    0
x45    0
x46    0
x47    0
x48    0
x49    0
x24    0
x29    0
x30    0
dtype: int64


One hot encoding the 'x24' that appears to be a continent, the x29 that is a month, and the x32 that is a weekday.

In [73]:
df_imputed.describe()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x43,x44,x45,x46,x47,x48,x49,x24,x29,x30
count,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,...,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000,160000,160000
unique,159975.0,159976.0,159963.0,159964.0,159975.0,159964.0,159975.0,159974.0,159980.0,159971.0,...,159964.0,159961.0,159972.0,159970.0,159964.0,159969.0,159969.0,3,12,5
top,-0.001028,0.001358,-1.150145,-0.024637,-0.000549,0.013582,-1.67067,-7.692795,-0.03054,0.005462,...,-0.002091,-0.00625,0.000885,-12.755395,0.028622,-0.000224,-0.674224,asia,July,wednesday
freq,26.0,25.0,38.0,37.0,26.0,37.0,26.0,27.0,21.0,30.0,...,37.0,40.0,29.0,31.0,37.0,32.0,32.0,138993,45599,101565


In [74]:
df = pd.get_dummies(df_imputed, columns=['x24', 'x29', 'x30'])

Prep the data for a cross val predict like prediction loop

In [75]:
df.dtypes

x0               object
x1               object
x2               object
x3               object
x4               object
                  ...  
x30_friday        uint8
x30_monday        uint8
x30_thurday       uint8
x30_tuesday       uint8
x30_wednesday     uint8
Length: 67, dtype: object

In [76]:
#split the label values into y and the features into X

X = df.values

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [77]:
# Setup some data in a train and val split to perform the search of the best model
#will circle back to using kfolds later

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1234)

In [83]:
def objective(params):

    model = xgb.XGBClassifier(**params)
    evaluation = [( X_train, y_train), ( X_val, y_val)]
    model.fit(X_train, y_train, eval_set=evaluation, verbose=False) #uses early stopping so set a very high boosting rounds since it won't be hit
    preds= model.predict(X_val)
    accuracy=accuracy_score(y_val, preds>.5)
    return {'loss': accuracy, 'status': STATUS_OK}

#https://bradleyboehmke.github.io/xgboost_databricks_tuning/index.html#slide21
#https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning
space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 100)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'alpha': hp.loguniform('alpha', -10, 10),
    'lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'binary:logistic',
    'seed': 123,
    'early_stopping_rounds':10,
    'eval_metric': 'error',
    'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
}

trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print(best)

100%|██████████| 100/100 [05:53<00:00,  3.54s/trial, best loss: -0.9465]           
{'alpha': 4.669139679879851e-05, 'colsample_bytree': 0.927307751349598, 'gamma': 0.07316019528375882, 'lambda': 0.016597559745902376, 'learning_rate': 0.08959122190853762, 'max_depth': 91.81289644618724, 'min_child_weight': 2.1438094221929025, 'subsample': 0.8214646741441629}


#### Preparing a  K Fold split
The best hyperparameters from the Hyperband tuning algorithm will be used to predict all 160,000 predictions, with models training on 143,999 datapoints using a 10 k fold split to where a model will train on that fold's training data, and then predictions made on the test set.  All test sets predictions will be concatenated into a flat array of predictions and scored for accuracy against the true values.

In [84]:
kf = KFold(n_splits=3)
indices = kf.split(X,y)
for train_index, test_index in indices:
    print(f"train: {train_index}")
    print(f"test: {test_index}")

train: [ 53334  53335  53336 ... 159997 159998 159999]
test: [    0     1     2 ... 53331 53332 53333]
train: [     0      1      2 ... 159997 159998 159999]
test: [ 53334  53335  53336 ... 106664 106665 106666]
train: [     0      1      2 ... 106664 106665 106666]
test: [106667 106668 106669 ... 159997 159998 159999]


In [87]:
best['max_depth'] = int(best['max_depth'])#make it an int
best['min_child_weight'] = int(best['min_child_weight']) #make sure it is an int too
best['tree_method'] = 'gpu_hist' #run it on the gpu
probabilities = []
predictions = []
i = 1
for train_index, test_index in kf.split(X):
    X_train_val, X_test = X[train_index], X[test_index]
    y_train_val, y_test = y[train_index], y[test_index]

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.15, random_state=1234)
        # Train final model
    model = xgb.XGBClassifier(**best)
    model.fit(X_train, y_train)
    y_prob = model.predict(X_test)
    y_pred = (y_prob > 0.5).astype("int32")
    probabilities.append(y_prob)
    predictions.append(y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    print(f"Iteration {i}")
    print(f"Accuracy {accuracy}")
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("Confusion Matrix: \n", confusion)
    i = i+1
    

Iteration 1
Accuracy 0.9401694978812765
Precision:  0.9388771795363097
Recall:  0.910992330931908
F1 Score:  0.9247245877662712
Confusion Matrix: 
 [[30543  1276]
 [ 1915 19600]]
Iteration 2
Accuracy 0.9435433971462321
Precision:  0.9416019674977094
Recall:  0.9155959861202289
F1 Score:  0.9284168984618311
Confusion Matrix: 
 [[30796  1211]
 [ 1800 19526]]
Iteration 3
Accuracy 0.9413121332008325
Precision:  0.9372421072833701
Recall:  0.9146843978273085
F1 Score:  0.9258258685245746
Confusion Matrix: 
 [[30669  1308]
 [ 1822 19534]]


##### Final Scoring of the Neural Network Approach

In [88]:
full_predictions = np.concatenate(predictions)
accuracy = accuracy_score(y, full_predictions)
precision = precision_score(y, full_predictions)
recall = recall_score(y, full_predictions)
f1 = f1_score(y, full_predictions)
confusion = confusion_matrix(y, full_predictions)
print(f"Accuracy {accuracy}")
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print("Confusion Matrix: \n", confusion)
cost = (confusion[0,1] * 150) + (confusion[1,0]*100) 
print(f"Final Cost: $ {'{:,.2f}'.format(cost)}")

Accuracy 0.941675
Precision:  0.9392362501000721
Recall:  0.913749863700796
F1 Score:  0.9263177841644822
Confusion Matrix: 
 [[92008  3795]
 [ 5537 58660]]
Final Cost: $ 1,122,950.00


In [90]:
import plotly.express as px
import numpy as np

def plot_confusion_matrix(values, title="Confusion Matrix"):
    TP, FP, FN, TN = values

    matrix = np.array([
        [TP, FN],
        [FP, TN]
    ])
    x_labels = ["Predicted Positive", "Predicted Negative"]
    y_labels = ["Actual Positive", "Actual Negative"]

    fig = px.imshow(matrix, labels=dict(x="Predicted Values", y="Actual Values", color="Count"),
                    x=x_labels, y=y_labels, color_continuous_scale="blues")

    fig.update_layout(title_text=title, title_x=0.5)
    for i, row in enumerate(matrix):
        for j, value in enumerate(row):
            fig.add_annotation(dict(
                x=j, y=i,
                text=str(value),
                showarrow=False,
                font_size=16,
                opacity=0.7,
                font_color='black'
            ))
    fig.show()

values = [92008, 5537, 3795, 58660]  # Example values (replace with your own)
plot_confusion_matrix(values, title="XGBoost Confusion Matrix")