In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import recall_score, f1_score, precision_score, classification_report
from sklearn.model_selection import train_test_split 

In [2]:
import pandas as pd
df = pd.read_csv('data/credit_card_engineered.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8',
       'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',
       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
       'Amount', 'Class', 'log_amt', 'outlier_score', 'DayOfWeek',
       'IsWeekend'],
      dtype='object')

In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
# repalcing inf and -inf values with NaN
import numpy as np

df.replace([np.inf, -np.inf], np.nan, inplace = True)

In [6]:
# splitting the dataset into train and test 
X = df.drop(['Time', 'Amount', 'Class'], axis = 1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = df['Class'], random_state = 42)

In [7]:
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,log_amt,outlier_score,DayOfWeek,IsWeekend
249927,-0.012102,0.707332,0.163334,-0.756498,0.590999,-0.653429,0.844608,-0.001672,-0.184399,-0.222719,...,0.042660,-0.360882,-0.494703,0.142657,0.235922,0.083758,1.964311,0.198609,5,1
214082,1.776151,-0.184642,-2.204096,1.191668,0.614461,-1.016525,0.919254,-0.387465,-0.318277,0.517022,...,-0.232262,-0.342096,0.492355,-0.427682,-0.075228,-0.056755,5.010635,0.182929,5,1
106005,-1.083391,-4.440527,-1.399530,0.469764,-2.076458,-0.766137,1.601441,-0.709176,-1.288745,0.086419,...,-1.097063,0.505926,-0.140185,0.870190,-0.316982,0.227833,7.172033,0.080895,4,0
58619,-0.518847,1.025087,-0.614624,-0.780959,2.474666,3.335055,0.046111,0.794249,-0.322448,0.128383,...,0.145672,0.944217,-0.788017,0.039995,0.010804,0.254309,1.501853,0.157329,5,1
191638,-0.640421,0.212171,0.283341,-1.786916,2.616127,4.024863,-0.198897,0.937087,0.474428,-0.283699,...,-0.321412,0.767971,-0.084221,0.612936,0.074029,-0.033344,1.501853,0.166806,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3441,-0.415022,0.713439,1.221551,-2.108216,0.187067,-1.281616,1.118548,-0.345326,0.463473,-0.999876,...,-0.290888,0.558730,0.037363,-0.871689,0.239482,-0.020422,0.000000,0.149844,5,1
233802,1.993864,-0.516866,-0.620118,0.129845,-0.285128,0.395044,-0.822358,0.231591,0.995898,0.212619,...,0.099141,0.275689,-0.195404,0.623598,-0.032455,-0.058552,1.790091,0.191949,3,0
85418,-1.497933,0.657921,1.581568,-0.024286,0.584698,1.303031,0.609212,0.135561,0.452745,0.108640,...,0.110048,-0.615980,-0.425883,0.263968,-0.448445,0.045178,3.610648,0.148536,6,1
29062,1.069777,0.072105,0.496540,1.505318,-0.380277,-0.370243,0.100551,-0.026687,0.319684,-0.131553,...,-0.050485,0.400171,0.593314,-0.335160,0.031014,0.024886,3.815953,0.204414,2,0


In [8]:
# different approach to splitting 
# splitting into three, train_df for traning, valid_df for hyperparameter tuning and test_df for testing the model
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify = df['Class'], shuffle=True)
train_df, valid_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify = train_df['Class'],  shuffle=True )

In [9]:
test_df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'log_amt', 'outlier_score', 'DayOfWeek', 'IsWeekend'],
      dtype='object')

In [10]:
predictors = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
              'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
              'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'log_amt', 
              'outlier_score', 'DayOfWeek', 'IsWeekend']
target = 'Class'

In [12]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),

    "Decision Tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42),

    "XGBoost": xgb.XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lgb.LGBMClassifier(class_weight="balanced", random_state=42)
}

pipelines = {}
for name, model in models.items():
    if name in ["Logistic Regression", "SVM"]:  
        # Need scaling
        pipelines[name] = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("smote", SMOTE(sampling_strategy="auto", random_state=42)),
            ("clf", model)
        ])
    else:
        # Tree-based models don't need scaling
        pipelines[name] = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("smote", SMOTE(sampling_strategy="auto", random_state=42)),
            ("clf", model)
        ])

results = {}
for name, pipe in pipelines.items():
    print(f"\n{name} ----------------------")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }

    print(classification_report(y_test, y_pred, digits=4))

results_df = pd.DataFrame(results).T
print("\nSummary of Results:")
print(results_df)



Logistic Regression ----------------------
              precision    recall  f1-score   support

           0     0.9998    0.9759    0.9877     85295
           1     0.0595    0.8784    0.1114       148

    accuracy                         0.9757     85443
   macro avg     0.5296    0.9271    0.5495     85443
weighted avg     0.9982    0.9757    0.9862     85443


Decision Tree ----------------------
              precision    recall  f1-score   support

           0     0.9995    0.9979    0.9987     85295
           1     0.3741    0.7230    0.4931       148

    accuracy                         0.9974     85443
   macro avg     0.6868    0.8604    0.7459     85443
weighted avg     0.9984    0.9974    0.9978     85443


Random Forest ----------------------
              precision    recall  f1-score   support

           0     0.9996    0.9998    0.9997     85295
           1     0.8984    0.7770    0.8333       148

    accuracy                         0.9995     85443
   macro

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0     0.9997    0.9993    0.9995     85295
           1     0.6595    0.8243    0.7327       148

    accuracy                         0.9990     85443
   macro avg     0.8296    0.9118    0.8661     85443
weighted avg     0.9991    0.9990    0.9990     85443


LightGBM ----------------------
[LightGBM] [Info] Number of positive: 199020, number of negative: 199020
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 398040, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0     0.9996    0.9995    0.9995     85295
           1     0.7143  



In [51]:
xgboost_model = pipelines['XGBoost'].named_steps['clf']
xgboost_predict = xgboost_model.predict(test_df[predictors])
print(roc_auc_score(test_df[target], xgboost_predict))

0.9332602026460556


In [None]:
#joblib.dump(xgboost_model, 'artifacts/xgboost_model.joblib')

In [52]:
xgboost_pipeline = pipelines['XGBoost']
xgboost_pipeline_predict = xgboost_pipeline.predict(test_df[predictors])
roc_auc_score(test_df[target], xgboost_pipeline_predict)

0.9332602026460556

In [53]:
joblib.dump(xgboost_pipeline, 'artifacts/xgboost_pipeline.joblib')

['artifacts/xgboost_pipeline.joblib']

In [14]:
# saving the random forest pipeline and random forest model from the pipeline 
import joblib 
rf_pipeline = pipelines["Random Forest"]     # trained pipeline
rf_model = rf_pipeline.named_steps["clf"]    # trained model


joblib.dump(rf_pipeline, 'artifacts/rf_pipeline.joblib')
joblib.dump(rf_model, 'artifacts/rf_model.joblib')

['artifacts/rf_model.joblib']

In [16]:
import joblib
from sklearn.metrics import roc_auc_score
rf_model = joblib.load('artifacts/rf_pipeline.joblib')
rf_predict = rf_model.predict(test_df[predictors])
roc_auc_score(test_df[target], rf_predict)
print(f"ROC-AUC: {roc_auc_score(test_df[target], rf_predict)}\nPrecision Score: {precision_score(test_df[target], rf_predict)}\nF1 Score: {f1_score(test_df[target], rf_predict)}")

ROC-AUC: 0.9080753362121439
Precision Score: 0.8888888888888888
F1 Score: 0.851063829787234


In [47]:
rf_model_2 = joblib.load('artifacts/rf_model.joblib')
rf_predict_2 = rf_model_2.predict(test_df[predictors])
print(f"ROC-AUC: {roc_auc_score(test_df[target], rf_predict_2)}\nPrecision Score: {precision_score(test_df[target], rf_predict_2)}\nF1 Score: {f1_score(test_df[target], rf_predict_2)}")

ROC-AUC: 0.9080753362121439
Precision Score: 0.8888888888888888
F1 Score: 0.851063829787234


In [19]:
import sklearn
import imblearn
print("imblearn version:", imblearn.__version__)

imblearn version: 0.14.0


## training xgboost in a different way

In [20]:
# Prepare the train and valid datasets
dtrain = xgb.DMatrix(train_df[predictors], train_df[target].values)
dvalid = xgb.DMatrix(valid_df[predictors], valid_df[target].values)
dtest = xgb.DMatrix(test_df[predictors], test_df[target].values)


In [21]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Set xgboost parameters
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.039
params['silent'] = True
params['max_depth'] = 2
params['subsample'] = 0.8
params['colsample_bytree'] = 0.9
params['eval_metric'] = 'auc'
params['random_state'] = 42

In [38]:
import warnings
warnings.filterwarnings('ignore')
model = xgb.train(params, 
                dtrain, 
                1000, 
                watchlist, 
                early_stopping_rounds=50, 
                maximize=True, 
                verbose_eval=50)

[0]	train-auc:0.92153	valid-auc:0.90767
[50]	train-auc:0.98592	valid-auc:0.96813
[100]	train-auc:0.99185	valid-auc:0.97748
[150]	train-auc:0.99366	valid-auc:0.97802
[200]	train-auc:0.99516	valid-auc:0.97862
[211]	train-auc:0.99524	valid-auc:0.97823


In [36]:
pred = model.predict(dtest)

NotFittedError: need to call fit or load_model beforehand

In [26]:
from sklearn.metrics import roc_auc_score
roc_auc_score(test_df[target].values, pred)

0.9734188554431339

In [27]:
# saving the xgb model
joblib.dump(model, 'XGB_model.joblib')

['XGB_model.joblib']

## training random forest model in a different way

In [28]:
# defining a new random forest classifier 
clf = RandomForestClassifier(n_jobs=40, 
                             random_state=42,
                             criterion='gini',
                             n_estimators=100,
                             class_weight = 'balanced',
                             verbose=False)

In [29]:
clf.fit(train_df[predictors], train_df[target].values)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
preds = clf.predict(valid_df[predictors])

In [31]:
roc_auc_score(valid_df[target].values, preds)

0.8480463086893489

## checking if the models i have saved have the feature_names_in attribute 
- random forest and xgboost model are the one trained in the pipeline

In [54]:
xgboost_model = joblib.load('artifacts/xgboost_pipeline.joblib')
rf_model = joblib.load('artifacts/rf_pipeline.joblib')

In [55]:
# Verify feature_names_in_ is available
print("RandomForest model:")
print(f"  Has feature_names_in_: {hasattr(rf_model, 'feature_names_in_')}")
if hasattr(rf_model, 'feature_names_in_'):
    print(f"  Features: {list(rf_model.feature_names_in_)}")

print("\nXGBoost model:")  
print(f"  Has feature_names_in_: {hasattr(xgboost_model, 'feature_names_in_')}")
if hasattr(xgboost_model, 'feature_names_in_'):
    print(f"  Features: {list(xgboost_model.feature_names_in_)}")


RandomForest model:
  Has feature_names_in_: True
  Features: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'log_amt', 'outlier_score', 'DayOfWeek', 'IsWeekend']

XGBoost model:
  Has feature_names_in_: True
  Features: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'log_amt', 'outlier_score', 'DayOfWeek', 'IsWeekend']
