In [None]:
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance


from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

from lightgbm import LGBMClassifier

import time


## Data Preprocessing

In [None]:
# Read the raw transactions, then discard the 'Unnamed: 0' column.
df = pd.read_csv("transaction_dataset.csv")
df = df.drop('Unnamed: 0', axis=1)
df.head(100)

In [None]:
# All column labels as a plain Python list.
df.columns.tolist()

In [None]:
# Number of columns in the loaded frame.
print(df.shape[1])

In [None]:
# Summary of the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Percentage of missing values per column, largest first.
df.isna().sum().div(len(df)).mul(100).sort_values(ascending=False)

In [None]:
# Index check: plot the distribution of how many rows share each 'Index' value.
# Any count above 1 means 'Index' does not uniquely identify a row.
sns.displot(df['Index'].value_counts())
plt.show()

We see that the `Index` column is not unique, so it cannot be used as a row identifier. It is also worth inspecting the rows that share an `Index` value, since they may be complete duplicates of one another. Let's look at them:

In [None]:
# Count the rows per 'Index' value and attach that count to every row.
index_counts = (
    df.groupby('Index')['Address']
    .count()
    .rename('Index_count')
    .reset_index()
)
df_index_check = df.merge(index_counts, on='Index')

# Show a sample of rows whose 'Index' value occurs more than once.
df_index_check.loc[df_index_check['Index_count'] > 1].head(10)

In [None]:
# Rows that remain once 'Index' is ignored and exact duplicates are dropped.
# NOTE(review): this displays the whole frame; `.head()` would keep the output small.
df.drop(columns=['Index']).drop_duplicates()

In [None]:
# Row count after ignoring 'Index' and dropping exact duplicates.
df.drop(columns=['Index']).drop_duplicates().shape[0]

In [None]:
# Row count of the frame as loaded, for comparison with the de-duplicated count.
df.shape[0]

In [None]:
# remove index

# Guard: with 'Index' still present, no row may be an exact duplicate of another.
assert not df.duplicated().any()

# Drop 'Index', then remove the rows that become identical without it.
df = df.drop('Index', axis=1).drop_duplicates()

In [None]:
# Display the frame after 'Index' and duplicate rows were removed.
df

In [None]:
# Missing-value count per column, restricted to rows with FLAG == 1.
df.loc[df['FLAG'] == 1].isna().sum()

In [None]:
# Number of columns that contain at least one missing value.
df.isna().any().sum()

In [None]:
# How many rows have 0, 1, 2, ... missing values.
df.isna().sum(axis='columns').value_counts()

In [None]:
# Collect the names of the object-dtype (non-numeric) columns and display them.
# The earlier `.astype('category')` call was removed: it converted this Index of
# column *names* into a CategoricalIndex and never changed the dtype of any
# data column, so it did not do what the old comment claimed.
non_num = df.select_dtypes('O').columns
df[non_num]

In [None]:
# Check categoricals: report the number of distinct (non-null) values per column.
for colu, n_unique in df[non_num].nunique().items():
    print(f'The column -- {colu} -- has -- {n_unique} -- unique values')

In [None]:
# Heatmap of the boolean missing-value mask: one row per record, one column per
# feature, so blocks of missing data show up as contiguous bands.
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull(), cbar=True)
plt.show()

In [None]:
# Select every numeric column and summarise it.
# 'number' matches all integer and float widths. The previous ['int', 'float']
# selector resolves 'int' to NumPy's default integer type, which is platform
# dependent and can silently skip int64 columns on some setups.
num_var = df.select_dtypes(include='number').columns
df[num_var].describe()

In [None]:
# Distinct values per column; NaN counts as a value of its own.
for col, n_unique in df.nunique(dropna=False).items():
    print(f"{col}: {n_unique}")

In [None]:
# Variance of each numeric column.
df[num_var].var()

In [None]:
# class distribution
# Count the rows per FLAG value, then print the number of distinct classes and
# the shape of an arange of that length.
class_val = df['FLAG'].value_counts()
print(len(class_val))
print(np.arange(len(class_val)).shape)

In [None]:
# pie chart for class distribution

# data preparation for the pie chart
# Sort by class value so the slices are always ordered [0, 1]. value_counts()
# alone orders by frequency, which would silently swap the positional labels,
# colors and explode offsets below if class 1 ever became the majority.
# Assumes FLAG 0 = non-fraud and FLAG 1 = fraud — TODO confirm against the data dictionary.
class_counts = df['FLAG'].value_counts().sort_index()
labels = ['Non-fraud', 'Fraud']
colors = ['#f9ae35', '#f64e38']
explode = (0.1, 0)  # Only "explode" the first slice (Non-fraud)

pie, ax = plt.subplots(figsize=[15,10])
# NOTE(review): textprops colors the slice labels white; only the percent
# labels are recolored below, so the slice labels may be invisible on a white
# background — confirm this is intended.
patches, texts, autotexts = plt.pie(x=class_counts, autopct='%.2f%%', explode=explode, labels=labels, pctdistance=0.85, textprops={'fontsize': 18, 'color': 'white'}, colors=colors)

# Stylize the autotexts (percent labels)
for autotext in autotexts:
    autotext.set_color('black')

# draw a white circle at the center of the pie to turn it into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# equal aspect ratio keeps the pie circular
ax.axis('equal')

# adjust the title size
plt.title('Class Distribution', pad=20, fontsize=26)

# Show the plot
plt.show()



## Data Cleaning

### Drop features with zero variance

In [None]:
# Names of the numeric columns whose variance is exactly zero.
feature_variances = df[num_var].var()
no_var_list = feature_variances.index[feature_variances == 0].tolist()
no_var_list

In [None]:
# Remove the zero-variance columns and show the resulting frame.
df = df.drop(columns=no_var_list)
df

In [None]:
# (rows, columns) after the zero-variance columns were dropped.
df.shape

In [None]:
# Build the feature lists in the frame's own column order.
# The previous list(set(...)) construction produced an arbitrary order that can
# change between kernel restarts (string hashing is randomised per process).
# That order feeds the correlation step below, which drops the *first* member of
# each highly correlated pair, so the surviving feature set was not reproducible.
excluded_columns = {'Address', 'FLAG', *no_var_list}
object_columns = set(df.dtypes[df.dtypes == 'object'].index)

# Every candidate feature: all columns except the identifier, the target and
# the zero-variance columns.
all_feature_list = [col for col in df.columns if col not in excluded_columns]
# Numeric features are the candidates that are not object-typed ...
num_feature_list = [col for col in all_feature_list if col not in object_columns]
# ... and categorical features are the object-typed remainder.
cat_feature_list = [col for col in all_feature_list if col in object_columns]

print(len(all_feature_list))
print(len(num_feature_list))
print(len(cat_feature_list))

In [None]:
# Names of the numeric feature columns.
print(num_feature_list)

In [None]:
# Number of numeric features; also confirms every listed name exists in df.
len(df[num_feature_list].columns)

In [None]:
# One boxplot per numeric feature, laid out four per row.
# The grid height is derived from the feature count; a fixed 10x4 grid raised
# IndexError as soon as there were more than 40 numeric features.
n_cols = 4
n_rows = max(1, -(-len(num_feature_list) // n_cols))  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
for i, feature in enumerate(num_feature_list):
    sns.boxplot(data=df, x=feature, ax=axes[i // n_cols][i % n_cols])

# Hide the unused cells in the last row.
for unused_ax in axes.ravel()[len(num_feature_list):]:
    unused_ax.set_visible(False)

In [None]:
# Correlation matrix of the numeric features plus the target.
correlation_matrix = df[[*num_feature_list, 'FLAG']].corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(correlation_matrix))

with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(60, 60))
    sns.heatmap(
        correlation_matrix,
        mask=mask,
        annot=True,
        fmt='.2f',
        annot_kws={"size": 16},
        cmap='seismic',
        center=0,
        square=True,
        cbar=True,
    )

In [None]:
# correlation with target

# Keep only the 'FLAG' column of the full correlation matrix.
df[[*num_feature_list, 'FLAG']].corr().loc[:, ['FLAG']]

In [None]:
# pairs of highly correlated features

# Long-form correlations: one entry per ordered (feature_a, feature_b) pair.
pairs = df[num_feature_list].corr().stack()
print(pairs)

# Exclude only the diagonal (a feature paired with itself). The previous
# `abs().lt(1.0)` filter also threw away *distinct* features whose correlation
# is exactly +/-1, which are the most redundant pairs of all.
is_self_pair = pairs.index.get_level_values(0) == pairs.index.get_level_values(1)
pairs_list = list(pairs[pairs.abs().gt(0.9) & ~is_self_pair].index)

# Each unordered pair appears twice, as (a, b) and (b, a), hence the halving.
print('pairs of highly correlated features: ', int(len(pairs_list) / 2))

In [None]:
# Collapse (a, b) / (b, a) into a single entry, keeping the first orientation seen.
uniq_pairs_list = []
seen_pairs = set()

for first, second in pairs_list:
    pair_key = frozenset((first, second))
    if pair_key not in seen_pairs:
        seen_pairs.add(pair_key)
        uniq_pairs_list.append((first, second))

print(uniq_pairs_list)
print(len(uniq_pairs_list))

In [None]:
# Display the de-duplicated list of highly correlated feature pairs.
uniq_pairs_list

In [None]:
# Features to drop: the first member of every highly correlated pair (de-duplicated).
filter_cor_feature_list = list(set(first for first, _ in uniq_pairs_list))
filter_cor_feature_list

In [None]:
# Number of numeric features left after removing the highly correlated ones.
len(list(set(num_feature_list)-set(filter_cor_feature_list)))

In [None]:
# Correlation matrix after deleting highly-correlated features
remaining_features = list(set(num_feature_list) - set(filter_cor_feature_list))
correlation_matrix = df[remaining_features].corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(correlation_matrix))

with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(60, 60))
    sns.heatmap(
        correlation_matrix,
        mask=mask,
        annot=True,
        fmt='.2f',
        annot_kws={"size": 16},
        cmap='seismic',
        center=0,
        square=True,
        cbar=True,
    )

In [None]:
# Remove the redundant member of each highly correlated pair.
df = df.drop(columns=filter_cor_feature_list)

In [None]:
# Impute missing numeric values with the column median.
# 'number' selects every integer/float width; the previous ['int', 'float']
# selector depends on the platform's default integer type.
# NOTE(review): the medians are computed on the full dataset, before the
# train/validation/test split further down, so test rows influence the imputed
# values — confirm this leakage is acceptable.
new_num_list = df.select_dtypes(include='number').columns
df[new_num_list] = df[new_num_list].fillna(df[new_num_list].median())

In [None]:
# Winsorise the numeric features to their 5th-95th percentile range.
lower_quantile = 0.05
upper_quantile = 0.95

# The target must never be clipped: `new_num_list` contains 'FLAG', and with a
# positive rate below 5% both quantiles would be 0, rewriting every label to
# one class.
clip_feature_list = [feature for feature in new_num_list if feature != 'FLAG']

# NOTE(review): the bounds are computed on the full dataset, before the
# train/validation/test split further down — confirm this is acceptable.
lower_bounds = df[clip_feature_list].quantile(lower_quantile)
upper_bounds = df[clip_feature_list].quantile(upper_quantile)

for feature in clip_feature_list:
    df[feature] = df[feature].clip(lower=lower_bounds[feature], upper=upper_bounds[feature])


In [None]:
# Remaining categorical columns: everything that is neither numeric nor the
# identifier/target. Built in column order so the output is reproducible.
numeric_columns = set(new_num_list)
new_cat_list = [
    col for col in df.columns
    if col not in ('Address', 'FLAG') and col not in numeric_columns
]
print(len(new_cat_list))

# Show the value distribution of every categorical column. The previous version
# indexed cat_feature_list[0] and [1], a different, set-ordered list built
# earlier, which breaks when there are not exactly two such columns.
for position, column in enumerate(new_cat_list):
    if position:
        print('---------------')
    print(df[column].value_counts())


In [None]:
# Modelling frame: df without the categorical columns; report its column count.
new_df = df.drop(columns=new_cat_list)
new_df.shape[1]

In [None]:
# 'Address' is excluded from the modelling frame.
new_df = new_df.drop(columns=['Address'])

In [None]:
# Display the modelling frame (target plus the remaining numeric features).
new_df

## Data Preparation

In [None]:
# Separate the target from the features by name. The previous positional split
# (iloc[:, 0] / iloc[:, 1:]) only worked while 'FLAG' happened to be the first
# column and would silently pick the wrong target after any column reordering.
y = new_df['FLAG']
X = new_df.drop(columns='FLAG')
print(X.shape, y.shape)

In [None]:
# 70% train, 30% held out, stratified on the target.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=0,
)

# Split the held-out 30% evenly into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=0.5,
    shuffle=True,
    stratify=y_test_val,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training split only ...
sc = StandardScaler().fit(X_train)
# ... and apply them to both the training and the validation features.
sc_train = sc.transform(X_train)
sc_val = sc.transform(X_val)

## LightGBM Model

In [None]:
def evaluate(y_true, y_pred):
    """Print and plot classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Prints a blank line, the accuracy, the classification report (3 digits)
    and the raw confusion matrix, then draws the confusion matrix with the
    'Blues' colormap. Returns nothing.
    """
    # Compute everything first, then report.
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=3)

    print()
    print(f'Accuracy: {accuracy}')
    print(report)
    print(cm)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues')

##### Baseline model

In [None]:
# baseline model: LGBMClassifier constructed with no arguments (library defaults)
model = LGBMClassifier()

In [None]:
# Time a single fit of the baseline model on the scaled training split.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which can be adjusted mid-measurement.
start_time = time.perf_counter()
model.fit(sc_train, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time
print(f"Training took {training_time} seconds")

In [None]:
# Report the current model's metrics on the scaled validation split.
evaluate(y_val, model.predict(sc_val))

In [None]:
# ROC-AUC needs scores rather than hard labels: take the probability column
# at index 1 of predict_proba.
y_val_prob = model.predict_proba(sc_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_prob)
print(f"The ROC-AUC Score on the validation set is: {roc_auc}")

### Use Sampling Techniques -- SMOTE, ADASYN & Undersampling

##### SMOTE

In [None]:
# Oversample the minority class of the training split with SMOTE.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Print the class distributions after applying the techniques
print(f'Original dataset shape {Counter(y_train)}')
print(f'Resampled dataset with SMOTE shape {Counter(y_train_smote)}')

# Standardize the resampled training data with the statistics of the ORIGINAL
# training split. `sc_val` was scaled with those statistics, so refitting the
# scaler on the resampled data (as before) put training and validation
# features on different scales and invalidated the learned split thresholds.
# A local scaler is used so the shared `sc` object is not mutated.
train_scaler = StandardScaler().fit(X_train)
sc_X_train_smote = train_scaler.transform(X_train_smote)

model.fit(sc_X_train_smote, y_train_smote)
evaluate(y_val, model.predict(sc_val))

In [None]:
# Validation ROC-AUC of the most recently fitted model, using the probability
# column at index 1 of predict_proba.
y_val_prob = model.predict_proba(sc_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_prob)
print(f"The ROC-AUC Score on the validation set is: {roc_auc}")

##### ADASYN

In [None]:
# ADASYN: oversample the minority class of the training split.
ada = ADASYN(random_state=42)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

# Scale with the statistics of the ORIGINAL training split so the resampled
# training data matches `sc_val`, which was scaled with those statistics.
# Refitting the shared scaler on resampled data (as before) left training and
# validation features on different scales.
train_scaler = StandardScaler().fit(X_train)
sc_X_train_ada = train_scaler.transform(X_train_ada)

model.fit(sc_X_train_ada, y_train_ada)
evaluate(y_val, model.predict(sc_val))

In [None]:
# Validation ROC-AUC of the most recently fitted model, using the probability
# column at index 1 of predict_proba.
y_val_prob = model.predict_proba(sc_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_prob)
print(f"The ROC-AUC Score on the validation set is: {roc_auc}")

##### Undersampling

In [None]:
# Undersampling: randomly undersample the training split.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Scale with the statistics of the ORIGINAL training split so the resampled
# training data matches `sc_val`, which was scaled with those statistics.
# Refitting the shared scaler on the undersampled data (as before) left
# training and validation features on different scales.
train_scaler = StandardScaler().fit(X_train)
sc_X_train_rus = train_scaler.transform(X_train_rus)

model.fit(sc_X_train_rus, y_train_rus)
evaluate(y_val, model.predict(sc_val))

In [None]:
# Validation ROC-AUC of the most recently fitted model, using the probability
# column at index 1 of predict_proba.
y_val_prob = model.predict_proba(sc_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_prob)
print(f"The ROC-AUC Score on the validation set is: {roc_auc}")

## Hyperparameter tuning

In [None]:
# Merge the training and validation splits into one set for the
# cross-validated grid search below; the test split stays held out.
# pd.concat keeps the column names, so the scaler is fitted and applied on
# DataFrames with identical feature names. np.concatenate stripped them, which
# made `sc.transform(X_test)` a fitted-without-names / called-with-names mismatch.
X_full = pd.concat([X_train, X_val])
y_full = np.concatenate((y_train, y_val))

# Refit the shared scaler on train+validation and apply it to the test split.
sc_full = sc.fit_transform(X_full)
sc_test = sc.transform(X_test)

In [None]:
# grid search
# Exhaustive search over 3 x 3 x 3 x 3 = 81 parameter combinations, each
# evaluated with 5-fold cross-validation and scored by F1.
# NOTE(review): learning_rate includes 1, far from the other candidates
# (0.1, 0.05) — confirm that 0.01 was not intended.
param_grid = {
    'n_estimators': [100, 1000, 1500],
    'learning_rate': [0.1, 0.05, 1],
    'num_leaves': [31, 101, 151],
    'max_depth': [-1, 35, 55]
}
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1)

grid_search.fit(sc_full, y_full)
best_model = grid_search.best_estimator_

In [None]:
# Parameter combination with the best cross-validated F1 score.
print(grid_search.best_params_)

In [None]:
# Fit a fresh classifier with the tuned parameters 100 times and average the
# wall time of a single fit. After the loop `new_model` holds the last fit.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which can be adjusted mid-measurement.
new_model = LGBMClassifier(**grid_search.best_params_)
train_times = []
for i in range(100):
    start_time = time.perf_counter()
    new_model.fit(sc_full, y_full)
    end_time = time.perf_counter()
    new_train_time = end_time - start_time
    train_times.append(new_train_time)
avg_train = np.average(train_times)

In [None]:
# Average fit time of the tuned model over the timing runs.
print(f"New training time: {avg_train} seconds")

In [None]:
# Permutation importance of each feature on the held-out test split.
result = permutation_importance(new_model, sc_test, y_test, n_repeats=10, random_state=0)

# Store the table under its own name: the previous version reassigned `df`,
# silently replacing the dataset with this two-column importance table.
importance_df = pd.DataFrame({'Index': X.columns,
                              'Value': result['importances_mean']})
importance_df = importance_df.sort_values('Value', ascending=True)

# `cmp=''` was removed: it is not a valid bar property and made barh raise.
plt.barh(y=importance_df['Index'], width=importance_df['Value'])
plt.xlabel('Mean permutation importance')
plt.title('Permutation importance on the test set')
plt.show()

In [None]:
# Measure inference speed on the test dataset: average wall time of 100
# predict() calls over the whole scaled test split.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which can be adjusted mid-measurement.
test_times = []
for i in range(100):
    start_time = time.perf_counter()
    new_model.predict(sc_test)
    end_time = time.perf_counter()
    inference_time_test = end_time - start_time
    test_times.append(inference_time_test)
avg_test_inf = np.average(test_times)
print(f"Inference time on test set: {avg_test_inf} seconds")

In [None]:
# Metrics of the tuned model on the held-out test split.
evaluate(y_test, new_model.predict(sc_test))

In [None]:
# ROC-AUC of the tuned model on the held-out test split, using the probability
# column at index 1 of predict_proba.
y_test_prob = new_model.predict_proba(sc_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_test_prob)
# The message previously said "validation set" although the test split is scored here.
print(f"The ROC-AUC Score on the test set is: {roc_auc}")

# Error Analysis

In [None]:
# Hard-coded hyperparameters for the error-analysis model.
# NOTE(review): presumably copied from an earlier grid_search.best_params_
# result — confirm they still match the current search output.
best_params =  {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 31}
model = LGBMClassifier(**best_params)

In [None]:
# Train on the combined, scaled train+validation data.
model.fit(sc_full, y_full)

In [None]:
# Ground truth and predictions for the test split ...
y_true_test = y_test.values
y_pred_test = model.predict(sc_test)

# ... and for the data the model was trained on.
y_true_train = y_full
y_pred_train = model.predict(sc_full)

In [None]:
# Unscaled test rows that the model misclassified.
errors = X_test[y_true_test != y_pred_test]  # filter rows with wrong predictions

In [None]:
# Feature statistics over the whole test split, as a reference for the errors.
X_test.describe()

In [None]:
# Feature statistics over the misclassified test rows only.
errors.describe()

# Comparing train/test performance

In [None]:
# Rebuild and refit the model with the same hard-coded hyperparameters so this
# section does not depend on the error-analysis cells having been run.
# NOTE(review): presumably copied from an earlier grid_search.best_params_
# result — confirm they still match the current search output.
best_params =  {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 31}
model = LGBMClassifier(**best_params)
model.fit(sc_full, y_full)

In [None]:
# Ground truth and predictions for the test split ...
y_true_test = y_test.values
y_pred_test = model.predict(sc_test)

# ... and for the data the model was trained on.
y_true_train = y_full
y_pred_train = model.predict(sc_full)

In [None]:
# Accuracy on the training data versus the test split; a large gap points to overfitting.
print(f"Train accuracy: {accuracy_score(y_true_train, y_pred_train):.5f}")
print(f"Test accuracy: {accuracy_score(y_true_test, y_pred_test):.5f}")

In [None]:
# Full metric report on the data the model was trained on.
evaluate(y_true_train, y_pred_train)

In [None]:
# Full metric report on the held-out test split.
evaluate(y_true_test, y_pred_test)