In [3]:
import pandas as pd
import os
import sys
import json
from pathlib import Path

In [4]:
# Project root: the parent of the current working directory. This assumes the
# notebook is executed from a folder that sits directly under the project root.
ROOT = Path.cwd().resolve().parent

# Locations of the raw input files inside the project
RAW_DATA_PATH = ROOT.joinpath('data', 'raw')
TRANSACTIONS_DATA_PATH = RAW_DATA_PATH.joinpath('transactions_data.csv')
LABELS_DATA_PATH = RAW_DATA_PATH.joinpath('train_fraud_labels.json')

## Loading and cleaning the data.

Loading the two main datasets: the raw transactions data and the labels data.

In [5]:
# Read the two raw inputs from the path constants:
# the transactions come from a CSV file, the fraud labels from a JSON file.
transactions_data = pd.read_csv(filepath_or_buffer=TRANSACTIONS_DATA_PATH)
labels = pd.read_json(path_or_buf=LABELS_DATA_PATH)

To calculate the share of fraudulent transactions, I divide the number of labels marked as fraud by the total number of labels provided.

In [6]:
# Share of labelled transactions that are marked as fraud ('Yes').
# The label is looked up by name rather than by position, so the result does
# not depend on the order in which value_counts() happens to sort the classes;
# if no 'Yes' label exists at all the share is reported as 0.
per_cent_fraud = labels['target'].value_counts(normalize=True).get('Yes', 0.0)
print(f'Percentage of fraud transactions: {per_cent_fraud * 100:.2f}%')

Percentage of fraud transactions: 0.15%


As the hackathon description warned, the class imbalance is significant.

I merge the two datasets to keep only the transactions for which labels were provided. This new dataset will be used to train the model.

In [7]:
# Expose the labels' index as an explicit 'id' column so it can serve as the join key.
labels = labels.assign(id=labels.index)

# The inner join keeps only the transactions that have a matching label.
df = transactions_data.merge(labels, how='inner', on='id')

In [8]:
# Quick sanity report on the merged frame: its size, the total number of
# missing cells across all columns, the number of fully duplicated rows,
# and the list of column names.
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns')
print(f'The dataset has {df.isnull().sum().sum()} missing values')
print(f'The dataset has {df.duplicated().sum()} duplicated rows')
print(f'The dataset has the following columns: {df.columns.tolist()}')
# Preview the first ten rows.
df.head(10)

The dataset has 8914963 rows and 13 columns
The dataset has 10928438 missing values
The dataset has 0 duplicated rows
The dataset has the following columns: ['id', 'date', 'client_id', 'card_id', 'amount', 'use_chip', 'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors', 'target']


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,target
0,7475327,2010-01-01 00:01:00,1556,2972,$-77.00,Swipe Transaction,59935,Beulah,ND,58523.0,5499,,No
1,7475328,2010-01-01 00:02:00,561,4575,$14.57,Swipe Transaction,67570,Bettendorf,IA,52722.0,5311,,No
2,7475329,2010-01-01 00:02:00,1129,102,$80.00,Swipe Transaction,27092,Vista,CA,92084.0,4829,,No
3,7475332,2010-01-01 00:06:00,848,3915,$46.41,Swipe Transaction,13051,Harwood,MD,20776.0,5813,,No
4,7475333,2010-01-01 00:07:00,1807,165,$4.81,Swipe Transaction,20519,Bronx,NY,10464.0,5942,,No
5,7475335,2010-01-01 00:14:00,1684,2140,$26.46,Online Transaction,39021,ONLINE,,,4784,,No
6,7475338,2010-01-01 00:23:00,554,3912,$3.51,Swipe Transaction,67570,Pearland,TX,77581.0,5311,,No
7,7475339,2010-01-01 00:23:00,605,5061,$2.58,Swipe Transaction,75781,Brooklyn,NY,11210.0,5411,,No
8,7475340,2010-01-01 00:26:00,1556,2972,$39.63,Swipe Transaction,59935,Beulah,ND,58523.0,5499,,No
9,7475341,2010-01-01 00:27:00,1797,1127,$43.33,Swipe Transaction,33326,Kahului,HI,96732.0,4121,,No


Cleaning the data

In [9]:
def preprocess_data(df):
    """Fill missing values and expand the timestamp into numeric components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'merchant_state', 'errors',
        'date' and 'zip'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * missing 'merchant_state' values are replaced by "Unknown",
        * missing 'errors' values are replaced by "No",
        * 'day', 'month', 'year', 'hour' and 'minute' columns are appended,
          derived from 'date',
        * the 'date' and 'zip' columns are removed.
        The frame passed in is left unchanged, so the function can be called
        again on the same input.
    """
    # Parse once and read every component straight from the datetime accessor.
    timestamps = pd.to_datetime(df['date'])

    cleaned = df.assign(
        merchant_state=df['merchant_state'].fillna("Unknown"),
        errors=df['errors'].fillna("No"),
        day=timestamps.dt.day,
        month=timestamps.dt.month,
        year=timestamps.dt.year,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
    )

    # 'date' is now fully represented by the derived columns. 'zip' is dropped
    # from the start because, per the author's note, it correlates strongly
    # with the merchant address information.
    return cleaned.drop(columns=['date', 'zip'])

In [10]:
# Run the cleaning step on the merged frame and rebind `df` to its result.
df = preprocess_data(df)

Dealing with types.

In [11]:
def assign_data_types(df):
    """Return a copy of ``df`` with model-ready column types.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'use_chip', 'merchant_city',
        'merchant_state', 'errors', 'target' and 'amount'. 'target' is
        expected to hold the strings 'Yes'/'No' and 'amount' strings such as
        '$14.57' or '$-77.00'.

    Returns
    -------
    pandas.DataFrame
        A new frame where the four text columns are ``category``, 'target' is
        an integer (Yes -> 1, No -> 0) and 'amount' is a float. The frame
        passed in is left unchanged.

    Notes
    -----
    Any 'target' value other than 'Yes'/'No' maps to NaN, and the integer cast
    then raises instead of silently producing a wrong label.
    """
    categorical_cols = ['use_chip', 'merchant_city', 'merchant_state', 'errors']

    # astype with a mapping returns a new frame, so the caller's data is untouched.
    typed = df.astype({col: 'category' for col in categorical_cols})
    typed['target'] = typed['target'].map({'Yes': 1, 'No': 0}).astype(int)

    # regex=False makes '$' a literal character on every pandas version; as a
    # regular expression '$' is the end-of-string anchor and removes nothing.
    typed['amount'] = typed['amount'].str.replace('$', '', regex=False).astype(float)
    return typed

In [12]:
# Convert the columns to their modelling dtypes and rebind `df` to the result.
df = assign_data_types(df)

In [13]:
# Inspect the resulting dtype of every column.
df.dtypes

id                   int64
client_id            int64
card_id              int64
amount             float64
use_chip          category
merchant_id          int64
merchant_city     category
merchant_state    category
mcc                  int64
errors            category
target               int64
day                  int32
month                int32
year                 int32
hour                 int32
minute               int32
dtype: object

Data is ready to be encoded and used to train a model.

## Preprocessing steps

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [15]:
# Separate the label vector from the predictor matrix.
# NOTE(review): the transaction 'id' column is still part of the predictors
# here — confirm whether it should be excluded before modelling.
y = df['target']
X = df.drop(columns='target')

In [16]:
# Confirm which columns (and dtypes) go into the model as predictors.
print(X.dtypes)

id                   int64
client_id            int64
card_id              int64
amount             float64
use_chip          category
merchant_id          int64
merchant_city     category
merchant_state    category
mcc                  int64
errors            category
day                  int32
month                int32
year                 int32
hour                 int32
minute               int32
dtype: object


In [17]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the fraud
# share the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [18]:
# Text/category columns that have to become numbers before resampling and modelling
categorical_cols = ['use_chip', 'merchant_city', 'merchant_state', 'errors']

# The integer codes are learned from the training split only; a category that
# occurs only in the test split is encoded as -1.
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
ordinal_encoder.fit(X_train[categorical_cols])
X_train[categorical_cols] = ordinal_encoder.transform(X_train[categorical_cols])
X_test[categorical_cols] = ordinal_encoder.transform(X_test[categorical_cols])

After encoding the values, I apply SMOTE to the fraudulent-class transactions to deal with the imbalance. SMOTE (Synthetic Minority Over-sampling Technique) is an oversampling technique that generates synthetic samples of the minority class.

In [19]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the training split only, so the test split
# keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class balance before and after SMOTE
for heading, class_labels in (
    ("Classes before SMOTE:", y_train),
    ("\nClasses after SMOTE:", pd.Series(y_train_resampled)),
):
    print(heading)
    print(class_labels.value_counts())


Classes before SMOTE:
target
0    7121304
1      10666
Name: count, dtype: int64

Classes after SMOTE:
target
0    7121304
1    7121304
Name: count, dtype: int64


## Training the models

First I try RandomForestClassifier.

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, balanced_accuracy_score

In [21]:
%time
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train_resampled, y_train_resampled)
y_pred = rf.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
print(f'Precision: {precision_score(y_test, y_pred):.2f}')
print(f'Recall: {recall_score(y_test, y_pred):.2f}')
print(f'F1: {f1_score(y_test, y_pred):.2f}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_test, y_pred):.2f}')

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.91 µs
Accuracy: 1.00
Precision: 0.88
Recall: 0.80
F1: 0.84
Balanced accuracy: 0.90


The balanced accuracy achieved by RandomForest is already a good result at 90%. To better understand the results, I use a confusion matrix.

In [22]:
# Confusion matrix of the current predictions against the test labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Unpack the 2x2 matrix row by row: first row is (tn, fp), second row is (fn, tp)
(tn, fp), (fn, tp) = cm
print(f"True Negatives: {tn}, False Positives: {fp}")
print(f"False Negatives: {fn}, True Positives: {tp}")

Confusion Matrix:
 [[1780029     298]
 [    529    2137]]
True Negatives: 1780029, False Positives: 298
False Negatives: 529, True Positives: 2137


I tried one more instance with the parameters that gave me the best balanced accuracy during the hackathon (67%). This time I got a similar, slightly lower balanced accuracy of 88%.

In [23]:
# Second forest: shallow trees (depth 5), each grown on 80% of the samples
rf_1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    max_samples=0.8,
    random_state=42,
    n_jobs=-1,
)
rf_1.fit(X_train_resampled, y_train_resampled)
y_pred_1 = rf_1.predict(X_test)

# Same set of metrics as for the first forest, evaluated on the test split
print(f'Accuracy: {accuracy_score(y_test, y_pred_1):.2f}')
print(f'Precision: {precision_score(y_test, y_pred_1):.2f}')
print(f'Recall: {recall_score(y_test, y_pred_1):.2f}')
print(f'F1: {f1_score(y_test, y_pred_1):.2f}')
print(f'Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_1):.2f}')

Accuracy: 0.88
Precision: 0.01
Recall: 0.87
F1: 0.02
Balanced accuracy: 0.88


I decided to apply gradient boosting models to see if they give an even better result. I started with XGBClassifier.

In [24]:
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report

# Negative-to-positive ratio of the original (pre-SMOTE) training labels.
# It is computed from y_train rather than typed in as two literal counts, so
# it stays correct if the data or the split changes.
# NOTE(review): the model is fitted on the SMOTE-balanced set, so this weight
# is applied on top of the oversampling — confirm that this is intended.
train_class_counts = y_train.value_counts()
xgb_pos_weight = float(train_class_counts.loc[0] / train_class_counts.loc[1])

# XGBoost model configuration
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.1,
    scale_pos_weight=xgb_pos_weight,  # Additional parameter to handle the imbalance
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train_resampled, y_train_resampled)
y_pred = xgb_model.predict(X_test)

# Balanced Accuracy
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {balanced_acc:.2f}")

# Classification Report
print(classification_report(y_test, y_pred))


Balanced Accuracy: 0.96
              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1780327
           1       0.09      0.94      0.16      2666

    accuracy                           0.99   1782993
   macro avg       0.54      0.96      0.58   1782993
weighted avg       1.00      0.99      0.99   1782993



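A balanced accuracy of 0.96 is a really great result, although precision for the fraud class is only 0.09, so most flagged transactions are false alarms. I decide to stop there and try LightGBM just to be sure.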

In [25]:
%pip install lightgbm
from lightgbm import LGBMClassifier

# Настройка модели LightGBM
lgbm_model = LGBMClassifier(
    n_estimators=500,       
    max_depth=7,            
    learning_rate=0.1,      
    scale_pos_weight=7121304 / 10666,  
    random_state=42,
    n_jobs=-1              
)

lgbm_model.fit(X_train_resampled, y_train_resampled)
y_pred = lgbm_model.predict(X_test)

# Balanced Accuracy
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {balanced_acc:.2f}")

# Classofication Report
print(classification_report(y_test, y_pred))


Note: you may need to restart the kernel to use updated packages.
[LightGBM] [Info] Number of positive: 7121304, number of negative: 7121304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2675
[LightGBM] [Info] Number of data points in the train set: 14242608, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Balanced Accuracy: 0.96
              precision    recall  f1-score   support

           0       1.00      0.97      0.99   1780327
           1       0.05      0.95      0.09      2666

    accuracy                           0.97   1782993
   macro avg       0.52      0.96      0.54   1782993
weighted avg       1.00      0.97      0.98   1782993



I was curious about the feature importance and made a final visualization to see what information helped to achieve this result.

In [26]:
# Pair every training column with the importance reported by the fitted
# LightGBM model, then list the features from most to least important.
feature_names = X_train.columns
feature_importances = lgbm_model.feature_importances_
importance_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': feature_importances,
    }
)
print(importance_df.sort_values('Importance', ascending=False))

           Feature  Importance
8              mcc        2360
0               id        1918
7   merchant_state        1736
5      merchant_id        1444
6    merchant_city        1383
3           amount        1005
13            hour         911
1        client_id         894
2          card_id         849
10             day         607
4         use_chip         580
11           month         476
14          minute         456
9           errors         224
12            year         157


In [27]:
import plotly.express as px

# The table already holds exactly one importance value per feature, so a bar
# chart is the matching chart type; a histogram would aggregate the values
# again and label the axis as a sum.
sorted_importance_df = importance_df.sort_values(by='Importance', ascending=False)
px.bar(sorted_importance_df, x='Feature', y='Importance', title='Feature Importance')

Apart from the ids that I forgot to drop, it seems that the category of the transaction (mcc) and the merchant's data make most of the difference.