## Data

In [None]:
import sys
sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_train_data
from src.feature_engineering import engineer_features
from src.visualization import (
    plot_category_fraud_distribution,
    plot_fraud_distribution,
    plot_transaction_time_distribution,
    plot_job_sector_distribution,
    plot_job_sector_transactions,
    plot_merchant_fraud_percentage,
    plot_distance_distribution,
    plot_transaction_frequency_fraud,
    plot_correlation_heatmap,
)

# Read the training CSV from a path relative to the working directory, then
# print the first five rows as one plain-text table via to_string().
data = load_train_data("../Dataset/fraudTrain.csv")
print(data.head(5).to_string())

## Categories

In [None]:
# Draw the per-category fraud figure from the loaded transactions.
# The helper comes from src.visualization; which columns it reads and the chart
# type it produces are not visible in this notebook.
plot_category_fraud_distribution(data)

## Fraud Distribution

In [None]:
# Figure first, then the raw per-class row counts of the `is_fraud` column.
plot_fraud_distribution(data)
fraud_counts = data.loc[:, "is_fraud"].value_counts()
print(fraud_counts)

## Time and Age

In [None]:
from src.feature_engineering import compute_age_at_transaction, compute_transaction_frequency

# Rebind `data` to the helper's result, then spot-check the
# `age_at_transaction` column beside the timestamp and date-of-birth columns.
data = compute_age_at_transaction(data)
print(
    data.loc[:, ["trans_date_trans_time", "dob", "age_at_transaction"]]
    .head(5)
    .to_string()
)

In [None]:
# Render the transaction-time figure via the src.visualization helper.
# NOTE(review): which columns the helper reads is not shown here — confirm it
# only needs columns already present in `data` at this point.
plot_transaction_time_distribution(data)

## Job Sectors and Amount

In [None]:
from src.feature_engineering import assign_sector

# Pass every value of the `job` column through assign_sector (its mapping rules
# live in src.feature_engineering) and store the results in `job_sector`.
# This writes into `data` in place rather than creating a new frame.
data["job_sector"] = data["job"].apply(assign_sector)
# Plot using the frame that now carries `job_sector`.
plot_job_sector_distribution(data)

In [None]:
# Second job-sector figure from the src.visualization helper.
# NOTE(review): presumably relies on the `job_sector` column assigned in the
# previous cell — confirm against the helper's implementation.
plot_job_sector_transactions(data)

## Merchant Fraud

In [None]:
# Draw the merchant-level fraud figure via the src.visualization helper.
# How the percentage is computed is defined inside the helper, not here.
plot_merchant_fraud_percentage(data)

## Distance

In [None]:
from src.feature_engineering import compute_distance

# Rebind `data` to the helper's result and print summary statistics for the
# `distance` column it is expected to carry afterwards.
data = compute_distance(data)
print(data.loc[:, "distance"].describe())

In [None]:
# Render the distance figure via the src.visualization helper.
# NOTE(review): presumably reads the `distance` column summarised in the
# previous cell — confirm against the helper.
plot_distance_distribution(data)

## Transaction Frequency (Last Hour and Last Day)

In [None]:
from src.feature_engineering import compute_transaction_frequency
from src.preprocessing import encode_categorical_features

# Two sequential transforms, each rebinding `data`: the transaction-frequency
# helper first, then the categorical-encoding helper on its result.
data = data.pipe(compute_transaction_frequency)
data = data.pipe(encode_categorical_features)
# Plain-text preview of the first five rows after both steps.
print(data.head(5).to_string())

In [None]:
# Frequency-vs-fraud figure for the `transactions_last_hour` column; the same
# helper is called again below with the per-day column.
plot_transaction_frequency_fraud(data, column="transactions_last_hour")

In [None]:
# Frequency-vs-fraud figure for the `transactions_last_day` column, using the
# same helper as the per-hour figure.
plot_transaction_frequency_fraud(data, column="transactions_last_day")

## Distribution of Fraud after SMOTE

In [None]:
from src.preprocessing import apply_smote
from sklearn.model_selection import train_test_split

# Columns fed to the train/test split and to the resampling step.
# NOTE(review): merchant/category/job have presumably been integer-encoded by
# encode_categorical_features earlier in the notebook. Plain SMOTE interpolates
# between such codes — confirm apply_smote accounts for that (e.g. SMOTENC).
FEATURE_COLS = [
    "merchant", "category", "amt", "job",
    "age_at_transaction", "distance",
    "transactions_last_hour", "transactions_last_day",
]

# Split parameters, named so they can be tuned in one place.
TEST_SIZE = 0.2
RANDOM_STATE = 42

X = data[FEATURE_COLS]
Y = data["is_fraud"]

# stratify=Y keeps the class ratio of `is_fraud` the same in the train and test
# partitions, which matters when one class is rare.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=Y
)

# Only the training partition is handed to apply_smote; the test partition is
# left as drawn by the split.
X_train_smote, Y_train_smote = apply_smote(X_train, Y_train)

fraud_counts = Y_train_smote.value_counts()
colors = sns.color_palette("husl", len(fraud_counts))

fig, ax = plt.subplots()
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the class
# label is assigned to `hue` as well. dodge=False keeps one full-width bar per
# class on every seaborn version.
sns.barplot(
    x=fraud_counts.index,
    y=fraud_counts.values,
    hue=fraud_counts.index,
    palette=colors,
    dodge=False,
    ax=ax,
)
# Older seaborn versions add a legend whenever `hue` is set; it only repeats
# the x-axis labels here, so drop it if present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Distribution of Fraud after SMOTE")
ax.set_xlabel("Fraud")
ax.set_ylabel("Count")
plt.show()
print(fraud_counts)

## Correlation Heatmap

In [None]:
import pandas as pd

# Build a frame with the feature columns from the resampled inputs and attach
# the resampled target as `is_fraud`. .assign works on a copy, so the frame
# built from X_train_smote is not modified in place.
new_data = pd.DataFrame(X_train_smote, columns=FEATURE_COLS).assign(
    is_fraud=Y_train_smote.values
)
plot_correlation_heatmap(new_data)