In [None]:
# Imported Libraries

import numpy as np # linear algebra
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedShuffleSplit


In [None]:
# Load the raw CSV (path is relative to the notebook's working directory)
# and preview the first rows.
data1 = pd.read_csv('raw_data/creditcard.csv')
data1.head()

In [None]:
# Work on a copy so the frame loaded from disk (data1) stays untouched.
df = data1.copy()

In [None]:
# Column dtypes, non-null counts and memory footprint of the working frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for numeric columns.
df.describe()

In [None]:
# Range of the 'Time' column: smallest value first, then the largest.
time_column = df["Time"]
print(time_column.min())
print(time_column.max())

In [None]:
# List the column labels available in the working frame.
df.columns

In [None]:
# Derive an hour bucket: floor-divide 'Time' by 3600, then wrap modulo 24.
# NOTE(review): this presumes 'Time' is an elapsed-seconds counter; if it is
# measured from the first record rather than from midnight, 'Hour' is an
# offset from that start and not necessarily the wall-clock hour — confirm.
df['Hour'] = (df['Time'] // 3600) % 24

In [None]:
# Preview the frame again now that the 'Hour' column has been added.
df.head()

In [None]:
# Number of rows falling into each 'Hour' bucket, most frequent first.
df["Hour"].value_counts()

In [None]:
# Largest per-column count of missing values; 0 means no column has any nulls.
df.isnull().sum().max()

In [None]:
# Row count per value of the 'Class' label.
df["Class"].value_counts()

In [None]:
# Share of each label value in the full frame, as a percentage rounded to 2 dp.
class_counts = df['Class'].value_counts()
total_rows = len(df)
print('Not Fraud', round(class_counts[0]/total_rows * 100,2))
print('Fraud', round(class_counts[1]/total_rows * 100,2))

In [None]:
# Bar chart of how many rows carry each 'Class' value.
# The column is passed as the keyword argument `x`: recent seaborn releases
# only accept data variables by keyword, so a positional 'Class' raises a
# TypeError there.
sns.countplot(x='Class', data=df, palette=["red","blue"])
plt.title('Class Distributions \n (0: Not Fraud || 1: Fraud)', fontsize=12, color="Blue")

In [None]:
# Side-by-side distributions of the 'Amount' and 'Time' columns.
fig, ax = plt.subplots(1, 2, figsize=(13,4))

amount_val = df['Amount'].values
time_val = df['Time'].values

# sns.histplot(kde=True, stat="density") replaces the deprecated sns.distplot
# and matches the histplot call used for Log_Amount later in this notebook.
# bins=50 keeps the bin count bounded on the heavily skewed 'Amount' column.
sns.histplot(amount_val, bins=50, kde=True, stat="density", color='r', ax=ax[0])
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
# Use the ndarray reductions: the builtin min()/max() iterate the array
# element by element in Python.
ax[0].set_xlim([amount_val.min(), amount_val.max()])

sns.histplot(time_val, bins=50, kde=True, stat="density", color='b', ax=ax[1])
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([time_val.min(), time_val.max()])


plt.show()

In [None]:
# Distributions of three of the V* feature columns, one panel per column.
fig, ax = plt.subplots(1, 3, figsize=(12,4))
V1_values = df["V1"].values
V15_values = df["V15"].values
V26_values = df["V26"].values

# (column label, values, colour, multiplier applied to the upper x-limit).
# The V1 panel keeps its original 3x head-room on the right-hand side.
panels = [
    ("V1", V1_values, "red", 3),
    ("V15", V15_values, "blue", 1),
    ("V26", V26_values, "orange", 1),
]

for panel_ax, (label, values, colour, upper_scale) in zip(ax, panels):
    # sns.histplot(kde=True, stat="density") replaces the deprecated
    # sns.distplot and matches the histplot call used later in the notebook.
    sns.histplot(values, bins=50, kde=True, stat="density", color=colour, ax=panel_ax)
    panel_ax.set_title(f'Distribution of {label}', fontsize=14)
    # ndarray reductions instead of the element-by-element builtin min()/max().
    panel_ax.set_xlim([values.min(), values.max() * upper_scale])

plt.show()

In [None]:
# Label is the 'Class' column; features are every other column of df.
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 20% of the rows for testing, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample with SMOTE on the training portion only; the test split is left
# as drawn.
# NOTE(review): at this point 'Time' is still unscaled and the derived 'Hour'
# column is part of X, so both take part in SMOTE's interpolation — confirm
# that is intended.
smote = SMOTE(random_state=42, sampling_strategy=0.2)  # Adjust ratio if needed
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Label counts after resampling, computed once and reused for print and plot.
smote_class_counts = pd.Series(y_train_smote).value_counts()
print("\nClass distribution after SMOTE:\n", smote_class_counts)

# Bar chart of the resampled label counts.
plt.figure(figsize=(6, 4))
sns.barplot(x=smote_class_counts.index, y=smote_class_counts.values, palette="coolwarm")
plt.xticks(ticks=[0, 1], labels=["Non-Fraud (0)", "Fraud (1)"])
plt.ylabel("Count")
plt.title("Class Distribution After SMOTE")
plt.show()

In [None]:
# Fraction of the resampled training labels that carry label 0.
y_train_smote.value_counts()[0]/len(y_train_smote)

In [None]:
# Fraction of the resampled training labels that carry label 1.
y_train_smote.value_counts()[1]/len(y_train_smote)

In [None]:
# Percentage share of each label in the SMOTE-resampled training labels.
# The denominator must be the resampled length: dividing by len(y_train)
# (the pre-SMOTE size) yields shares that add up to more than 100%.
smote_counts = pd.Series(y_train_smote).value_counts()
n_resampled = len(y_train_smote)
print('Not Fraud', round(smote_counts[0] / n_resampled * 100, 2))
print('Fraud', round(smote_counts[1] / n_resampled * 100, 2))

In [None]:

# Initialize RobustScaler
scaler = RobustScaler()

# Scale 'Time' only. 'Amount' is intentionally left unscaled here because a
# later step in this notebook applies np.log1p to it: RobustScaler subtracts
# the median, so a scaled 'Amount' contains negative values, and log1p of a
# median-centred column is no longer a log of the amounts (and evaluates to
# NaN / -inf for any value <= -1).
# The scaler is fitted on the training frame and merely applied to the test
# frame, so no test statistics leak into the fit.
X_train_smote['Time'] = scaler.fit_transform(X_train_smote[['Time']]).ravel()
X_test['Time'] = scaler.transform(X_test[['Time']]).ravel()

# Check transformed statistics
print("\nScaled 'Time' statistics:\n", X_train_smote[['Time']].describe())

In [None]:
# Replace 'Amount' with its log1p transform ('Log_Amount') in both the
# training and the test frame, to reduce skewness.
# NOTE(review): np.log1p is only finite for inputs > -1 — confirm 'Amount'
# cannot fall at or below -1 by the time this cell runs.
for frame in (X_train_smote, X_test):
    frame['Log_Amount'] = np.log1p(frame['Amount'])
    # The original column is dropped in place once the log version exists.
    frame.drop(columns=['Amount'], inplace=True)

# Preview the training frame with the new column.
print(X_train_smote.head())


In [None]:
# Histogram (50 bins, KDE overlay) of the training frame's Log_Amount column.
log_fig, log_ax = plt.subplots(figsize=(6, 4))
sns.histplot(X_train_smote['Log_Amount'], bins=50, kde=True, ax=log_ax)
log_ax.set_title("Distribution of Log_Amount (After Transformation)")
plt.show()

## Sine and cosine encoding to capture the cyclical nature of the Hour feature

In [None]:

# Map 'Hour' onto a 24-step circle and store its sine and cosine, so that
# hours on either side of the wrap-around point end up close to each other.
for frame in (X_train_smote, X_test):
    frame["Hour_sin"] = np.sin(2 * np.pi * frame["Hour"] / 24)
    frame["Hour_cos"] = np.cos(2 * np.pi * frame["Hour"] / 24)


In [None]:
# Bar chart of row counts per 'Hour' value, taken from the full frame df.
hour_fig, hour_ax = plt.subplots(figsize=(10, 4))
sns.countplot(x=df['Hour'], ax=hour_ax)
hour_ax.set_title("Transaction Count by Hour")
plt.show()

### Drop the original Hour column since it's no longer needed:

In [None]:
# Remove the 'Hour' column, in place, from both the training and test frames.
for frame in (X_train_smote, X_test):
    frame.drop(columns=["Hour"], inplace=True)


In [None]:
# Attach the label to the feature frame so it takes part in the correlation
# matrix. The column stays on X_train_smote until it is dropped again in a
# later cell.
X_train_smote['Class'] = y_train_smote

# Pairwise correlations between all columns (features plus label).
correlation_matrix = X_train_smote.corr()

# Draw the matrix as a heatmap.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", linewidths=0.5, ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:

# Correlation of every column with the 'Class' label, sorted from most
# positive to most negative. `corr` keeps this signed ordering and still
# contains the label's correlation with itself.
corr = X_train_smote.corr()['Class'].sort_values(ascending=False)

# Number of features to display; the printed heading is built from it so the
# label and the row count cannot drift apart.
TOP_N = 20

# Rank by absolute value so strongly negative correlations are not pushed to
# the bottom of the list, and leave out 'Class' itself (always 1.0).
feature_corr = corr.drop('Class')
strongest_first = feature_corr.abs().sort_values(ascending=False).index
top_corr = feature_corr.loc[strongest_first[:TOP_N]]

# Show the TOP_N features most strongly correlated with the label (signed values).
print(f"\nTop {TOP_N} Most Correlated Features with Fraud:\n", top_corr)


In [None]:
# Remove the label column that was attached for the correlation analysis, so
# the training features no longer contain the target.
X_train_smote.drop(columns=['Class'], inplace=True)

In [None]:
# Preview the first five rows of the training features.
X_train_smote.head(5)

In [None]:
# Preview the first rows of the resampled training labels.
y_train_smote.head()

In [None]:
# Preview the first rows of the test features.
X_test.head()