In [3]:
# Load the raw transactions and discard exact duplicate rows, so that an
# identical record cannot land on both sides of the later train/test split.
# The original row labels are kept (the index is not reset), and the number of
# rows removed is reported below.
import pandas as pd

df = pd.read_csv("../data/raw/creditcard.csv")
initial_shape = df.shape
df = df.drop_duplicates()
final_shape = df.shape

for summary_line in (
    f"Initial shape: {initial_shape}",
    f"Final shape: {final_shape}",
    f"Number of duplicate rows removed: {initial_shape[0] - final_shape[0]}",
):
    print(summary_line)
# Replace the raw 'Time' column with an hour_of_day feature (0-23).
#
# The hour must be derived from the 'Time' values themselves *before* the
# column is dropped. Deriving it from df.index (the row label) would encode the
# row's position in the file rather than when the transaction happened.
#
# NOTE(review): assumes 'Time' is the number of seconds elapsed since the first
# transaction in the file (as the dataset is documented) and has no missing
# values. Hour 0 is therefore relative to the first transaction, not
# necessarily midnight - confirm against the data source.
SECONDS_PER_HOUR = 3600
HOURS_PER_DAY = 24

df['hour_of_day'] = ((df['Time'] // SECONDS_PER_HOUR) % HOURS_PER_DAY).astype('int64')
# The raw elapsed-seconds counter is no longer needed once the hour is extracted.
df = df.drop(columns=['Time'])
print("Time column dropped and hour_of_day feature added.")
# Rescale 'Amount' with a log transformation: log1p computes log(1 + x), so an
# amount of 0 maps to 0 instead of -inf.
import numpy as np

log_amount = np.log1p(df['Amount'])
df['Amount'] = log_amount
print("Amount column scaled using log transformation.")
# Train/test split: separate the 'Class' target from the feature columns and
# hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split

y = df['Class']
X = df.drop('Class', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,       # keep the class proportions equal in both partitions
    random_state=42,  # fixed seed so the split is reproducible
)
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")
# SMOTE: oversample the training partition only. The test partition is left
# untouched so that evaluation is done on real, un-synthesized rows.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)  # fixed seed for reproducible synthetic samples
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled
print(f"After SMOTE, Train set shape: {X_train_smote.shape}, Test set shape: {X_test.shape}")
# Save the preprocessed splits to CSV files for future use.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first; otherwise a fresh checkout fails at this step.
from pathlib import Path

processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False: the row labels are not written to the files.
X_train_smote.to_csv(processed_dir / 'X_train_smote.csv', index=False)
y_train_smote.to_csv(processed_dir / 'y_train_smote.csv', index=False)
X_test.to_csv(processed_dir / 'X_test.csv', index=False)
y_test.to_csv(processed_dir / 'y_test.csv', index=False)


Initial shape: (284807, 31)
Final shape: (283726, 31)
Number of duplicate rows removed: 1081
Time column dropped and hour_of_day feature added.
Amount column scaled using log transformation.
Train set shape: (226980, 30), Test set shape: (56746, 30)
After SMOTE, Train set shape: (453204, 30), Test set shape: (56746, 30)
