#### Data Transformation and Class-Imbalance Handling for the Credit Card Dataset
#### What we do here:
  * Normalize/scale Amount and Time (StandardScaler or MinMaxScaler)
  * Apply SMOTE or undersampling to training data only
  * Document the class distribution before and after resampling

#### Import Custom and Other Libraries

In [1]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import sys as sys
# Add the parent directory (relative to the kernel's working directory) to the
# module search path so that the `src` package imported in the next cell can
# be resolved.
sys.path.append("..")

In [2]:
# import custom library
from src.creditcard_transformer import CreditCardDataTransformer


#### Load The Data

In [3]:
# Read the cleaned credit-card transactions from the processed-data folder.
# NOTE(review): the rendered output shows an "Unnamed: 0" header — if that is a
# real column (an index written out by an earlier to_csv) it will be carried
# into the feature matrix; confirm and, if so, consider index_col=0.
credit_df = pd.read_csv(
    filepath_or_buffer="../data/processed/creditcard_cleaned.csv",
)
credit_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283721,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
283722,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
283723,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
283724,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
# Sanity check: number of rows that exactly repeat an earlier row.
credit_df.duplicated(keep="first").sum()

0

##### Train-test split before resampling to prevent data leakage

In [5]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
Y = credit_df["Class"]
X = credit_df.drop(columns=["Class"])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

# Re-attach the label to the training features: the transformer's
# fit_resample call below takes a single frame that includes 'Class'.
train_df = x_train.assign(Class=y_train)

In [12]:
# Column names of the training frame, kept for building feature names later.
# NOTE(review): train_df carries the label, so this list includes 'Class' —
# confirm that is what get_feature_names expects.
original_cols = list(train_df.columns)

##### Check class distribution BEFORE resampling

In [6]:
# Check the class distribution BEFORE resampling.
transformer = CreditCardDataTransformer()

# Store the distribution under its own name instead of assigning it back to
# y_train: overwriting y_train would replace the raw training labels with a
# two-row summary and make this cell give a different result when re-run.
class_dist_before = transformer.get_class_distribution(y_train)
class_dist_before

Class
0    99.833466
1     0.166534
Name: proportion, dtype: float64

#### Apply preprocessing + SMOTE (TRAIN ONLY)


In [7]:
# Fit the transformer on the training frame (features plus 'Class') and unpack
# the resampled training features and labels it returns.
# Per the section heading this step covers preprocessing + SMOTE; the actual
# logic lives in src.creditcard_transformer — confirm details there.
X_train_resampled,y_train_resampled=transformer.fit_resample(train_df)

#### Transform TEST data (NO SMOTE)

In [8]:
# Push the hold-out features through the transformer's preprocessor only;
# no resampling is applied to the test split.
# NOTE(review): presumably `preprocessor` was fitted inside fit_resample on the
# training data — confirm in src.creditcard_transformer.
X_test_transformed = transformer.preprocessor.transform(x_test)

#### Get feature names for SHAP explanation

In [14]:
# NOTE(review): original_cols was taken from train_df and therefore contains
# 'Class' — confirm get_feature_names is meant to receive the label column.
feature_names=transformer.get_feature_names(original_cols) # get feature names for SHAP explanation

In [15]:
# Label the transformed test matrix with the feature names so it can be read
# back as a named table for the SHAP analysis.
X_c_test_df = pd.DataFrame(
    data=X_test_transformed,
    columns=feature_names,
)

# Persist without the row index.
X_c_test_df.to_csv(
    "../data/processed/credit_test_shap.csv",
    index=False,
)

#### 📊 Class Distribution Documentation — AFTER Resampling (Task Requirement)

In [9]:
# Class distribution of the resampled training labels, for comparison with the
# distribution computed before resampling.
transformer.get_class_distribution(y_train_resampled)

Class
0    50.0
1    50.0
Name: proportion, dtype: float64

#### 📌 Save the Datasets for Future Modeling

In [10]:
import numpy as np

# Each entry pairs an output file with the array written to it.
# Training arrays are the resampled ones; test arrays are transformed but
# not resampled.
arrays_to_save = [
    ("../data/processed/credit_X_train.npy", X_train_resampled),
    ("../data/processed/credit_y_train.npy", y_train_resampled),
    ("../data/processed/credit_X_test.npy", X_test_transformed),
    ("../data/processed/credit_y_test.npy", y_test.values),
]

for output_path, array in arrays_to_save:
    np.save(output_path, array)