In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import re

warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv(r'C:\Users\HP\Desktop\Multicollinearity-main\data\fraud_dataset.csv')

In [3]:
df.drop(columns='Unnamed: 0',inplace=True)

In [4]:
df.isnull().sum()

transaction_id     0
duration          10
day                0
fraud              0
dtype: int64

One common mistake people make when dealing with missing values in their dataset is filling in the missing values for the entire dataset. This can lead to data leakage when using simple, interpolation, or bootstrapping methods to fill in missing values. These methods may take values from the test or cross-validation datasets, which can result in leaking data from these sets. To avoid this, it's important to only fill in missing values using information from the training set.

lets do some code

# The Wrong Way

# we fill with simple method ( " mean " )

In [5]:
SI=SimpleImputer(strategy='mean')
df['duration']=SI.fit_transform(df[['duration']])

In [6]:
df.isnull().sum()

transaction_id    0
duration          0
day               0
fraud             0
dtype: int64

## ML Model

In [7]:
df['fraud'].replace([True,False],[1,0],inplace=True)
df['weekend']=pd.get_dummies(df['day'],drop_first=True)
df.drop(columns=['day','transaction_id'],inplace=True)

In [8]:
X=df.drop(columns=['fraud'])
y=df['fraud']

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [10]:
model=LogisticRegression()

In [11]:
model.fit(X_train,y_train)

In [12]:
model.score(X_train,y_train)

0.9990900818926297

In [13]:
np.mean(cross_val_score(model,X_train,y_train,cv=KFold(n_splits=5,random_state=0,shuffle=True)))

0.9990901048179921

In [14]:
model.score(X_test,y_test)

0.998180991359709

# The Right Way

In [15]:
df=pd.read_csv(r'C:\Users\HP\Desktop\Multicollinearity-main\data\fraud_dataset.csv')
df.drop(columns='Unnamed: 0',inplace=True)
df.isnull().sum()

transaction_id     0
duration          10
day                0
fraud              0
dtype: int64

In [16]:
X=df.drop(columns=['fraud','transaction_id'])
y=df['fraud']

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [18]:
model=LogisticRegression(max_iter=100)
tr_cv=[]
SI=SimpleImputer(strategy='mean')
OHE=OneHotEncoder(sparse=False,drop='first')
for train,cv in KFold(n_splits=5,random_state=0,shuffle=True).split(X_train):
    Xtrain,ytrain=X_train.iloc[train],y_train.iloc[train]
    Xcv,ycv=X_train.iloc[train],y_train.iloc[train]
    Xtr=pd.DataFrame(OHE.fit_transform(Xtrain[['day']]),columns=[re.search(r'(_\w+)',OHE.get_feature_names_out()[0]).group()[1:]],index=Xtrain.index)
    Xtrain=pd.merge(Xtrain,Xtr,left_index=True,right_index=True).drop(columns='day')
    Xtrain=SI.fit_transform(Xtrain)
    
    Xc=pd.DataFrame(OHE.transform(Xcv[['day']]),columns=[re.search(r'(_\w+)',OHE.get_feature_names_out()[0]).group()[1:]],index=Xcv.index)
    Xcv=pd.merge(Xcv,Xc,left_index=True,right_index=True).drop(columns='day')
    Xcv=SI.fit_transform(Xcv)

    model.fit(Xtrain,ytrain)
    tr_cv.append(((model.score(Xtrain,ytrain),model.score(Xcv,ycv))))

In [19]:
tr=0
cv=0
for _ in tr_cv:
    tr+=_[0]
    cv+=_[1]

In [20]:
tr/5,cv/5

(0.9990521686327067, 0.9990521686327067)

In [21]:
model=LogisticRegression(max_iter=100)
SI=SimpleImputer(strategy='mean')
OHE=OneHotEncoder(sparse=False,drop='first')

x1=pd.DataFrame(OHE.fit_transform(X_train[['day']]),columns=[re.search(r'(_\w+)',OHE.get_feature_names_out()[0]).group()[1:]],index=X_train.index)
X_train=pd.merge(X_train,x1,left_index=True,right_index=True).drop(columns='day')

x2=pd.DataFrame(OHE.fit_transform(X_test[['day']]),columns=[re.search(r'(_\w+)',OHE.get_feature_names_out()[0]).group()[1:]],index=X_test.index)
X_test=pd.merge(X_test,x2,left_index=True,right_index=True).drop(columns='day')

X_train = SI.fit_transform(X_train)
X_test=SI.transform(X_test)

In [22]:
model.fit(X_train,y_train)

In [23]:
model.score(X_test,y_test)

0.998180991359709

| train | cv | test |
|----------|----------|----------|
| 0.9990900818926297 | 0.9990901048179921 | 0.998180991359709 |
| 0.9990521686327067 | 0.9990521686327067 | 0.998180991359709 |


Method 2 prevents leakage.

In Method 1, the imputer, one-hot encoder, and drop operations are performed on the entire dataset before splitting into training and testing sets. This can lead to data leakage because information from the testing set is used to transform the training set, which can cause the model to overfit and perform poorly on new data.

In Method 2, the imputer and one-hot encoder are performed separately on each fold of the data during cross-validation. This ensures that the model is not influenced by information from the testing set during training and helps to prevent overfitting.

The difference between the performance of Method 1 and Method 2 may not be large because the dataset used in the example may not contain a lot of missing values and categorical variables with many unique values. In such cases, the impact of data leakage may not be significant, and the performance difference between the two methods may not be large.


In [24]:
# to do
"""Implement both methods while scaling the data, and compare the performance of the two models. 
There may be a significant difference in performance between the two methods when the data is scaled."""

'Implement both methods while scaling the data, and compare the performance of the two models. \nThere may be a significant difference in performance between the two methods when the data is scaled.'