In [1]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#!pip install imblearn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

from collections import Counter

from Clean_Function import  provider_group_feats

## Load testing dataset

## Load data

In [2]:
# Read the training table from the Data/ folder
# (file name suggests features grouped per provider — TODO confirm against the upstream notebook)
train_csv = "Data/df_groupby_provider1.csv"
df_train = pd.read_csv(train_csv)

In [3]:
## Label-encode PotentialFraud

# Fit on the two known labels explicitly so the mapping is fixed regardless of
# which labels actually occur in the data: 0 = non-fraud ("No"), 1 = fraud ("Yes").
label_encoder = LabelEncoder()
label_encoder.fit(["No", "Yes"])

# Encode only while the column still holds the raw labels. Without this guard,
# re-running the cell hands the already-encoded 0/1 values to transform(),
# which raises ValueError ("previously unseen labels").
if not pd.api.types.is_numeric_dtype(df_train['PotentialFraud']):
    df_train['PotentialFraud'] = label_encoder.transform(df_train['PotentialFraud'])

## Separate features (x) and target (y) for the train dataset

LR_x = df_train.drop(columns=['PotentialFraud'])
LR_y = df_train['PotentialFraud']


In [4]:
# Percentage of rows per class — shows the classes are imbalanced
LR_y.value_counts(normalize=True).mul(100)

0    87.302928
1    12.697072
Name: PotentialFraud, dtype: float64

#### Split the dataset into train and validation sets

In [5]:
# 80:20 train/validation split, stratified on the target so both parts keep
# the same fraud / non-fraud ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    LR_x,
    LR_y,
    test_size=0.2,
    stratify=LR_y,
    shuffle=True,
    random_state=42,
)

section_rule = "*"*60

# Dataset shapes
print('\n')
for shape_label, part in [
    ('Shape of X_train data :', X_train),
    ('Shape of X_test data :', X_test),
    ('Shape of Y_train data :', Y_train),
    ('Shape of Y_test data :', Y_test),
]:
    print(shape_label, part.shape)
print('\n')
print(section_rule)

# Class ratios (percentage of rows per class)
print('\n')
for ratio_label, target in [
    ('Class ratio - Fraud/Non-Fraud (Y_train): \n', Y_train),
    ('Class ratio - Fraud/Non-Fraud (Y_test): \n', Y_test),
]:
    print(ratio_label, target.value_counts(normalize=True)*100)
print('\n')
print(section_rule)



Shape of X_train data : (5683, 46)
Shape of X_test data : (1421, 46)
Shape of Y_train data : (5683,)
Shape of Y_test data : (1421,)


************************************************************


Class ratio - Fraud/Non-Fraud (Y_train): 
 0    87.295443
1    12.704557
Name: PotentialFraud, dtype: float64
Class ratio - Fraud/Non-Fraud (Y_test): 
 0    87.332864
1    12.667136
Name: PotentialFraud, dtype: float64


************************************************************


#### Deal with Imbalanced Data:  
- We only oversample/undersample the training set and leave the validation set untouched. If we balanced the validation set as well, the model might get a better validation score, but that score would not reflect the real class distribution, so the model may perform worse after deployment.

https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

Performance Analysis after Resampling
To understand the effect of oversampling, I will be using a bank customer churn dataset. It is an imbalanced dataset where the target variable, churn, has 81.5% customers not churning and 18.5% customers who have churned.

A comparative analysis was done on the dataset using 3 classifier models: Logistic Regression, Decision Tree, and Random Forest. As discussed earlier, we’ll ignore the accuracy metric when evaluating the performance of the classifiers on this imbalanced dataset. Here, we are more interested in knowing which customers will churn in the coming months. Therefore, we’ll focus on metrics like precision, recall, and F1-score to understand how well the classifiers determine which customers will churn.

1. SMOTE: Synthetic Minority Oversampling Technique
- SMOTE is an oversampling technique where synthetic samples are generated for the minority class. This algorithm helps to overcome the overfitting problem posed by random oversampling. It focuses on the feature space to generate new instances by interpolating between positive instances that lie close together.

In [6]:
# Oversample the minority class of the training split only
smote_sampler = SMOTE(random_state=42, k_neighbors=5)

# Class counts before resampling
counter_before = Counter(Y_train)
print('Before', counter_before)

X_train_sm, Y_train_sm = smote_sampler.fit_resample(X_train, Y_train)

# Class counts after resampling
counter_after_sm = Counter(Y_train_sm)
print('After', counter_after_sm)

Before Counter({0: 4961, 1: 722})
After Counter({0: 4961, 1: 4961})


2. Hybridization: SMOTE + ENN
- SMOTE + ENN is a hybrid technique in which a larger number of observations are removed from the sample space. Here, ENN (Edited Nearest Neighbours) is an undersampling technique where the nearest neighbors of each majority-class instance are estimated. If the nearest neighbors misclassify that particular instance of the majority class, then that instance gets deleted.
Integrating this technique with the oversampled data produced by SMOTE helps in doing extensive data cleaning. Here, on misclassification by the nearest neighbors, samples from both classes are removed. This results in a clearer and more concise class separation.

In [7]:
# Resample the training split only, using the combined SMOTE + ENN sampler
smote_enn_sampler = SMOTEENN(random_state=42)
X_train_smenn, Y_train_smenn = smote_enn_sampler.fit_resample(X_train, Y_train)

# Class counts after resampling
counter_after_smenn = Counter(Y_train_smenn)
print('After', counter_after_smenn)

After Counter({1: 4554, 0: 3199})


#### Save the original train/validation sets and the SMOTE and SMOTE+ENN resampled training sets for future use:

In [8]:
# Every split is written as CSV without the index column.
csv_exports = [
    # Original files
    ('Data/X_train.csv', X_train),
    ('Data/Y_train.csv', Y_train),
    ('Data/X_test.csv', X_test),
    ('Data/Y_test.csv', Y_test),
    # SMOTE files
    ('Data/X_train_sm.csv', X_train_sm),
    ('Data/Y_train_sm.csv', Y_train_sm),
    # SMOTE + ENN files
    ('Data/X_train_smenn.csv', X_train_smenn),
    ('Data/Y_train_smenn.csv', Y_train_smenn),
]

for csv_path, split_data in csv_exports:
    split_data.to_csv(csv_path, index=False)


In [9]:
# Class counts in the validation split, which is left un-resampled
Counter(Y_test)

Counter({0: 1241, 1: 180})