## Loading Libraries

In [None]:
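# To install these packages if they aren't there already, use the install commands below
# (on PyPI, sklearn is published as scikit-learn and imblearn as imbalanced-learn):
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn
# !pip install scipy
# !pip install imbalanced-learn
# !pip install graphviz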
import pandas as pd
import numpy as np
import random
import sklearn
from collections import Counter

## How to Load Fraud Data

In [None]:
# Read the fraud-transaction CSV (hosted on GitHub) straight into a DataFrame.
FRAUD_DATA_URL = "https://raw.githubusercontent.com/dphi-official/Imbalanced_classes/master/fraud_data.csv"
df = pd.read_csv(FRAUD_DATA_URL)

## Data Description :
IEEE Fraud Dataset was provided at Kaggle a year ago:
- **Categorical Features – Transaction**
- ProductCD – Product code
- card1 - card6 : payment card information, such as card type, card category, issue bank, country, etc.
- addr1, addr2
- P_emaildomain – Purchaser email domain
- R_emaildomain – Recipient email domain
- M1 - M9 – Match between names on card and address etc.
- **Categorical Features - Identity**
- DeviceType
- DeviceInfo
- id_12 - id_38 customer identity variables
- The TransactionDT feature is a timedelta from a given reference datetime (not an actual timestamp).
- **Outcome/Target Variable – isFraud**
- whether transaction is fraud or not

#### More about this dataset here: https://www.kaggle.com/c/ieee-fraud-detection/data


## The target variable here is 'isFraud', which indicates whether the transaction is fraudulent. Let us look at how many fraud transactions and how many normal transactions there are in the dataset.

In [None]:
# Number of rows for each distinct value of the isFraud target column.
df.isFraud.value_counts()

In [None]:
# Share of each isFraud value rather than the raw count (the raw counts are
# already displayed by the previous cell); this quantifies the class imbalance.
df['isFraud'].value_counts(normalize=True)

# Pre-Processing and Data Wrangling:

## Train Test Data Split - to evaluate performance in an unbiased manner

In [None]:
# Peek at the first five rows of the first three columns.
df.iloc[:, :3].head(5)


In [None]:
from sklearn.model_selection import train_test_split

# Target column on one side, every other column as predictors on the other
y = df['isFraud']
x = df.drop(columns='isFraud')

# Random 70% train / 30% test partition; the fixed seed makes the split repeatable
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, test_size=0.3, random_state=123
)


In [None]:
# Print dtypes and non-null counts for both feature partitions
for features in (xTrain, xTest):
    features.info()

In [None]:
# Summary (dtype, non-null count) of the held-out target values.
# NOTE(review): yTest is a Series; Series.info() only exists in newer pandas releases — confirm the installed version.
yTest.info()

## Step 1. Check for missingness in variables 

In [None]:
# Number of missing/null values in each training variable
xTrain.isna().sum()

## Eliminate automatically variables with more than 20% of missingness

In [None]:
# Eliminate automatically variables with 20% or more missingness.

# Keep a handle on the unfiltered, unimputed training frame for the
# before/after comparison made further down.
xTrain_before_filling = xTrain

# isnull().mean() is the fraction of missing values per column; keep only the
# columns below the 20% threshold. The .copy() makes the result an independent
# frame, so the in-place imputation done later neither raises
# SettingWithCopyWarning nor reaches back into xTrain_before_filling.
xTrain = xTrain[xTrain.columns[xTrain.isnull().mean() < 0.2]].copy()


In [None]:
# Display the training features that survived the missingness filter.
xTrain

## Let us see which among remaining columns have missing values in the code below

In [None]:
# Names of the remaining columns that still contain at least one missing value
missing_cols = xTrain.columns[xTrain.isnull().any()]
print(missing_cols)

In [None]:
# Fraction of rows where 'card5' is missing. Keep this variable in mind:
# it is used again later to compare the column before and after imputation.
xTrain['card5'].isnull().mean() # let us use this variable for comparison later, keep this in mind for now!

# Imputation : Filling missing values in a variable by reasonable approximations like mean of the variable to allow machine learning models to work

## Step 1a. Single Imputation Technique

## Impute Numeric Variables with mean of the variable

In [None]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so filling xTrain_single would silently modify
# xTrain as well.
xTrain_single = xTrain.copy()

# All column names, and the subset holding numeric data
cols = xTrain_single.columns
num_cols = xTrain_single.select_dtypes(include=np.number).columns


In [None]:
# Show the numeric column names selected above.
num_cols

In [None]:
# Replace missing numeric entries with the mean of their own column
numeric_part = xTrain_single.loc[:, num_cols]
xTrain_single.loc[:, num_cols] = numeric_part.fillna(numeric_part.mean())

# Report which columns were treated and their means after filling
print(num_cols)
print(xTrain_single.loc[:, num_cols].mean())

In [None]:
# Categorical columns = every column that is not numeric.
# A list comprehension (instead of a set difference) keeps the columns in
# their DataFrame order, so the result is identical on every run.
numeric_names = set(num_cols)
cat_cols = [c for c in cols if c not in numeric_names]
cat_cols

## Impute Categorical Variables with mode of the variable

In [None]:
# Show all column names of the frame being imputed.
cols

In [None]:
# Show the numeric column names again, for contrast with the categorical ones.
num_cols

In [None]:
# Most frequent value of each categorical column.
# DataFrame.mode() returns a DataFrame (one row per tied mode). Passing that
# frame to fillna() aligns on the row index, so only rows whose index label
# matches a row of the mode frame would be filled. Taking the first row gives
# a Series keyed by column name, which fillna() applies to every row.
cat_modes = xTrain_single.loc[:, cat_cols].mode().iloc[0]
xTrain_single.loc[:, cat_cols] = xTrain_single.loc[:, cat_cols].fillna(cat_modes)

# Column set used for training; the test data is restricted to it later
train_cols = xTrain_single.columns

# Missing values left per categorical column, counted over ALL rows
# (label-based .loc[:5] is not meaningful on the shuffled training index)
print(xTrain_single.loc[:, cat_cols].isnull().sum())
print(cat_cols)
print(xTrain_single.loc[:, cat_cols].mode())

### Check if missingness is now 0 for all variables remaining 
##### empty column list indicates no variable has missing values anymore!

In [None]:
# Variables that still contain missing values (an empty Index means none are left)
xTrain_single.columns[xTrain_single.isnull().any()]

### Comparing variable before and after filling: (remember variable 'Card5' earlier?)
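Each pair of cells below shows the card5 variable first after filling missing values with its mean (xTrain_single) and then before filling (xTrain_before_filling)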

In [None]:
# Histogram of card5 in the imputed frame; trailing ';' hides the Axes repr
xTrain_single['card5'].plot(kind='hist', figsize=(16, 8));

In [None]:
# Histogram of card5 in the frame saved before any filling; trailing ';' hides the Axes repr
xTrain_before_filling['card5'].plot(kind='hist', figsize=(16, 8));

In [None]:
# Summary statistics of card5 in the imputed frame.
xTrain_single['card5'].describe()

In [None]:
# Summary statistics of card5 in the frame saved before any filling.
xTrain_before_filling['card5'].describe()

### In the describe output above, we look at how the variable is distributed in terms of mean, standard deviation, etc.

In [None]:
### The variable 'card5' didn't change much as a whole (except for a very slight change in its standard deviation) even after filling with mean values. Filling with a measure like the mean therefore does not distort the existing variable much; it just keeps models from tripping over missing values.

In [None]:
# From here on, xTrain refers to the single-imputed frame.
xTrain= xTrain_single # let us use the single-imputed data as the input for the next preprocessing step

## Step 2. One hot encoding : To make all variables numeric to feed to machine learning process further

In [None]:
# Print dtypes/non-null counts, then one-hot encode the training features.
# Dummy columns are named <column>_<level>; drop_first=True drops the first
# level of each encoded variable, leaving k-1 dummy columns for k levels.
xTrain.info()
xTrain_dummy = pd.get_dummies(xTrain, prefix_sep='_', drop_first=True)

In [None]:
# Display the one-hot encoded training features.
xTrain_dummy

## Finalizing the data before training a model

In [None]:
# Training matrix handed to the model
final_tr = pd.DataFrame(xTrain_dummy)

# Quick look at the first rows, then the (rows, columns) shape
for preview in (final_tr.head(), final_tr.shape):
    print(preview)

In [None]:
## Decision Tree using grid search CV
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Candidate tree depths 3..19, compared by 5-fold cross-validated ROC AUC
parameters = {'max_depth': range(3, 20)}

# random_state pins the tree's internal randomness so the chosen depth and
# score are reproducible from run to run (same seed as the train/test split)
clf = GridSearchCV(tree.DecisionTreeClassifier(random_state=123), parameters,
                   n_jobs=4, cv=5, scoring='roc_auc')
clf.fit(X=final_tr, y=yTrain)
dt = clf.best_estimator_  # final decision tree: the best-scoring candidate
print (clf.best_score_, clf.best_params_)

In [None]:
## Visualizing the decision tree initially- load visualization libraries
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted tree to DOT text. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index for this parameter.
dot_data = export_graphviz(dt, out_file=None,
                           feature_names=list(final_tr.columns),
                           class_names=['No_Fraud', 'Fraud'],
                           filled=True, rounded=True,
                           special_characters=True)
graph = graphviz.Source(dot_data)
graph

## Part 2: Handling Class Imbalance


### What is class imbalance?
Class imbalance in a classification problem means there are too few data points of one class compared to the other class we are trying to predict. It leads the machine to learn too much about the dominant class and too little about the minority class!




###  Let us use data from step 2 (one hot encoded), use SMOTE and resample data 

## SMOTE: Synthetic Minority Oversampling Technique

A common recipe is to first oversample the minority class with SMOTE until it has 20 percent as many examples as the majority class (here about 8k), and then use random undersampling to reduce the number of examples in the majority class. Here we do NOT undersample the majority class, because oversampling alone worked best for this case; we will see the results later!


###  How to apply SMOTE oversampling and undersampling technique?

In [None]:
# Libraries and functions to load for handling class imbalance
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


over = SMOTE(sampling_strategy=0.2,random_state=2) # oversample the minority class; 0.2 is the requested minority/majority ratio, seed fixed for repeatability
# under = RandomUnderSampler(sampling_strategy=0.2) : deliberately disabled — we avoid undersampling the non-fraud data so that no real rows are removed; this worked well in this case
steps = [('o', over)] # only do oversampling
pipeline = Pipeline(steps=steps)
# Resample the one-hot encoded training features together with their labels
X_res, y_res = pipeline.fit_resample(xTrain_dummy, yTrain)


# Class counts before and after resampling
print('Original dataset shape %s' % Counter(yTrain))
print('Resampled dataset shape %s' % Counter(y_res))

### As you see in the above output, the dataset originally had just 1393 fraud cases and around 40k non-fraud transactions.
### Now, thanks to oversampling via SMOTE, the fraud and non-fraud cases are in the ratio 1:5, which gives the model relatively balanced information to learn from

### Now let us use SMOTE'd data for training a decision tree classifier on it!

In [None]:
from sklearn import preprocessing
#scaled_tr_res = preprocessing.StandardScaler().fit_transform(X_res)

# Wrap the resampled features in a DataFrame labelled with the training column names
final_tr_res = pd.DataFrame(data=X_res).set_axis(xTrain_dummy.columns, axis=1)
print(final_tr_res.head())

In [None]:
## Decision Tree using grid search CV
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Candidate tree depths 3..19, compared by 5-fold cross-validated ROC AUC
parameters = {'max_depth': range(3, 20)}

# random_state pins the tree's internal randomness so the chosen depth and
# score are reproducible from run to run (same seed as the train/test split)
clf = GridSearchCV(tree.DecisionTreeClassifier(random_state=123), parameters,
                   n_jobs=4, cv=5, scoring='roc_auc')
# NOTE(review): the folds are cut from already-resampled data, so synthetic
# points derived from a row can land in a different fold than that row; the
# cross-validated score is therefore likely optimistic.
clf.fit(X=final_tr_res, y=y_res)
dt_smote = clf.best_estimator_  # final decision tree trained on the SMOTE'd data
print (clf.best_score_, clf.best_params_)

## As you see from the best scores of both models above, there is an increase in performance on the resampled data after handling class imbalance. But let us not judge early! Let us judge by the performance on the unseen test data which we separated earlier

## See the tree for yourself- tree with SMOTE'd data

In [None]:
## Visualizing the decision tree trained on SMOTE'd data - load visualization libraries
from sklearn.tree import export_graphviz
import graphviz

# Export dt_smote (the tree fitted on the resampled data), not the first tree.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
dot_data = export_graphviz(dt_smote, out_file=None,
                           feature_names=list(final_tr_res.columns),
                           class_names=['No_Fraud', 'Fraud'],
                           filled=True, rounded=True,
                           special_characters=True)
graph = graphviz.Source(dot_data)
graph

## The real test! 
## Apply on Test Data : apply steps 1-4 below and then do prediction
1. Apply single imputation
2. Select only variables which are used for training
3. One Hot encode variables
4. Make sure test data again has exact same number of variables as training!

### Step 1: Account for missing values with single imputation like we did earlier

In [None]:

# Copy first: xTest came out of train_test_split, and filling it in place
# can trigger SettingWithCopyWarning.
xTest = xTest.copy()

cols = xTest.columns
num_cols = xTest.select_dtypes(include=np.number).columns
numeric_names = set(num_cols)
cat_cols = [c for c in cols if c not in numeric_names]

# Impute with statistics taken from the TRAINING data, so the test set goes
# through the same transformation the model was trained on and no information
# from the test set is used for preprocessing. xTrain is the imputed training
# frame at this point; mean/mode filling leaves its column means and modes
# unchanged. Columns absent from xTrain are left as they are: they are
# removed in the next step, which keeps only the training columns.
train_num_cols = [c for c in num_cols if c in xTrain.columns]
train_cat_cols = [c for c in cat_cols if c in xTrain.columns]

xTest.loc[:, train_num_cols] = xTest.loc[:, train_num_cols].fillna(
    xTrain.loc[:, train_num_cols].mean()
)
if train_cat_cols:
    # mode() returns a DataFrame; its first row is the most frequent value per column
    xTest.loc[:, train_cat_cols] = xTest.loc[:, train_cat_cols].fillna(
        xTrain.loc[:, train_cat_cols].mode().iloc[0]
    )
test_cols = xTest.columns


### Step 2:  Select only those features which are there in training #

In [None]:

# Restrict the test features to the columns kept for training
# (train_cols was captured from the imputed training frame earlier).
#train_cols = xTrain.columns
xTest = xTest[train_cols] 

### Step 3. One Hot encode variables

In [None]:
xTest.info()

# Dummify categorical vars with the SAME separator as the training data ('_'),
# otherwise no dummy column name matches and every categorical feature would
# end up as all zeros. All levels are kept here (no drop_first): the baseline
# level dropped in training is removed by the reindex below, so the dropped
# level is always the training one, even if the test set lacks it.
xTest_dummy = pd.get_dummies(xTest, prefix_sep='_')

## training dummy columns whose level never occurs in the test data
missing_levels_cols = sorted(set(xTrain_dummy.columns) - set(xTest_dummy.columns))

# Align to the training layout: same columns, same order. Levels missing from
# the test data become all-zero columns; columns unknown to training are dropped.
xTest_dummy = xTest_dummy.reindex(columns=xTrain_dummy.columns, fill_value=0)

### Step 4: make sure test data again has exact same number of variables as training !

In [None]:
from sklearn import preprocessing

# Final test matrix, built from the aligned one-hot encoded test features
final_ts = pd.DataFrame(data=xTest_dummy)

# Quick look at the first rows, then the (rows, columns) shape
for preview in (final_ts.head(), final_ts.shape):
    print(preview)

### Prediction on test data: Without SMOTE vs With SMOTE

In [None]:
# Class-probability predictions of the first tree (trained without SMOTE) on the prepared test features.
ytest_dt = dt.predict_proba(final_ts)



In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

# Score the second probability column (presumably isFraud == 1, given 0/1 labels) against the true test labels
auc_without_smote = roc_auc_score(yTest, ytest_dt[:, 1])
print('The ROC AUC score for 1st model without SMOTE is {}'.format(auc_without_smote))

In [None]:
# Probabilities from the tree trained on SMOTE'd data, scored on the same untouched test set
ytest_dt_smote = dt_smote.predict_proba(final_ts)
auc_with_smote = roc_auc_score(yTest, ytest_dt_smote[:, 1])
print('The ROC AUC score for 1st model after SMOTE is {}'.format(auc_with_smote))


## As you see, there is an increase in the performance of the decision tree classifier after SMOTE is applied, with respect to AUC. 

### Disclaimer: this is just for demo, there are various techniques one should consider before judging increase in performance. You will learn about them later!

# Additional Reading Material and things to try:

## Multivariate Imputation Example

### Multivariate Imputation : Imputation using values of other variables to predict the value of missing variable

In [None]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [None]:
# Copy so the iterative imputation does not modify xTrain in place
xTrain_multiple = xTrain.copy()
imp = IterativeImputer(max_iter=100, random_state=0)

# missing_cols (computed earlier) lists the training columns that had missing
# values; only the numeric ones can be passed to the imputer.
numeric_missing_cols = xTrain_multiple.loc[:, missing_cols].select_dtypes(include=np.number).columns

# Each selected column is modelled from the other selected columns
xTrain_multiple.loc[:, numeric_missing_cols] = imp.fit_transform(
    xTrain_multiple.loc[:, numeric_missing_cols].values
)
print("The missing columns that are imputed are: " + ", ".join(map(str, numeric_missing_cols)))
# NOTE(review): xTrain has already been mean-imputed at this point, so there
# may be nothing left for the imputer to fill; to see its effect, start from
# the un-imputed data (xTrain_before_filling) instead — confirm intent.

##  Scaling/Normalizing Data

The preprocessing module further provides a utility class StandardScaler that implements the Transformer API to compute the mean and standard deviation on a training set so as to be able to later reapply the same transformation on the testing set.

In [None]:
from sklearn import preprocessing

# Fit the scaler on the training features and standardise them in one step
scaler = preprocessing.StandardScaler()
scaled_tr = scaler.fit_transform(xTrain_dummy)

# Put the scaled values back into a DataFrame with the training column names
final_tr = pd.DataFrame(data=scaled_tr, columns=xTrain_dummy.columns)
print(final_tr.head())
print(final_tr.shape)