In [2]:
#Loading Libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [3]:
#Loading data set
df = pd.read_csv('..\Data\processed_data.csv')
df.sample(30000, random_state=42)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,PAYMENT,DEBIT,TRANSFER,CASH_OUT,CASH_IN
3737323,278,330218.42,20866.00,351084.42,452419.57,122201.15,0,0,0,0,0,0,1
264914,15,11647.08,30370.00,18722.92,0.00,0.00,0,0,1,0,0,0,0
85647,10,152264.21,106589.00,258853.21,201303.01,49038.80,0,0,0,0,0,0,1
5899326,403,1551760.63,0.00,0.00,3198359.45,4750120.08,0,0,0,0,1,0,0
2544263,206,78172.30,2921331.58,2999503.88,415821.90,337649.60,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4962742,351,128222.03,2082306.48,2210528.51,277346.60,149124.57,0,0,0,0,0,0,1
4001952,298,308520.08,10230.00,0.00,274313.82,582833.90,0,0,0,0,0,1,0
4462025,323,12955.60,150048.50,137092.90,0.00,0.00,0,0,1,0,0,0,0
1939819,177,119243.79,0.00,119243.79,350903.74,231659.95,0,0,0,0,0,0,1


In [3]:
#Select Target Data
y = df['isFraud']

#Loading the X Variables
X = df.drop(['isFraud'], axis = 1)

In [4]:
#Divide the data into Train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [5]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (5090096, 12)
y_train : (5090096,)
X_test : (1272524, 12)
y_test : (1272524,)


<H4>Building a Random Forest</H4>

In [6]:
rf_Model = RandomForestClassifier()

<p>Note |  It's important to note that this call won't inherently choose any parameters; </P>
<p>instead, it will use default parameters or any hyperparameters you've explicitly set when creating the RandomForestClassifier Object</p>

In [7]:
#Fitting the model
rf_Model.fit(X_train,y_train)
from pprint import pprint

In [8]:
print('Parameters currently in use:\n')
pprint(rf_Model.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


<H4>Checking for accuracy</H4>

In [9]:
print(f'Train Accuracy = {rf_Model.score(X_train,y_train):.3f}')
print(f'Test Accuracy = {rf_Model.score(X_test,y_test):.3f}')

Train Accuracy = 1.000
Test Accuracy = 1.000


In [10]:
from sklearn.metrics import f1_score
y_pred = rf_Model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

F1 Score: 0.8791283622744297


<H6>Achieving 100% accuracy on both training and test sets it's a sign of overfitting, especially if the dataset is not too complex. 

Overfitting occurs when a model learns to capture noise in the training data rather than true underlying patterns. In such cases, the model might perform poorly on truly unseen data.

Therefore we need to evaluate its performance on a separate validation set or by using other techniques.

if the test accuracy is reasonably high and close to the train accuracy, it suggests that the model is performing well and generalizing effectively. But 100% in this case means overfiitting and it will perform poorly on truly unseen data

In [11]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [12]:
# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_Model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
rf_random.best_params_

In [None]:
print(f'Train Accuracy = {rf_random.score(X_train,y_train):.3f}')
print(f'Test Accuracy = {rf_random.score(X_test,y_test):.3f}')

In [None]:
from sklearn.metrics import f1_score
y_pred = rf_random.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)