# Use Random Forest to prepare a model on fraud data 
# treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the data set ' Fraud_check1.csv '
df= pd.read_csv('Fraud_check1.csv')
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [3]:
# Basic info about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.3+ KB


In [4]:
df.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [5]:
df.duplicated().sum()

0

# No null and duplicated dvalues

In [6]:
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [7]:
corel = df.corr()

In [8]:
corel

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
Taxable.Income,1.0,-0.064387,-0.001818
City.Population,-0.064387,1.0,0.013135
Work.Experience,-0.001818,0.013135,1.0


In [9]:
# No strong correlation between any of the indepandent variables

In [10]:
# One hot encoding by using pandas
df=pd.get_dummies(df,columns=['Undergrad','Marital.Status','Urban'], drop_first=True)

In [11]:
df.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,68833,50047,10,0,0,1,1
1,33700,134075,18,1,0,0,1
2,36925,160205,30,0,1,0,1
3,50190,193264,15,1,0,1,1
4,81002,27533,28,0,1,0,0


In [12]:
# Create bins based on the given statistics
bins = [10003, 30000, 99619]  # Define bin edges
labels = ['Risky', 'Good']  # Define labels for the bins

In [13]:
# Create a new column 'Income_Category' with the bin labels
df['Income_Category'] = pd.cut(df['Taxable.Income'], bins=bins, labels=labels)

In [14]:
df.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,Income_Category
0,68833,50047,10,0,0,1,1,Good
1,33700,134075,18,1,0,0,1,Good
2,36925,160205,30,0,1,0,1,Good
3,50190,193264,15,1,0,1,1,Good
4,81002,27533,28,0,1,0,0,Good


In [15]:
df['Income_Category'].value_counts()

Good     476
Risky    123
Name: Income_Category, dtype: int64

In [16]:
df = df.fillna('Good')

In [17]:
df['Income_Category'].value_counts()

Good     477
Risky    123
Name: Income_Category, dtype: int64

In [18]:
df['Income_Category']=df['Income_Category'].map({'Good':1,'Risky':0})

In [19]:
df.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,Income_Category
0,68833,50047,10,0,0,1,1,1
1,33700,134075,18,1,0,0,1,1
2,36925,160205,30,0,1,0,1,1
3,50190,193264,15,1,0,1,1,1
4,81002,27533,28,0,1,0,0,1


In [20]:
##Droping the Taxable income variable
df.drop(["Taxable.Income"],axis=1,inplace=True)

In [21]:
df.head()

Unnamed: 0,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,Income_Category
0,50047,10,0,0,1,1,1
1,134075,18,1,0,0,1,1
2,160205,30,0,1,0,1,1
3,193264,15,1,0,1,1,1
4,27533,28,0,1,0,0,1


# 

# No null values after performing the basic EDA as per problem statement

# Now the dataset is ready for the problem solving

In [23]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [24]:
# # Define the features (X) and the target variable (y)
x = df.drop('Income_Category',axis = 1)
y = df['Income_Category']

In [25]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [26]:
# Define the Random Forest model and perform with default parameter values
rfc = RandomForestClassifier()

In [27]:
rfc.fit(x_train,y_train)

In [28]:
pred_train = rfc.predict(x_train)
pred_train[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [29]:
pred_test = rfc.predict(x_test)
pred_test[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [30]:
# Evaluate the model
acc_rfc_train = accuracy_score(y_train, pred_train)
print(f'Accuracy: {acc_rfc_train}')

Accuracy: 1.0


In [31]:
# Evaluate the model
acc_rfc_test = accuracy_score(y_test, pred_test)
print(f'Accuracy: {acc_rfc_test}')

Accuracy: 0.7722222222222223


# Hyper parameter tuning for finding the best parameter values

In [36]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [2,4,6,8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]
}

In [37]:
# Use GridSearchCV for hyperparameter tuning
gv = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')
gv.fit(x_train, y_train)

In [38]:
# Get the best parameters from the grid search
best_params = gv.best_params_
best_params

{'max_depth': 8,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 50}

In [47]:
# Build a final random forest model to see the performance
rfc_best = RandomForestClassifier(max_depth=8,min_samples_leaf=4,n_estimators = 50,min_samples_split=2)
rfc_best.fit(x_train,y_train)

In [48]:
pred_train_best = rfc_best.predict(x_train)
pred_train_best[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [49]:
pred_test_best = rfc_best.predict(x_test)
pred_test_best[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [50]:
acc_best_train = accuracy_score(y_train,pred_train_best)
print(f'Accuracyscore:{acc_best_train}')

Accuracyscore:0.8


In [51]:
acc_best_test = accuracy_score(y_test,pred_test_best)
print(f'Accuracy score:{acc_best_test}')

Accuracy score:0.7944444444444444


# The Final model is giving almost same accuracy on the train and test data 