- Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

In [2]:
df = pd.read_csv('Fraud_check.csv')

In [3]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [4]:
le = preprocessing.LabelEncoder()
df['Undergrad']=le.fit_transform(df['Undergrad'])
df['Marital.Status']=le.fit_transform(df['Marital.Status'])
df['Urban']=le.fit_transform(df['Urban'])

In [5]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0
...,...,...,...,...,...,...
595,1,0,76340,39492,7,1
596,1,0,69967,55369,2,1
597,0,0,47334,154058,0,1
598,1,1,98592,180083,17,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Undergrad        600 non-null    int32
 1   Marital.Status   600 non-null    int32
 2   Taxable.Income   600 non-null    int64
 3   City.Population  600 non-null    int64
 4   Work.Experience  600 non-null    int64
 5   Urban            600 non-null    int32
dtypes: int32(3), int64(3)
memory usage: 21.2 KB


In [7]:
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
count,600.0,600.0,600.0,600.0,600.0,600.0
mean,0.52,1.046667,55208.375,108747.368333,15.558333,0.503333
std,0.500017,0.821958,26204.827597,49850.075134,8.842147,0.500406
min,0.0,0.0,10003.0,25779.0,0.0,0.0
25%,0.0,0.0,32871.5,66966.75,8.0,0.0
50%,1.0,1.0,55074.5,106493.5,15.0,1.0
75%,1.0,2.0,78611.75,150114.25,24.0,1.0
max,1.0,2.0,99619.0,199778.0,30.0,1.0


In [9]:
Target = []
for x1 in df['Taxable.Income']:
    if x1 <= 30000:
        Target.append("Risky")
    else:
        Target.append('Good')
df['Target'] = Target
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Target
0,0,2,68833,50047,10,1,Good
1,1,0,33700,134075,18,1,Good
2,0,1,36925,160205,30,1,Good
3,1,2,50190,193264,15,1,Good
4,0,1,81002,27533,28,0,Good
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,Good
596,1,0,69967,55369,2,1,Good
597,0,0,47334,154058,0,1,Good
598,1,1,98592,180083,17,0,Good


In [10]:
df['Target']=le.fit_transform(df['Target'])

In [11]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Target
0,0,2,68833,50047,10,1,0
1,1,0,33700,134075,18,1,0
2,0,1,36925,160205,30,1,0
3,1,2,50190,193264,15,1,0
4,0,1,81002,27533,28,0,0
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,0
596,1,0,69967,55369,2,1,0
597,0,0,47334,154058,0,1,0
598,1,1,98592,180083,17,0,0


# Spiltting Data into Training and Testing

In [12]:
x = df.drop(axis =1,columns={'Taxable.Income','Target'})
y = df['Target']

In [13]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

# Model Building
- Random Forest

In [14]:
rf = RandomForestClassifier()

In [15]:
rf.fit(xtrain,ytrain)
ypred = rf.predict(xtest)

In [16]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86        97
           1       0.00      0.00      0.00        23

    accuracy                           0.75       120
   macro avg       0.40      0.46      0.43       120
weighted avg       0.64      0.75      0.69       120



In [17]:
print(rf.score(xtrain,ytrain))
rf.score(xtest,ytest)

1.0


0.75

# Hyperparameter Tuning
- Finding Best Parameter

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
param_grid = {
    'n_estimators': [10,20,30,40,50,60,70,80],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3,4,5,6,7,8,9],
    'max_leaf_nodes': [3,4,5,6,7,8,9,10],
}

In [20]:
grid_search = GridSearchCV(RandomForestClassifier(),
                           param_grid=param_grid)

In [21]:
grid_search.fit(xtrain,ytrain)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9],
                         'max_features': ['sqrt', 'log2', None],
                         'max_leaf_nodes': [3, 4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80]})

In [22]:
grid_search.best_params_

{'max_depth': 8,
 'max_features': 'log2',
 'max_leaf_nodes': 9,
 'n_estimators': 10}

# Final Model

In [23]:
rf1 = RandomForestClassifier(n_estimators=10, max_features='log2',max_leaf_nodes=9,max_depth=8)

In [24]:
rf1.fit(xtrain,ytrain)
ypred = rf1.predict(xtest)

In [25]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89        97
           1       0.00      0.00      0.00        23

    accuracy                           0.81       120
   macro avg       0.40      0.50      0.45       120
weighted avg       0.65      0.81      0.72       120



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
print(rf1.score(xtrain,ytrain))
rf1.score(xtest,ytest)

0.7895833333333333


0.8083333333333333