In [18]:
import os

import numpy as np
import pandas as pd
from numpy import array
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

In [19]:
# Location of the Fraud_check CSV. The FRAUD_CHECK_CSV environment variable
# overrides it so the notebook can run on another machine; when the variable
# is unset the original absolute path is used, so existing runs are unaffected.
file_path = os.environ.get('FRAUD_CHECK_CSV', r'C:\Users\Kashi\Downloads\Fraud_check.csv')

df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [20]:
# Count the missing values in each column.
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [21]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [22]:
# A single encoder instance is shared by every encoding step in this notebook.
# Each fit_transform call refits it, so it only ever remembers the mapping of
# the column it was most recently applied to.
le = LabelEncoder()

# Text-valued columns (dtype object) that are converted to integer codes.
columns_to_encode = ['Undergrad', 'Marital.Status', 'Urban']

In [23]:
# Replace each categorical column, in place, with its integer codes.
# The encoder is refit per column, so every column gets its own mapping.
for col in columns_to_encode:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values

In [24]:
# Preview the frame now that the categorical columns hold integer codes.
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


In [25]:
# Build the target label: a taxable income of 30000 or less is 'Risky',
# anything above that is 'Good'.
df['Taxable.Income.Class'] = np.where(
    df['Taxable.Income'] <= 30000,
    'Risky',
    'Good',
)
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Taxable.Income.Class
0,0,2,68833,50047,10,1,Good
1,1,0,33700,134075,18,1,Good
2,0,1,36925,160205,30,1,Good
3,1,2,50190,193264,15,1,Good
4,0,1,81002,27533,28,0,Good


In [26]:
# Turn the text label into integer codes. LabelEncoder numbers the classes in
# sorted order, so 'Good' becomes 0 and 'Risky' becomes 1.
income_class = df['Taxable.Income.Class']
df['Taxable.Income.Class'] = le.fit_transform(income_class)
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Taxable.Income.Class
0,0,2,68833,50047,10,1,0
1,1,0,33700,134075,18,1,0
2,0,1,36925,160205,30,1,0
3,1,2,50190,193264,15,1,0
4,0,1,81002,27533,28,0,0


In [27]:
# The class label was derived directly from 'Taxable.Income', so the raw
# column is removed to keep it out of the feature set.
df = df.drop(columns='Taxable.Income')


In [28]:
# Preview the frame after 'Taxable.Income' has been removed.
df.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Taxable.Income.Class
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0


In [31]:
# Select the predictors by name rather than by position: a positional slice
# silently picks up the wrong columns (possibly the target itself) if the
# column order or count of df ever changes.
feature_columns = ['Undergrad', 'Marital.Status', 'City.Population', 'Work.Experience', 'Urban']
x = df[feature_columns]

# Encoded class label (0 = 'Good', 1 = 'Risky').
y = df['Taxable.Income.Class']

In [34]:
# Number of trees in the forest.
num_trees = 200

# Shuffled 10-fold cross-validation: the rows are randomly partitioned into 10
# folds and each fold is held out once. The fixed random_state makes the
# partition repeatable. (This is k-fold CV, not bootstrap resampling.)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# The forest is itself randomised (bootstrap samples per tree and a random
# feature subset per split), so it is seeded as well; without random_state the
# fold scores change on every run.
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_depth=5,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=7,
)

# One accuracy score per fold (the classifier's default scoring).
results = cross_val_score(model, x, y, cv=kfold)
print(results)


[0.78333333 0.73333333 0.86666667 0.91666667 0.75       0.76666667
 0.83333333 0.66666667 0.83333333 0.78333333]


In [35]:
# Average of the per-fold cross-validation scores.
print(results.mean())


0.7933333333333332


In [37]:
# Base estimator for the search; the seed keeps each candidate's fit repeatable.
# NOTE(review): n_jobs=-1 is set both here and on GridSearchCV below. Nesting
# the two may oversubscribe the CPU; consider parallelising only one level.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Hyper-parameter grid: 4 depths x 3 leaf sizes x 4 forest sizes = 48 candidates.
params = {
    'max_depth': [2, 3, 5, None],
    'min_samples_leaf': [5, 10, 20],
    'n_estimators': [50, 100, 200, 500],
}

# Exhaustive search: every candidate is scored by 4-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=4,
    n_jobs=-1,
    verbose=2,
    scoring="accuracy",
)
grid_search.fit(x, y)

Fitting 4 folds for each of 48 candidates, totalling 192 fits


GridSearchCV(cv=4, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, None],
                         'min_samples_leaf': [5, 10, 20],
                         'n_estimators': [50, 100, 200, 500]},
             scoring='accuracy', verbose=2)

In [38]:
# Winning hyper-parameter combination and its mean cross-validated accuracy.
# NOTE(review): if this score equals the share of the most frequent class in y,
# the selected model may be doing no better than always predicting that class —
# compare against y.value_counts(normalize=True) to confirm.
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 2, 'min_samples_leaf': 5, 'n_estimators': 50}
0.7933333333333333


In [41]:
# Estimator configured with the best parameters found by the search
# (GridSearchCV refits it on the full data when refit=True, the default).
grid_search.best_estimator_

RandomForestClassifier(max_depth=2, min_samples_leaf=5, n_estimators=50,
                       n_jobs=-1, random_state=42)