In [12]:
import pandas as pd 
import numpy as np 
from sklearn import preprocessing 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score 
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv("Fraud_check.csv")
data 

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [4]:
data.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban'],
      dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [6]:
data.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [9]:
label_encoder = preprocessing.LabelEncoder() 

In [10]:
data['Undergrad']=label_encoder.fit_transform(data['Undergrad']) 
data['Urban']=label_encoder.fit_transform(data['Urban'])
data['Marital.Status']=label_encoder.fit_transform(data['Marital.Status'])
data 

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0
...,...,...,...,...,...,...
595,1,0,76340,39492,7,1
596,1,0,69967,55369,2,1
597,0,0,47334,154058,0,1
598,1,1,98592,180083,17,0


In [13]:
np.min(data['Taxable.Income'])

10003

In [14]:
np.max(data['Taxable.Income']) 

99619

In [15]:
taxable_new=pd.cut(data['Taxable.Income'],bins=[10003,30000,99619],labels=['Risky','Good'])
taxable_new 

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [16]:
x=pd.Series(taxable_new)
x 

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [17]:
data['Taxable_Categorical']=x
data 

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Taxable_Categorical
0,0,2,68833,50047,10,1,Good
1,1,0,33700,134075,18,1,Good
2,0,1,36925,160205,30,1,Good
3,1,2,50190,193264,15,1,Good
4,0,1,81002,27533,28,0,Good
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,Good
596,1,0,69967,55369,2,1,Good
597,0,0,47334,154058,0,1,Good
598,1,1,98592,180083,17,0,Good


In [18]:
data1=data.drop(['Taxable.Income'],axis=1)
data1 

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Taxable_Categorical
0,0,2,50047,10,1,Good
1,1,0,134075,18,1,Good
2,0,1,160205,30,1,Good
3,1,2,193264,15,1,Good
4,0,1,27533,28,0,Good
...,...,...,...,...,...,...
595,1,0,39492,7,1,Good
596,1,0,55369,2,1,Good
597,0,0,154058,0,1,Good
598,1,1,180083,17,0,Good


In [19]:
data1['Taxable_Categorical']=np.where(data['Taxable_Categorical'].str.contains('Good'),0,1) 
data1  

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Taxable_Categorical
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0
...,...,...,...,...,...,...
595,1,0,39492,7,1,0
596,1,0,55369,2,1,0
597,0,0,154058,0,1,0
598,1,1,180083,17,0,0


In [21]:
X=data1.iloc[:,0:5]
Y=data1['Taxable_Categorical'] 
seed = 7 

In [23]:
data1.Taxable_Categorical.value_counts()  

0    477
1    123
Name: Taxable_Categorical, dtype: int64

In [24]:
num_trees = 100
max_features = 3 

In [25]:
kfold = KFold(n_splits=10, random_state=7, shuffle = True) 
kfold  

KFold(n_splits=10, random_state=7, shuffle=True)

In [26]:
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) 
model 

In [28]:
results = cross_val_score(model, X, Y, cv=kfold) 
results 

array([0.78333333, 0.66666667, 0.8       , 0.8       , 0.73333333,
       0.73333333, 0.73333333, 0.66666667, 0.8       , 0.76666667])

In [29]:
print(results.mean()) 

0.7483333333333333
