In [33]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import tree
plt.rcParams["figure.figsize"]=(10,5)
plt.rcParams["figure.dpi"]=300
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Fraud Check Data

In [34]:
FC=pd.read_csv("Fraud_check.csv")
FC

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


# EDA

In [35]:
FC.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.3+ KB


In [36]:
FC.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [37]:
FC=pd.get_dummies(FC)
FC

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1
2,36925,160205,30,1,0,0,1,0,0,1
3,50190,193264,15,0,1,0,0,1,0,1
4,81002,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,0,1,1,0,0,0,1
596,69967,55369,2,0,1,1,0,0,0,1
597,47334,154058,0,1,0,1,0,0,0,1
598,98592,180083,17,0,1,0,1,0,1,0


In [38]:
FC['Taxable.Income'].max()

99619

In [39]:
FC['Taxable.Income'].min()

10003

In [40]:
FC['Tax']=pd.cut(FC['Taxable.Income'], bins = [10002,30000,99620], labels = ["Risky", "Good"])

In [41]:
FC

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES,Tax
0,68833,50047,10,1,0,0,0,1,0,1,Good
1,33700,134075,18,0,1,1,0,0,0,1,Good
2,36925,160205,30,1,0,0,1,0,0,1,Good
3,50190,193264,15,0,1,0,0,1,0,1,Good
4,81002,27533,28,1,0,0,1,0,1,0,Good
...,...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,0,1,1,0,0,0,1,Good
596,69967,55369,2,0,1,1,0,0,0,1,Good
597,47334,154058,0,1,0,1,0,0,0,1,Good
598,98592,180083,17,0,1,0,1,0,1,0,Good


In [42]:
array=FC.values
array

array([[68833, 50047, 10, ..., 0, 1, 'Good'],
       [33700, 134075, 18, ..., 0, 1, 'Good'],
       [36925, 160205, 30, ..., 0, 1, 'Good'],
       ...,
       [47334, 154058, 0, ..., 0, 1, 'Good'],
       [98592, 180083, 17, ..., 1, 0, 'Good'],
       [96519, 158137, 16, ..., 1, 0, 'Good']], dtype=object)

In [43]:
S=array[:,1:10]
T=array[:,10]

In [44]:
S

array([[50047, 10, 1, ..., 1, 0, 1],
       [134075, 18, 0, ..., 0, 0, 1],
       [160205, 30, 1, ..., 0, 0, 1],
       ...,
       [154058, 0, 1, ..., 0, 0, 1],
       [180083, 17, 0, ..., 0, 1, 0],
       [158137, 16, 1, ..., 0, 1, 0]], dtype=object)

In [45]:
T

array(['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Risky', 'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good',
       'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good'

# Model Building

In [46]:
kfold=KFold(n_splits=10,shuffle=True)

In [47]:
model1=RandomForestClassifier(n_estimators=100,max_features=5)
model1

In [48]:
fit1=model1.fit(S,T)
pred=fit1.predict(S)
pred

array(['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Risky', 'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good',
       'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good'

In [49]:
result=cross_val_score(model1,S,T,cv=kfold)
result

array([0.78333333, 0.76666667, 0.66666667, 0.76666667, 0.76666667,
       0.7       , 0.63333333, 0.75      , 0.75      , 0.83333333])

In [50]:
result.mean()*100

74.16666666666667

# Company Data

In [51]:
company=pd.read_csv('Company_Data.csv')
company

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


# EDA

In [52]:
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [53]:
company.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [54]:
arr=company.values
arr

array([[9.5, 138, 73, ..., 17, 'Yes', 'Yes'],
       [11.22, 111, 48, ..., 10, 'Yes', 'Yes'],
       [10.06, 113, 35, ..., 12, 'Yes', 'Yes'],
       ...,
       [7.41, 162, 26, ..., 18, 'Yes', 'Yes'],
       [5.94, 100, 79, ..., 12, 'Yes', 'Yes'],
       [9.71, 134, 37, ..., 16, 'Yes', 'Yes']], dtype=object)

In [55]:
X=arr[:,0:6]
Y=arr[:,6]

In [56]:
X

array([[9.5, 138, 73, 11, 276, 120],
       [11.22, 111, 48, 16, 260, 83],
       [10.06, 113, 35, 10, 269, 80],
       ...,
       [7.41, 162, 26, 12, 368, 159],
       [5.94, 100, 79, 7, 284, 95],
       [9.71, 134, 37, 0, 27, 120]], dtype=object)

In [57]:
Y

array(['Bad', 'Good', 'Medium', 'Medium', 'Bad', 'Bad', 'Medium', 'Good',
       'Medium', 'Medium', 'Bad', 'Good', 'Medium', 'Good', 'Good',
       'Medium', 'Good', 'Good', 'Good', 'Medium', 'Medium', 'Good',
       'Medium', 'Medium', 'Bad', 'Good', 'Good', 'Medium', 'Bad', 'Bad',
       'Good', 'Medium', 'Good', 'Good', 'Medium', 'Medium', 'Good',
       'Medium', 'Medium', 'Bad', 'Bad', 'Bad', 'Medium', 'Medium',
       'Medium', 'Bad', 'Medium', 'Bad', 'Bad', 'Good', 'Bad', 'Bad',
       'Bad', 'Medium', 'Medium', 'Medium', 'Medium', 'Bad', 'Bad',
       'Medium', 'Bad', 'Medium', 'Bad', 'Medium', 'Medium', 'Medium',
       'Medium', 'Medium', 'Good', 'Medium', 'Good', 'Medium', 'Medium',
       'Good', 'Medium', 'Bad', 'Medium', 'Medium', 'Medium', 'Bad',
       'Bad', 'Good', 'Good', 'Bad', 'Bad', 'Medium', 'Medium', 'Good',
       'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Bad',
       'Medium', 'Good', 'Bad', 'Good', 'Bad', 'Medium', 'Medium',
       'Medium

# Model Building

In [58]:
kfold=KFold(n_splits=8,shuffle=True)

In [59]:
model=RandomForestClassifier(n_estimators=100,max_features=4)
model

In [60]:
fit=model.fit(X,Y)

In [61]:
pred=fit.predict(X)
pred

array(['Bad', 'Good', 'Medium', 'Medium', 'Bad', 'Bad', 'Medium', 'Good',
       'Medium', 'Medium', 'Bad', 'Good', 'Medium', 'Good', 'Good',
       'Medium', 'Good', 'Good', 'Good', 'Medium', 'Medium', 'Good',
       'Medium', 'Medium', 'Bad', 'Good', 'Good', 'Medium', 'Bad', 'Bad',
       'Good', 'Medium', 'Good', 'Good', 'Medium', 'Medium', 'Good',
       'Medium', 'Medium', 'Bad', 'Bad', 'Bad', 'Medium', 'Medium',
       'Medium', 'Bad', 'Medium', 'Bad', 'Bad', 'Good', 'Bad', 'Bad',
       'Bad', 'Medium', 'Medium', 'Medium', 'Medium', 'Bad', 'Bad',
       'Medium', 'Bad', 'Medium', 'Bad', 'Medium', 'Medium', 'Medium',
       'Medium', 'Medium', 'Good', 'Medium', 'Good', 'Medium', 'Medium',
       'Good', 'Medium', 'Bad', 'Medium', 'Medium', 'Medium', 'Bad',
       'Bad', 'Good', 'Good', 'Bad', 'Bad', 'Medium', 'Medium', 'Good',
       'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Medium', 'Bad',
       'Medium', 'Good', 'Bad', 'Good', 'Bad', 'Medium', 'Medium',
       'Medium

In [62]:
results=cross_val_score(model,X,Y,cv=kfold)
results

array([0.72, 0.64, 0.64, 0.56, 0.6 , 0.7 , 0.62, 0.68])

In [63]:
results.mean()*100

64.5