In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
company = pd.read_csv('Company_Data.csv')
company.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [3]:
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [4]:
company.shape

(400, 11)

In [5]:
company['Sales'] = pd.cut(company['Sales'],3) # cutting the Sales column so that it can be converted into catogery

In [6]:
le = LabelEncoder()

In [7]:
company['Sales'] = le.fit_transform(company['Sales']) #converting dataset into catogorical data
company['ShelveLoc'] = le.fit_transform(company['ShelveLoc'])
company['Urban'] = le.fit_transform(company['Urban'])
company['US'] = le.fit_transform(company['US'])

In [8]:
company

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,1,138,73,11,276,120,0,42,17,1,1
1,2,111,48,16,260,83,1,65,10,1,1
2,1,113,35,10,269,80,2,59,12,1,1
3,1,117,100,4,466,97,2,55,14,1,1
4,0,141,64,3,340,128,0,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,2,138,108,17,203,128,1,33,14,1,1
396,1,139,23,3,37,120,2,55,11,0,1
397,1,162,26,12,368,159,2,40,18,1,1
398,1,100,79,7,284,95,0,50,12,1,1


In [9]:
min = MinMaxScaler() #transforming the dataset to get better result
std_company =  min.fit_transform(company)
std_company = pd.DataFrame(std_company,columns=('Sales','CompPrice','Income','Advertising','Population','Price','ShelveLoc','Age','Education','Urban','US'))
std_company

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,0.5,0.622449,0.525253,0.379310,0.533066,0.574850,0.0,0.309091,0.875,1.0,1.0
1,1.0,0.346939,0.272727,0.551724,0.501002,0.353293,0.5,0.727273,0.000,1.0,1.0
2,0.5,0.367347,0.141414,0.344828,0.519038,0.335329,1.0,0.618182,0.250,1.0,1.0
3,0.5,0.408163,0.797980,0.137931,0.913828,0.437126,1.0,0.545455,0.500,1.0,1.0
4,0.0,0.653061,0.434343,0.103448,0.661323,0.622754,0.0,0.236364,0.375,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
395,1.0,0.622449,0.878788,0.586207,0.386774,0.622754,0.5,0.145455,0.500,1.0,1.0
396,0.5,0.632653,0.020202,0.103448,0.054108,0.574850,1.0,0.545455,0.125,0.0,1.0
397,0.5,0.867347,0.050505,0.413793,0.717435,0.808383,1.0,0.272727,1.000,1.0,1.0
398,0.5,0.234694,0.585859,0.241379,0.549098,0.425150,0.0,0.454545,0.250,1.0,1.0


In [10]:
std_company.corr() #Checking the correlation between the columns

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
Sales,1.0,0.060345,0.106786,0.222274,0.005188,-0.359774,0.167151,-0.21435,-0.030147,0.017709,0.154487
CompPrice,0.060345,1.0,-0.080653,-0.024199,-0.094707,0.584848,0.02335,-0.100239,0.025197,0.066594,0.016869
Income,0.106786,-0.080653,1.0,0.058995,-0.007877,-0.056698,-0.067678,-0.00467,-0.056855,0.037967,0.089601
Advertising,0.222274,-0.024199,0.058995,1.0,0.265652,0.044537,0.008544,-0.004557,-0.033594,0.042035,0.68446
Population,0.005188,-0.094707,-0.007877,0.265652,1.0,-0.012144,-0.044772,-0.042663,-0.106378,-0.052025,0.060564
Price,-0.359774,0.584848,-0.056698,0.044537,-0.012144,1.0,0.014633,-0.102177,0.011747,0.047016,0.057861
ShelveLoc,0.167151,0.02335,-0.067678,0.008544,-0.044772,0.014633,1.0,0.056488,0.001492,-0.063978,-0.03976
Age,-0.21435,-0.100239,-0.00467,-0.004557,-0.042663,-0.102177,0.056488,1.0,0.006488,0.028479,0.008652
Education,-0.030147,0.025197,-0.056855,-0.033594,-0.106378,0.011747,0.001492,0.006488,1.0,-0.033094,-0.07825
Urban,0.017709,0.066594,0.037967,0.042035,-0.052025,0.047016,-0.063978,0.028479,-0.033094,1.0,0.047085


Population column is having very low correlation that's why we can drop that column

In [11]:
std_company = std_company.drop(columns='Population',axis=1)
std_company.shape

(400, 10)

In [12]:
std_company['Sales'] = le.fit_transform(std_company['Sales'])
std_company['ShelveLoc'] = le.fit_transform(std_company['ShelveLoc'])
std_company['Urban'] = le.fit_transform(std_company['Urban'])
std_company['US'] = le.fit_transform(std_company['US'])

In [13]:
std_company['Sales'].value_counts()

1    247
0    102
2     51
Name: Sales, dtype: int64

In [14]:
X = std_company.iloc[:,1:] #Splitting the data into featuers and labels
X.head()


Unnamed: 0,CompPrice,Income,Advertising,Price,ShelveLoc,Age,Education,Urban,US
0,0.622449,0.525253,0.37931,0.57485,0,0.309091,0.875,1,1
1,0.346939,0.272727,0.551724,0.353293,1,0.727273,0.0,1,1
2,0.367347,0.141414,0.344828,0.335329,2,0.618182,0.25,1,1
3,0.408163,0.79798,0.137931,0.437126,2,0.545455,0.5,1,1
4,0.653061,0.434343,0.103448,0.622754,0,0.236364,0.375,1,0


In [15]:
Y = std_company.iloc[:,0] #Splitting the data into featuers and labels
Y.head()

0    1
1    2
2    1
3    1
4    0
Name: Sales, dtype: int64

In [16]:
model = RandomForestClassifier() # Getting the optimum hyperparameters using GridSearchcv
max_features = [3,4,5,6]
n_estimators = [50,100,150,200]
cv = KFold()
param_grid = dict(max_features=max_features,n_estimators=n_estimators)
grid = GridSearchCV(estimator=model,param_grid=param_grid,verbose=10,cv=cv)
grid.fit(X,Y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] max_features=3, n_estimators=50 .................................
[CV] ..... max_features=3, n_estimators=50, score=0.637, total=   0.1s
[CV] max_features=3, n_estimators=50 .................................
[CV] ..... max_features=3, n_estimators=50, score=0.713, total=   0.1s
[CV] max_features=3, n_estimators=50 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ..... max_features=3, n_estimators=50, score=0.775, total=   0.1s
[CV] max_features=3, n_estimators=50 .................................
[CV] ..... max_features=3, n_estimators=50, score=0.738, total=   0.1s
[CV] max_features=3, n_estimators=50 .................................
[CV] ..... max_features=3, n_estimators=50, score=0.762, total=   0.1s
[CV] max_features=3, n_estimators=100 ................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s


[CV] .... max_features=3, n_estimators=100, score=0.662, total=   0.2s
[CV] max_features=3, n_estimators=100 ................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.7s remaining:    0.0s


[CV] .... max_features=3, n_estimators=100, score=0.700, total=   0.2s
[CV] max_features=3, n_estimators=100 ................................
[CV] .... max_features=3, n_estimators=100, score=0.762, total=   0.2s
[CV] max_features=3, n_estimators=100 ................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.1s remaining:    0.0s


[CV] .... max_features=3, n_estimators=100, score=0.725, total=   0.2s
[CV] max_features=3, n_estimators=100 ................................
[CV] .... max_features=3, n_estimators=100, score=0.750, total=   0.2s
[CV] max_features=3, n_estimators=150 ................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.3s remaining:    0.0s


[CV] .... max_features=3, n_estimators=150, score=0.625, total=   0.3s
[CV] max_features=3, n_estimators=150 ................................
[CV] .... max_features=3, n_estimators=150, score=0.700, total=   0.3s
[CV] max_features=3, n_estimators=150 ................................
[CV] .... max_features=3, n_estimators=150, score=0.775, total=   0.3s
[CV] max_features=3, n_estimators=150 ................................
[CV] .... max_features=3, n_estimators=150, score=0.700, total=   0.3s
[CV] max_features=3, n_estimators=150 ................................
[CV] .... max_features=3, n_estimators=150, score=0.738, total=   0.3s
[CV] max_features=3, n_estimators=200 ................................
[CV] .... max_features=3, n_estimators=200, score=0.637, total=   0.3s
[CV] max_features=3, n_estimators=200 ................................
[CV] .... max_features=3, n_estimators=200, score=0.738, total=   0.3s
[CV] max_features=3, n_estimators=200 ................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   18.6s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_s

In [17]:
print('The best parameters are ',grid.best_params_)

The best parameters are  {'max_features': 6, 'n_estimators': 100}


In [44]:
random = RandomForestClassifier(n_estimators=100,max_features=6) #building the model

In [45]:
random.fit(X,Y) #training the model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=6,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
pred = random.predict(X)

In [47]:
print(accuracy_score(Y,pred))

0.98


In [48]:
result = cross_val_score(random, X, Y,cv=KFold(10))
result


array([0.6  , 0.725, 0.6  , 0.7  , 0.75 , 0.85 , 0.75 , 0.725, 0.775,
       0.8  ])

In [49]:
print(np.round(result.mean(),2))

0.73


In [24]:
from sklearn.metrics import confusion_matrix
con = confusion_matrix(Y,pred)
print(con)

[[102   0   0]
 [  0 247   0]
 [  0   0  51]]


In [25]:
random.feature_importances_

array([0.1498007 , 0.123105  , 0.08804824, 0.23075034, 0.15754359,
       0.14923584, 0.06784264, 0.01662484, 0.01704881])

# Question 2

In [26]:
fraud = pd.read_csv('Fraud_check.csv')
fraud.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [27]:
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [28]:
fraud.shape

(600, 6)

In [29]:
fraud = fraud.rename({'Marital.Status':'Marital','Taxable.Income':'Tax','City.Population':'Population','Work.Experience':'Work_Experience'},axis=1) 
# Changing the names of some colomns for better understanding

In [30]:
fraud["Income_Nature"]="<=30000" #converting the data into good and risky 
fraud.loc[fraud["Tax"]>=30000,"Income_Nature"]="Good"
fraud.loc[fraud["Tax"]<=30000,"Income_Nature"]="Risky"

In [31]:
fraud.head()

Unnamed: 0,Undergrad,Marital,Tax,Population,Work_Experience,Urban,Income_Nature
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


In [32]:
fraud = fraud.drop(columns='Tax') # droping the tax column because it is not helpful for model building

In [33]:
fraud['Income_Nature'].value_counts()

Good     476
Risky    124
Name: Income_Nature, dtype: int64

In [34]:
coder = LabelEncoder() # Converting into categorical data
fraud['Undergrad'] = coder.fit_transform(fraud['Undergrad'])
fraud['Marital'] = coder.fit_transform(fraud['Marital'])
fraud['Urban'] = coder.fit_transform(fraud['Urban'])
fraud['Income_Nature'] = coder.fit_transform(fraud['Income_Nature'])

In [35]:
max = MinMaxScaler() # Standardizing the data
std_fraud = max.fit_transform(fraud)
std_fraud = pd.DataFrame(std_fraud,columns=('Undergrad','Marital','Population','Work_Experience','Urban','Income_Nature'))
std_fraud.head()

Unnamed: 0,Undergrad,Marital,Population,Work_Experience,Urban,Income_Nature
0,0.0,1.0,0.139472,0.333333,1.0,0.0
1,1.0,0.0,0.622394,0.6,1.0,0.0
2,0.0,0.5,0.772568,1.0,1.0,0.0
3,1.0,1.0,0.962563,0.5,1.0,0.0
4,0.0,0.5,0.010081,0.933333,0.0,0.0


In [36]:
X1 = std_fraud.iloc[:,0:5] #Splitting the data into featuers and labels

In [37]:
Y1 = std_fraud.iloc[:,5] #Splitting the data into featuers and labels
Y1.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Income_Nature, dtype: float64

In [38]:
x_train,x_test,y_train,y_test = train_test_split(X1,Y1,test_size=0.2,random_state=20) # converting into test and train data

In [39]:
model1 = RandomForestClassifier()
n_estimators=[50,100,150,200]
max_features = [2,3,4,5]
ccp_alpha = [0.1,0.2,0.01,0.3]
param_grid = dict(n_estimators=n_estimators,max_features=max_features,ccp_alpha=ccp_alpha)
grid1 = GridSearchCV(model1,param_grid,verbose=10)
grid1.fit(X1,Y1)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] ccp_alpha=0.1, max_features=2, n_estimators=50 ..................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=50, score=0.800, total=   0.1s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=50 ..................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=50, score=0.792, total=   0.1s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=50 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV]  ccp_alpha=0.1, max_features=2, n_estimators=50, score=0.792, total=   0.1s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=50 ..................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=50, score=0.792, total=   0.1s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=50 ..................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s


[CV]  ccp_alpha=0.1, max_features=2, n_estimators=50, score=0.792, total=   0.1s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=100 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=100, score=0.800, total=   0.2s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=100 .................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.7s remaining:    0.0s


[CV]  ccp_alpha=0.1, max_features=2, n_estimators=100, score=0.792, total=   0.2s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=100 .................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.9s remaining:    0.0s


[CV]  ccp_alpha=0.1, max_features=2, n_estimators=100, score=0.792, total=   0.2s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=100 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=100, score=0.792, total=   0.2s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=100 .................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.3s remaining:    0.0s


[CV]  ccp_alpha=0.1, max_features=2, n_estimators=100, score=0.792, total=   0.2s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=150 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=150, score=0.800, total=   0.3s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=150 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=150, score=0.792, total=   0.3s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=150 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=150, score=0.792, total=   0.3s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=150 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=150, score=0.792, total=   0.3s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=150 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=150, score=0.792, total=   0.3s
[CV] ccp_alpha=0.1, max_features=2, n_estimators=200 .................
[CV]  ccp_alpha=0.1, max_features=2, n_estimators=200, score=0.800, total=   0.4s


[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed:  1.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [40]:
print('best parameters are',grid1.best_params_)

best parameters are {'ccp_alpha': 0.1, 'max_features': 2, 'n_estimators': 50}


In [41]:
random1 = RandomForestClassifier(n_estimators=50,criterion='entropy',max_features=2,ccp_alpha=0.1)
random1.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.1, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
pred = random1.predict(x_test)

In [43]:
random1.score(x_test,y_test)

0.7833333333333333