In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [43]:
df = pd.read_csv("Company_Data.csv")
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [45]:
print(df.ShelveLoc.value_counts())
print(df.Urban.value_counts())
print(df.US.value_counts())

Medium    219
Bad        96
Good       85
Name: ShelveLoc, dtype: int64
Yes    282
No     118
Name: Urban, dtype: int64
Yes    258
No     142
Name: US, dtype: int64


In [46]:
# Converting Catagorical Variables by Using Map Function
df["ShelveLoc"]= df["ShelveLoc"].map({"Bad":0, "Medium":1, "Good":2})
df["Urban"]= df["Urban"].map({"No":0, "Yes":1})
df["US"]= df["US"].map({"No":0, "Yes":1})

In [47]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,0,42,17,1,1
1,11.22,111,48,16,260,83,2,65,10,1,1
2,10.06,113,35,10,269,80,1,59,12,1,1
3,7.4,117,100,4,466,97,1,55,14,1,1
4,4.15,141,64,3,340,128,0,38,13,1,0


In [48]:
print(df.ShelveLoc.value_counts())
print(df.Urban.value_counts())
print(df.US.value_counts())

1    219
0     96
2     85
Name: ShelveLoc, dtype: int64
1    282
0    118
Name: Urban, dtype: int64
1    258
0    142
Name: US, dtype: int64


In [49]:
X= df.drop("Sales", axis=1)
X.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,138,73,11,276,120,0,42,17,1,1
1,111,48,16,260,83,2,65,10,1,1
2,113,35,10,269,80,1,59,12,1,1
3,117,100,4,466,97,1,55,14,1,1
4,141,64,3,340,128,0,38,13,1,0


In [50]:
y= pd.cut(df.Sales, bins=[-1, 10, 17], labels=[0,1])
y

0      0
1      1
2      1
3      0
4      0
      ..
395    1
396    0
397    0
398    0
399    0
Name: Sales, Length: 400, dtype: category
Categories (2, int64): [0 < 1]

In [51]:
y.value_counts()

0    322
1     78
Name: Sales, dtype: int64

In [52]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Applying Random Forest on splited data

In [86]:
model = RandomForestClassifier(n_estimators=150, random_state=42, max_features=0.9)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
model.score(x_test, y_test)

0.8916666666666667

In [87]:
confusion_matrix(y_test,y_pred)

array([[90,  6],
       [ 7, 17]], dtype=int64)

In [88]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93        96
           1       0.74      0.71      0.72        24

    accuracy                           0.89       120
   macro avg       0.83      0.82      0.83       120
weighted avg       0.89      0.89      0.89       120



# Using Cross Validation

In [91]:
skfold= RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
scores = cross_val_score(model, X, y, cv=skfold, scoring="accuracy")
print(scores)

[0.825  0.85   0.8875 0.875  0.9    0.8625 0.875  0.8125 0.8875 0.85
 0.825  0.8625 0.9    0.9125 0.85   0.825  0.8625 0.8375 0.8875 0.85
 0.8375 0.825  0.8625 0.85   0.8625 0.9125 0.7875 0.85   0.8875 0.8625
 0.85   0.85   0.8875 0.9    0.9125 0.9    0.875  0.85   0.8625 0.9125
 0.9125 0.875  0.8625 0.85   0.875  0.8375 0.8875 0.825  0.8625 0.9   ]


In [92]:
np.mean(scores)

0.86525

In [93]:
np.max(scores)

0.9125

In [94]:
np.min(scores)

0.7875

# let's try to apply RandomForest Regression

In [95]:
from sklearn.ensemble import RandomForestRegressor

In [96]:
model1= RandomForestRegressor()
model1.fit(x_train, y_train)
y_pred1 = model.predict(x_test)
model.score(x_test, y_test)

0.8916666666666667

In [97]:
confusion_matrix(y_test, y_pred1)

array([[90,  6],
       [ 7, 17]], dtype=int64)

# Using Adaboost

In [98]:
from sklearn.ensemble import AdaBoostClassifier

In [122]:
model3= AdaBoostClassifier(base_estimator=model, n_estimators= 200, random_state=42)
model3.fit(x_train, y_train)
y_pred3 = model3.predict(x_test)
model3.score(x_test, y_test)

0.9

In [123]:
confusion_matrix(y_test, y_pred3)

array([[92,  4],
       [ 8, 16]], dtype=int64)

In [125]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94        96
           1       0.80      0.67      0.73        24

    accuracy                           0.90       120
   macro avg       0.86      0.81      0.83       120
weighted avg       0.90      0.90      0.90       120

