In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


### Problem 01 - Company Problem
A cloth manufacturing company is interested to know about the segment or attributes causes high sale. 
Approach - A Random Forest can be built with target variable Sales (we will first convert it in categorical variable) & all other variable will be independent in the analysis.  


#### A. Import Data

In [2]:
data_1 = pd.read_csv('Company_Data.csv')
data_1

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


#### B. Data Understanding

In [15]:
data_1.shape

(400, 11)

In [16]:
data_1.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [17]:
data_1.dtypes

Sales          float64
CompPrice        int64
Income           int64
Advertising      int64
Population       int64
Price            int64
ShelveLoc       object
Age              int64
Education        int64
Urban           object
US              object
dtype: object

In [18]:
data_1.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [19]:
# From the Targeted column Sales. we can observe mean and median is 7.49, so the threshold is 7.49.

#### C. Data Preparation

In [22]:
data_1["sales"]="small"

In [24]:
data_1.loc[data_1["Sales"]>7.49,"sales"]="large"

In [25]:
data_1.drop(["Sales"],axis=1,inplace=True)

In [26]:
data_1 = pd.get_dummies(data_1, columns = ['ShelveLoc','US','Urban'])

In [27]:
X = data_1.drop(labels='sales',axis=1)
y = data_1[['sales']]

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.20,random_state=12,shuffle=True)

#### D. Model Building

In [29]:
rf_classifier_1 = RandomForestClassifier()
rf_classifier_1.fit(X_train,y_train)

  rf_classifier_1.fit(X_train,y_train)


RandomForestClassifier()

In [32]:
y_train_pred = rf_classifier_1.predict(X_train)

In [33]:
accuracy_score(y_train,y_train_pred)

1.0

In [36]:
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

       large       1.00      1.00      1.00       159
       small       1.00      1.00      1.00       161

    accuracy                           1.00       320
   macro avg       1.00      1.00      1.00       320
weighted avg       1.00      1.00      1.00       320



In [37]:
print(confusion_matrix(y_train,y_train_pred))

[[159   0]
 [  0 161]]


In [39]:
y_test_pred = rf_classifier_1.predict(X_test)

In [40]:
print(accuracy_score(y_test,y_test_pred))

0.8125


In [41]:
print('Classification Report:\n',classification_report(y_test,y_test_pred))

Classification Report:
               precision    recall  f1-score   support

       large       0.88      0.72      0.79        40
       small       0.77      0.90      0.83        40

    accuracy                           0.81        80
   macro avg       0.82      0.81      0.81        80
weighted avg       0.82      0.81      0.81        80



In [42]:
print('Confusion Matrix:\n',confusion_matrix(y_test,y_test_pred))

Confusion Matrix:
 [[29 11]
 [ 4 36]]


#### from above we can observe that training accuracy is 100% while test accuracy is 81.25 %. 
#### Model is clearly overfitted we can tackle this over fitting problem by using GridSearchCV.