In [2]:
import pandas as pd               # for Data Manipulation
import numpy as np                #for Mathematical calculations
# Load the company dataset from the CSV file (path is relative to the
# notebook's working directory) into a pandas DataFrame
company = pd.read_csv("Company_Data.csv")

In [3]:
# List the column labels of the loaded frame
company.columns

Index(['Sales', 'CompPrice', 'Income', 'Advertising', 'Population', 'Price',
       'ShelveLoc', 'Age', 'Education', 'Urban', 'US'],
      dtype='object')

In [4]:
# Display the frame; the rendered output is truncated to the first and last rows
company

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [5]:
# Per-column dtype and non-null count, plus overall memory usage
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [6]:
# Relative frequency of each shelf-location category
company["ShelveLoc"].value_counts(normalize=True)

Medium    0.5475
Bad       0.2400
Good      0.2125
Name: ShelveLoc, dtype: float64

In [7]:
# Relative frequency of the Urban Yes/No flag
company["Urban"].value_counts(normalize=True)

Yes    0.705
No     0.295
Name: Urban, dtype: float64

In [8]:
# Relative frequency of the US Yes/No flag
company["US"].value_counts(normalize=True)

Yes    0.645
No     0.355
Name: US, dtype: float64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
company.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [10]:
# Count missing values per column
company.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [11]:
# Turn the continuous Sales figure into a two-class label by binning:
# rows at or below the 67th percentile become 'Not High', the remaining
# (top ~33%) rows become 'High'.
sales_cutoff = company['Sales'].quantile(.67)
at_or_below_cutoff = company['Sales'].le(sales_cutoff)
company['Sales'] = np.where(at_or_below_cutoff, 'Not High', 'High')

In [12]:
# Convert the non-numerical ShelveLoc column to ordinal integer codes
# (Bad -> 1, Medium -> 2, Good -> 3).
label_ShelveLoc = {'Bad':1,'Medium':2,'Good':3}
# Values that are not keys of the mapping (e.g. the integer codes left behind
# by an earlier run of this cell) are passed through unchanged, so re-running
# the cell is a no-op instead of silently turning the whole column into NaN.
shelveloc_codes = company.ShelveLoc.map(lambda level: label_ShelveLoc.get(level, level))
# Fail loudly if anything other than the expected codes remains
# (an unknown category or a missing value).
unexpected_levels = set(shelveloc_codes.unique()) - set(label_ShelveLoc.values())
assert not unexpected_levels, f"Unmapped ShelveLoc values: {unexpected_levels}"
company.ShelveLoc = shelveloc_codes

In [13]:
# One-hot encode the Yes/No columns; drop_first keeps a single indicator
# column per variable.
binary_columns = ["Urban", "US"]
company = pd.get_dummies(company, columns=binary_columns, drop_first=True)

In [14]:
# Input / output split: the inputs are all rows and every column except
# the Sales label.
is_feature = company.columns != "Sales"
predictors = company.loc[:, is_feature]

In [15]:
# The output variable is the binned Sales label
target = company.loc[:, "Sales"]
target

0          High
1          High
2          High
3      Not High
4      Not High
         ...   
395        High
396    Not High
397    Not High
398    Not High
399        High
Name: Sales, Length: 400, dtype: object

In [16]:
# Encode the class label as 0/1. The mapping is applied to the source column
# rather than to `target` itself, so re-running this cell stays correct
# (mapping the already-encoded 0/1 values a second time would yield all NaN).
label_Sales = {'Not High':0,'High':1}
target = company["Sales"].map(label_Sales)
target

0      1
1      1
2      1
3      0
4      0
      ..
395    1
396    0
397    0
398    0
399    1
Name: Sales, Length: 400, dtype: int64

In [17]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# repeatable across runs.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.25,
    random_state=0,
)

In [18]:
from sklearn.tree import DecisionTreeClassifier as DT

In [19]:
# Decision tree using entropy (information gain) as the split criterion.
# random_state is fixed so the fitted tree, and therefore the reported
# metrics, are reproducible from run to run.
model = DT(criterion = 'entropy', random_state = 0)
model.fit(x_train, y_train)

In [20]:
# Decision-tree predictions on the test set, cross-tabulated against the
# actual labels
preds = model.predict(x_test)
pd.crosstab(
    index=y_test,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predictions'],
)

Predictions,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,53,10
1,13,24


In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Test Data Accuracy of the decision tree
accuracy_score(y_true=y_test, y_pred=model.predict(x_test))

0.77

In [22]:
# Decision-tree predictions on the training set, cross-tabulated against the
# actual labels
preds = model.predict(x_train)
pd.crosstab(
    index=y_train,
    columns=preds,
    rownames=['Actual'],
    colnames=['Predictions'],
)

Predictions,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,205,0
1,0,95


In [23]:
# Training-set accuracy of the decision tree
accuracy_score(y_true=y_train, y_pred=model.predict(x_train))

1.0

In [24]:
# let us try random forest
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Random forest of 500 trees, fitted on a single job, with a fixed seed
rf_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=1,
    random_state=42,
)

In [26]:
# Fit the random forest on the training split
rf_clf.fit(X=x_train, y=y_train)

In [27]:
# Confusion matrix of the random forest on the test set
confusion_matrix(y_true=y_test, y_pred=rf_clf.predict(x_test))

array([[56,  7],
       [15, 22]], dtype=int64)

In [28]:
# Test-set accuracy of the random forest
accuracy_score(y_true=y_test, y_pred=rf_clf.predict(x_test))

0.78

In [29]:
# Confusion matrix of the random forest on the training set
confusion_matrix(y_true=y_train, y_pred=rf_clf.predict(x_train))

array([[205,   0],
       [  0,  95]], dtype=int64)

In [30]:
# Training-set accuracy of the random forest
accuracy_score(y_true=y_train, y_pred=rf_clf.predict(x_train))

1.0

In [31]:
#HYPERPARAMETER TUNING

In [32]:
# New forest with constrained trees: 100 estimators (n_estimators is the
# number of decision trees), depth capped at 10, and at least 20 samples
# required to split a node.
# random_state is fixed (same seed as rf_clf) so the accuracies reported for
# this model are reproducible from run to run.
forest_new = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    criterion='gini',
    random_state=42,
)
forest_new.fit(x_train, y_train)

In [33]:
# Report mean accuracy of the constrained forest on both splits
train_accuracy = forest_new.score(x_train, y_train)
test_accuracy = forest_new.score(x_test, y_test)
print(f'Train accuracy: {train_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Train accuracy: 0.93
Test accuracy: 0.8
