In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn import preprocessing


In [2]:
data=pd.read_csv("Company_Data.csv")

In [3]:
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [4]:
data.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [5]:
A=data['Sales'].copy()
A[data.Sales<=(7.5-1*2.83)]=1
A[data.Sales>=(7.5+1*2.83)]=3
A[(data.Sales<(7.5+1*2.83))&(data.Sales>(7.5-1*2.83))]=2
data['Sales']=A.copy()

In [6]:
data.groupby(['Sales']).count()

Unnamed: 0_level_0,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
Sales,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,60,60,60,60,60,60,60,60,60,60
2.0,274,274,274,274,274,274,274,274,274,274
3.0,66,66,66,66,66,66,66,66,66,66


In [7]:
label_encoder=preprocessing.LabelEncoder()
data['ShelveLoc']=label_encoder.fit_transform(data['ShelveLoc'])
data['US']=label_encoder.fit_transform(data['US'])
data['Urban']=label_encoder.fit_transform(data['Urban'])

In [8]:
feat_labels=['CompPrice','Income','Advertising','Population','Price','ShelveLoc','Age','Education','Urban','US']

In [9]:
X=np.array(data.iloc[:,1:])
Y=np.array(data.iloc[:,0:1]).ravel()

In [10]:
parameters={'n_estimators':[50,100,150,200,250,300,350,400,450,500],'max_features':[1,2,3,4,5,6,7,8,9,10], 'max_depth':[3,5,7]}
model=RandomForestClassifier(random_state=230)
grid=GridSearchCV(model,parameters,cv=5,scoring='accuracy')
grid.fit(X,Y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=230),
             param_grid={'max_depth': [3, 5, 7],
                         'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400,
                                          450, 500]},
             scoring='accuracy')

In [11]:
grid.best_params_

{'max_depth': 7, 'max_features': 8, 'n_estimators': 50}

In [12]:
model=RandomForestClassifier(n_estimators=50,max_features=8,max_depth=7)
results=cross_val_score(model,X,Y,cv=KFold(n_splits=10))
print(results.mean())

0.745


In [13]:
model.fit(X,Y)
for feature in zip(feat_labels, model.feature_importances_):
    print(feature)

('CompPrice', 0.14488443290933503)
('Income', 0.08371797858202784)
('Advertising', 0.07032885295888995)
('Population', 0.10290347663724395)
('Price', 0.2308533132327785)
('ShelveLoc', 0.18908424222011425)
('Age', 0.113656407710834)
('Education', 0.04258566382984757)
('Urban', 0.010508450251216602)
('US', 0.011477181667712381)


In [14]:
sfm=SelectFromModel(model, threshold=0.10)
sfm.fit(X,Y)

SelectFromModel(estimator=RandomForestClassifier(max_depth=7, max_features=8,
                                                 n_estimators=50),
                threshold=0.1)

In [15]:
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

CompPrice
Price
ShelveLoc
Age


# Building model using top five features

In [16]:
X_train=np.array(data[['CompPrice','Population','Price','ShelveLoc','Age']])

In [17]:
parameters={'n_estimators':[50,100,150,200,250,300,350,400,450,500],'max_features':[1,2,3,4,5], 'max_depth':[3,5,7]}
model1=RandomForestClassifier(random_state=230)
grid1=GridSearchCV(model,parameters,cv=5,scoring='accuracy')
grid1.fit(X_train,Y)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_depth=7, max_features=8,
                                              n_estimators=50),
             param_grid={'max_depth': [3, 5, 7],
                         'max_features': [1, 2, 3, 4, 5],
                         'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400,
                                          450, 500]},
             scoring='accuracy')

In [18]:
grid1.best_params_

{'max_depth': 7, 'max_features': 5, 'n_estimators': 50}

In [19]:
model1=RandomForestClassifier(n_estimators=200,max_features=2,max_depth=7)
results=cross_val_score(model1,X_train,Y,cv=KFold(n_splits=10))
print(results.mean())

0.7525
