# Problem Statement:
# A cloth manufacturing company is interested to know about the segment or attributes causes high sale. 
# Approach - A Random Forest can be built with target variable Sales (we will first convert it in categorical variable) & all other variable will be independent in the analysis.  


In [48]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [49]:
# Load the data set ' Company_Data1.csv '
df= pd.read_csv('Company_Data1.csv')
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [50]:
# basic info about the data 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [51]:
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [52]:
df.duplicated().sum()

0

# No null values and duplicated values in the data set

In [53]:
df.describe().round(2)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.5,124.98,68.66,6.64,264.84,115.8,53.32,13.9
std,2.82,15.33,27.99,6.65,147.38,23.68,16.2,2.62
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


#

In [54]:
# First we will do one hot encoding for the other categorical features
#One hot encoding usind pandas
df=pd.get_dummies(df,columns=['Urban','US','ShelveLoc'], drop_first=True)

In [55]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,Urban_Yes,US_Yes,ShelveLoc_Good,ShelveLoc_Medium
0,9.5,138,73,11,276,120,42,17,1,1,0,0
1,11.22,111,48,16,260,83,65,10,1,1,1,0
2,10.06,113,35,10,269,80,59,12,1,1,0,1
3,7.4,117,100,4,466,97,55,14,1,1,0,1
4,4.15,141,64,3,340,128,38,13,1,0,0,0


# As per the problem statement , we need to convert the sales into categorical feature

In [56]:
# Define the bin edges and labels for three categories
bin_edges = [0, 7.49, 9.32, 16.27]  # Adjusted bin edges
bin_labels = ['Low', 'Medium', 'High']

In [57]:
# Use pd.cut to convert 'Sales' into a categorical variable
df['Sales_Category'] = pd.cut(df['Sales'], bins=bin_edges, labels=bin_labels)

In [58]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,Urban_Yes,US_Yes,ShelveLoc_Good,ShelveLoc_Medium,Sales_Category
0,9.5,138,73,11,276,120,42,17,1,1,0,0,High
1,11.22,111,48,16,260,83,65,10,1,1,1,0,High
2,10.06,113,35,10,269,80,59,12,1,1,0,1,High
3,7.4,117,100,4,466,97,55,14,1,1,0,1,Low
4,4.15,141,64,3,340,128,38,13,1,0,0,0,Low


In [59]:
# Drop sales column from the data frame
df.drop('Sales',axis = 1,inplace = True)

In [60]:
df.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,Urban_Yes,US_Yes,ShelveLoc_Good,ShelveLoc_Medium,Sales_Category
0,138,73,11,276,120,42,17,1,1,0,0,High
1,111,48,16,260,83,65,10,1,1,1,0,High
2,113,35,10,269,80,59,12,1,1,0,1,High
3,117,100,4,466,97,55,14,1,1,0,1,Low
4,141,64,3,340,128,38,13,1,0,0,0,Low


In [61]:
df['Sales_Category'].value_counts()

Low       200
Medium    100
High       99
Name: Sales_Category, dtype: int64

In [62]:
# Since Low is the most frequently occured value
df = df.fillna('Low')

In [63]:
df['Sales_Category'].value_counts()

Low       201
Medium    100
High       99
Name: Sales_Category, dtype: int64

In [64]:
df.isnull().sum()

CompPrice           0
Income              0
Advertising         0
Population          0
Price               0
Age                 0
Education           0
Urban_Yes           0
US_Yes              0
ShelveLoc_Good      0
ShelveLoc_Medium    0
Sales_Category      0
dtype: int64

# No null values after performing the basic EDA as per problem statement

# Now the dataset is ready for the problem solving

In [65]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [66]:
# Split the data into features (X) and target variable (y)
x = df.drop('Sales_Category', axis=1)
y = df['Sales_Category']

In [67]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [68]:
# Define the Random Forest model and perform with default parameter values
rfc = RandomForestClassifier()

In [69]:
rfc.fit(x_train,y_train)

In [70]:
pred_train = rfc.predict(x_train)
pred_train[:5]

array(['High', 'Medium', 'High', 'Low', 'High'], dtype=object)

In [71]:
pred_test = rfc.predict(x_test)
pred_test[:5]

array(['Low', 'Low', 'High', 'Low', 'Medium'], dtype=object)

In [72]:
# Evaluate the model
acc_rfc_train = accuracy_score(y_train, pred_train)
print(f'Accuracy: {acc_rfc_train}')

Accuracy: 1.0


In [73]:
# Evaluate the model
acc_rfc_test = accuracy_score(y_test, pred_test)
print(f'Accuracy: {acc_rfc_test}')

Accuracy: 0.6583333333333333


In [74]:
# Get feature importances
imp_features = rfc.feature_importances_
imp_features

array([0.12836166, 0.10218073, 0.09828628, 0.10756344, 0.22267543,
       0.12576237, 0.06755533, 0.01954472, 0.01610528, 0.08716361,
       0.02480116])

In [75]:
_# Create a DataFrame to display feature importance
imp_features_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': imp_features})

In [76]:
# Sort the DataFrame by importance in descending order
imp_features_df = imp_features_df.sort_values(by='Importance', ascending=False)

In [77]:
# Print the feature importance
print("imp_features:")
print(imp_features_df)

imp_features:
             Feature  Importance
4              Price    0.222675
0          CompPrice    0.128362
5                Age    0.125762
3         Population    0.107563
1             Income    0.102181
2        Advertising    0.098286
9     ShelveLoc_Good    0.087164
6          Education    0.067555
10  ShelveLoc_Medium    0.024801
7          Urban_Yes    0.019545
8             US_Yes    0.016105


# While the model is giving 100% accuracy on training data set, But on the test data set , the performance is poor and the important features for causing high sales will be price of the product and competitor price

# 

# Hyper parameter tuning for finding the best parameter values

In [78]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]
}

In [37]:
# Use GridSearchCV for hyperparameter tuning
gv = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy')
gv.fit(x_train, y_train)

In [38]:
# Get the best parameters from the grid search
best_params = gv.best_params_
best_params

{'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 50}

# We will try with the above parameter values and compare the performance with the default values

In [39]:
rfc_best = RandomForestClassifier(max_depth=15,n_estimators = 50,min_samples_leaf=2,min_samples_split=2)
rfc_best.fit(x_train,y_train)

In [40]:
pred_train_best = rfc_best.predict(x_train)
pred_train_best[:5]

array(['High', 'Medium', 'High', 'Low', 'High'], dtype=object)

In [41]:
pred_test_best = rfc_best.predict(x_test)
pred_test_best[:5]

array(['Low', 'Low', 'High', 'Low', 'Low'], dtype=object)

In [42]:
# Evaluate the model
acc_best_train = accuracy_score(y_train, pred_train_best)
print(f'Accuracy: {acc_best_train}')

Accuracy: 0.9857142857142858


In [43]:
# Evaluate the model
acc_best_test = accuracy_score(y_test, pred_test_best)
print(f'Accuracy: {acc_best_test}')

Accuracy: 0.6916666666666667


In [44]:
# Get feature importances
feature_importances = rfc_best.feature_importances_

In [45]:
# Create a DataFrame to display feature importance
importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importances})

In [46]:
# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [47]:
# Print the feature importance
print("Feature Importance:")
print(importance_df)

Feature Importance:
             Feature  Importance
4              Price    0.237825
5                Age    0.126019
9     ShelveLoc_Good    0.113659
0          CompPrice    0.112596
3         Population    0.103479
2        Advertising    0.102867
1             Income    0.093730
6          Education    0.056726
10  ShelveLoc_Medium    0.021732
8             US_Yes    0.016835
7          Urban_Yes    0.014531


# While the model is giving 98% accuracy on training data set, But on the test data set , the performance is poor and the important features for causing high sales will be price of the product and age in this model.

# from the two models, the Price of the product and Competetor price are having importance for high sales.

In [79]:
# Cross validation suing KFOLD
from sklearn.model_selection import cross_val_score,KFold
kfold =KFold(n_splits=10,shuffle =True,random_state =None)
rfc_cv =RandomForestClassifier(n_estimators=100,max_features =3)
results=cross_val_score(rfc_cv,x,y, cv =kfold)

In [80]:
print(results)

[0.65  0.725 0.575 0.6   0.825 0.7   0.725 0.8   0.6   0.575]


In [81]:
print(np.mean(results))

0.6775


# On cross validation of the data , the model performance is 67% 