In [14]:
# Loading the dataset
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Loading the dataset
df = pd.read_csv('Company_Data.csv')

In [16]:
# Checking the null values in the data
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [17]:
# Separating the features and the target variable
X = df.drop('Sales', axis=1)
y = df['Sales']

In [22]:
# Identifying the columns with non-numeric values
non_numeric_cols = X.select_dtypes(exclude='number').columns

In [23]:
# Found the categorical features present in the dataset
for col in non_numeric_cols:
    unique_values = X[col].unique()
    print(f"Column '{col}': {unique_values}")

Column 'ShelveLoc': ['Bad' 'Good' 'Medium']
Column 'Urban': ['Yes' 'No']
Column 'US': ['Yes' 'No']


In [24]:
# Based on the unique values, decided on an appropriate strategy to handle non-numeric values (removed the rows with non-numeric values)
X = X.dropna(subset=non_numeric_cols)
y = y[X.index]

In [25]:
# importing the required library
from sklearn.preprocessing import LabelEncoder
# Converting non-numeric values to numeric representations
for col in non_numeric_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

In [26]:
# One-hot encoding on categorical variables:
X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)

# Using RandomForest which is an bagging ensemble method is for Feature Selection

In [27]:
# Initialize the Random Forest regressor
rf = RandomForestRegressor()

In [28]:
# Fit the model
rf.fit(X, y)

RandomForestRegressor()

In [29]:
# Get feature importances
feature_importances = rf.feature_importances_

In [30]:
# Create a dataframe to display the feature importances
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [31]:
# Get the important features
print(importance_df)

        Feature  Importance
4         Price    0.286122
7   ShelveLoc_1    0.249022
0     CompPrice    0.109614
5           Age    0.094272
2   Advertising    0.076186
1        Income    0.056458
8   ShelveLoc_2    0.055525
3    Population    0.034821
6     Education    0.026255
10         US_1    0.006461
9       Urban_1    0.005262


<!-- Price (Importance: 0.286122):

This feature is highly important according to the model.
It indicates the price of the product.
A higher importance suggests that variations in the price have a significant impact on sales.
ShelveLoc_1 (Importance: 0.249022):

This feature is also highly important.
It represents a categorical variable related to the shelving location of the product.
The importance suggests that different shelving locations have a substantial influence on sales.
CompPrice (Importance: 0.109614):

This feature measures the price competitors charge for similar products.
It is moderately important in determining sales.
Age (Importance: 0.094272):

This feature represents the age of the product.
It has a moderate impact on sales, indicating that newer or older products may have different sales patterns.
Advertising (Importance: 0.076186):

This feature reflects the advertising budget for the product.
It is moderately important, suggesting that the amount spent on advertising influences sales.
Income (Importance: 0.056458):

This feature represents the income level of the consumers.
It has a moderate impact on sales, indicating that consumer income plays a role in purchasing decisions.
ShelveLoc_2 (Importance: 0.055525):

This is another categorical variable related to the shelving location.
Its importance suggests that different shelving locations have varying effects on sales.
Population (Importance: 0.034821):

This feature represents the population size of the area where the product is sold.
It has a relatively lower importance compared to other features.
Education (Importance: 0.026255):

This feature reflects the education level of consumers.
It has a relatively lower importance, indicating that education level has a less significant impact on sales.
US_1 (Importance: 0.006461):

This is a binary variable indicating whether the product is manufactured in the United States or not.
It has a relatively low importance compared to other features.
Urban_1 (Importance: 0.00526):

This is a binary variable indicating whether the product is sold in an urban area or not.
It has the lowest importance among all the features. -->

# Price (Importance: 0.286122):

# This feature is highly important according to the model.
# It indicates the price of the product.
# A higher importance suggests that variations in the price have a significant impact on sales.

# ShelveLoc_1 (Importance: 0.249022):

# This feature is also highly important.
# It represents a categorical variable related to the shelving location of the product.
# The importance suggests that different shelving locations have a substantial influence on sales.
# CompPrice (Importance: 0.109614):

# This feature measures the price competitors charge for similar products.
# It is moderately important in determining sales.

# Age (Importance: 0.094272):

# This feature represents the age of the product.
# It has a moderate impact on sales, indicating that newer or older products may have different sales patterns.
# Advertising (Importance: 0.076186):

# This feature reflects the advertising budget for the product.
# It is moderately important, suggesting that the amount spent on advertising influences sales.

# Income (Importance: 0.056458):

# This feature represents the income level of the consumers.
# It has a moderate impact on sales, indicating that consumer income plays a role in purchasing decisions.

# ShelveLoc_2 (Importance: 0.055525):

# This is another categorical variable related to the shelving location.
# Its importance suggests that different shelving locations have varying effects on sales.
# Population (Importance: 0.034821):

# This feature represents the population size of the area where the product is sold.
# It has a relatively lower importance compared to other features.

# Education (Importance: 0.026255):

# This feature reflects the education level of consumers.
# It has a relatively lower importance, indicating that education level has a less significant impact on sales.

# US_1 (Importance: 0.006461):

# This is a binary variable indicating whether the product is manufactured in the United States or not.
# It has a relatively low importance compared to other features.

# Urban_1 (Importance: 0.00526):

# This is a binary variable indicating whether the product is sold in an urban area or not.
# It has the lowest importance among all the features.