In [1]:
# Import necessary libraries
import pandas as pd
import numpy as numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the dataset
data = pd.read_csv('D:/sales_marketing/notebooks/data/Train.csv')

In [3]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Data preprocessing
# Handle missing values
data['Item_Weight'].fillna(data['Item_Weight'].mean(), inplace=True)
data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0], inplace=True)


In [5]:
# Encode categorical variables
data = pd.get_dummies(data, columns=['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type'], drop_first=True)


In [6]:
# Feature selection
X = data.drop(['Item_Identifier', 'Outlet_Establishment_Year', 'Item_Outlet_Sales'], axis=1)
y = data['Item_Outlet_Sales']

In [7]:
X

Unnamed: 0,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.300,0.016047,Dairy,249.8092,OUT049,Medium,True,False,False,False,False,False,True,False,False
1,5.920,0.019278,Soft Drinks,48.2692,OUT018,Medium,False,True,False,False,False,True,False,True,False
2,17.500,0.016760,Meat,141.6180,OUT049,Medium,True,False,False,False,False,False,True,False,False
3,19.200,0.000000,Fruits and Vegetables,182.0950,OUT010,Medium,False,True,False,False,False,True,False,False,False
4,8.930,0.000000,Household,53.8614,OUT013,High,True,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,Snack Foods,214.5218,OUT013,High,True,False,False,False,False,True,True,False,False
8519,8.380,0.046982,Baking Goods,108.1570,OUT045,Medium,False,True,False,False,True,False,True,False,False
8520,10.600,0.035186,Health and Hygiene,85.1224,OUT035,Small,True,False,False,False,True,False,True,False,False
8521,7.210,0.145221,Snack Foods,103.1332,OUT018,Medium,False,True,False,False,False,True,False,True,False


In [8]:
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Identify numeric columns
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns

# Scale only the numeric columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])


In [14]:
print(f'Number of features after scaling: {X_train_scaled.shape[1]}')


Number of features after scaling: 3


In [15]:
# Feature selection
selector = SelectKBest(f_regression, k='all')
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)


In [16]:
# Hyperparameter tuning
param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [None, 5, 10],
              'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)
best_model = grid_search.best_estimator_


In [17]:
# Make predictions
y_pred = best_model.predict(X_test_selected)


In [18]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
accuracy = best_model.score(X_test_selected, y_test)

In [19]:
print("Mean Squared Error:", mse)
print("Accuracy:", accuracy)

Mean Squared Error: 1738338.550011471
Accuracy: 0.3604276326062259
