## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
data=pd.read_csv("A_year_of_pizza_sales_from_a_pizza_place_872_68.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,X,id,date,time,name,size,type,price
0,1,1,2015-000001,2015-01-01,11:38:36,hawaiian,M,classic,13.25
1,2,2,2015-000002,2015-01-01,11:57:40,classic_dlx,M,classic,16.0
2,3,3,2015-000002,2015-01-01,11:57:40,mexicana,M,veggie,16.0
3,4,4,2015-000002,2015-01-01,11:57:40,thai_ckn,L,chicken,20.75
4,5,5,2015-000002,2015-01-01,11:57:40,five_cheese,L,veggie,18.5


In [4]:
data.shape

(49574, 9)

## Taking care of duplicate observations if present inside the dataset over here

In [5]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [6]:
data.isnull().sum()

Unnamed: 0    0
X             0
id            0
date          0
time          0
name          0
size          0
type          0
price         0
dtype: int64

## Filtering all the numerical features over here

In [7]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

Unnamed: 0
X
price


In [8]:
data[numerical_features]

Unnamed: 0.1,Unnamed: 0,X,price
0,1,1,13.25
1,2,2,16.00
2,3,3,16.00
3,4,4,20.75
4,5,5,18.50
...,...,...,...
49569,49570,49570,17.95
49570,49571,49571,12.00
49571,49572,49572,16.75
49572,49573,49573,20.25


## Filtering all the categorical features over here

In [9]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

id
date
time
name
size
type


In [10]:
data[cat_features]

Unnamed: 0,id,date,time,name,size,type
0,2015-000001,2015-01-01,11:38:36,hawaiian,M,classic
1,2015-000002,2015-01-01,11:57:40,classic_dlx,M,classic
2,2015-000002,2015-01-01,11:57:40,mexicana,M,veggie
3,2015-000002,2015-01-01,11:57:40,thai_ckn,L,chicken
4,2015-000002,2015-01-01,11:57:40,five_cheese,L,veggie
...,...,...,...,...,...,...
49569,2015-021348,2015-12-31,21:23:10,four_cheese,L,veggie
49570,2015-021348,2015-12-31,21:23:10,napolitana,S,classic
49571,2015-021348,2015-12-31,21:23:10,ckn_alfredo,M,chicken
49572,2015-021349,2015-12-31,22:09:54,mexicana,L,veggie


## Encoding the categorical features into numerical features over here

In [11]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [12]:
data

Unnamed: 0.1,Unnamed: 0,X,id,date,time,name,size,type,price
0,1,1,0,0,0,0,0,0,13.25
1,2,2,1,0,1,1,0,0,16.00
2,3,3,1,0,1,2,0,1,16.00
3,4,4,1,0,1,3,1,2,20.75
4,5,5,1,0,1,4,1,1,18.50
...,...,...,...,...,...,...,...,...,...
49569,49570,49570,21347,357,13281,21,1,1,17.95
49570,49571,49571,21347,357,13281,22,2,0,12.00
49571,49572,49572,21347,357,13281,29,0,2,16.75
49572,49573,49573,21348,357,16381,2,1,1,20.25


## Creating the features and labels over here

In [13]:
X=data.iloc[:,:-1]
y=data.iloc[:,-1]

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training set over here

In [15]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=10,random_state=0)
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing dataset over here

In [16]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)

In [17]:
feature_importances = regressor.feature_importances_
print(feature_importances)

[3.73e-16 5.20e-14 2.47e-14 1.49e-14 3.87e-14 1.68e-01 7.87e-01 4.53e-02]


In [18]:
# Create a DataFrame for better visualization
feature_importances_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': feature_importances
})

In [20]:
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

In [21]:
feature_importances_df

Unnamed: 0,feature,importance
6,size,0.7867227
5,name,0.1679669
7,type,0.04531045
1,X,5.197935e-14
4,time,3.866446e-14
2,id,2.471645e-14
3,date,1.486087e-14
0,Unnamed: 0,3.729798e-16


In [22]:
top_features = feature_importances_df.head(3)['feature']
print(f"Top 3 features: {list(top_features)}")

Top 3 features: ['size', 'name', 'type']


In [23]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

In [24]:
X_train_top3 = X_train[top_features]
X_test_top3 = X_test[top_features]

# List of regressors to evaluate
regressors = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'XGBRegressor': XGBRegressor()
}

# Dictionary to store results
results = {}

# Evaluate each regressor
for name, regressor in regressors.items():
    regressor.fit(X_train_top3, y_train)
    y_pred = regressor.predict(X_test_top3)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}
    print(f'{name} - Mean Squared Error: {mse}, R^2 Score: {r2}')

# Find the best model
best_model_name = max(results, key=lambda k: results[k]['R2'])
best_model = regressors[best_model_name]
print(f'Best model: {best_model_name}')

# Hyperparameter tuning for the best model
if best_model_name == 'RandomForestRegressor':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
elif best_model_name == 'AdaBoostRegressor':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    }
elif best_model_name == 'XGBRegressor':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1]
    }
else:
    param_grid = {}

if param_grid:
    grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
    grid_search.fit(X_train_top3, y_train)
    best_model = grid_search.best_estimator_
    print(f'Best parameters for {best_model_name}: {grid_search.best_params_}')

# Final model training with best parameters
final_regressor = best_model
final_regressor.fit(X_train_top3, y_train)

# Evaluate the final model
y_pred = final_regressor.predict(X_test_top3)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Final Model - {best_model_name} - Mean Squared Error: {mse}, R^2 Score: {r2}')


LinearRegression - Mean Squared Error: 11.23845549646571, R^2 Score: 0.14160306127323397
Ridge - Mean Squared Error: 11.238454842349121, R^2 Score: 0.1416031112348748
Lasso - Mean Squared Error: 12.699172484532662, R^2 Score: 0.030033015789922235
KNeighborsRegressor - Mean Squared Error: 0.0, R^2 Score: 1.0
DecisionTreeRegressor - Mean Squared Error: 3.480559654413875e-27, R^2 Score: 1.0
RandomForestRegressor - Mean Squared Error: 3.136921500991415e-28, R^2 Score: 1.0
AdaBoostRegressor - Mean Squared Error: 0.9993033370658527, R^2 Score: 0.9236728814144851
XGBRegressor - Mean Squared Error: 3.4800451488114325e-08, R^2 Score: 0.99999999734193
Best model: KNeighborsRegressor
Final Model - KNeighborsRegressor - Mean Squared Error: 0.0, R^2 Score: 1.0
