In [23]:
import pickle
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, root_mean_squared_error
from datetime import datetime, timedelta

In [2]:
sales_data = pd.read_csv('dataset/Project Dataset/sales_data.csv')
amount_data = pd.read_csv('dataset/Project Dataset/amount_data.csv')

In [3]:
sales_data['Date'] = pd.to_datetime(sales_data['Date'], dayfirst = True)
sales_data = sales_data.sort_values(by = 'Date')
sales_data = sales_data.reset_index().drop(columns=['index'])
sales_data

Unnamed: 0,Date,Model,Quantity
0,2022-04-01,18 F AC,0.0
1,2022-04-01,3X7NEXX,0.0
2,2022-04-01,2T5GDEL,0.0
3,2022-04-01,2T5RRLX-GX,0.0
4,2022-04-01,2T5RRLX-XX,0.0
...,...,...,...
93598,2024-04-30,40 inch LED,0.0
93599,2024-04-30,32 inch LED,0.0
93600,2024-04-30,Multiplug,0.0
93601,2024-04-30,1D4GDEH,1.0


In [4]:
amount_data['Date'] = pd.to_datetime(amount_data['Date'], dayfirst = True)
amount_data = amount_data.sort_values(by = 'Date')
amount_data = amount_data.reset_index().drop(columns=['index'])
amount_data

Unnamed: 0,Date,Amount
0,2022-04-01,0
1,2022-04-02,239400
2,2022-04-03,274140
3,2022-04-04,177000
4,2022-04-05,106000
...,...,...
756,2024-04-26,0
757,2024-04-27,582650
758,2024-04-28,367700
759,2024-04-29,451000


In [5]:
sales_data_matrix = sales_data.pivot_table(index = 'Date', columns = 'Model', values = 'Quantity', aggfunc = 'sum', fill_value = 0).reset_index()
data = pd.merge(sales_data_matrix, amount_data, on = 'Date', how = 'inner')

In [6]:
data['day_of_year'] = data['Date'].dt.dayofyear
data['month'] = data['Date'].dt.month
data['day_of_week'] = data['Date'].dt.dayofweek
data['day_of_month'] = data['Date'].dt.day
data = data.drop(columns='Date')
data

Unnamed: 0,12 C AC,12 inch Fan,14 inch Fan,17 inch Fan,18 A AC,18 C AC,18 F AC,1B3GDEL,1B6GDEH,1B6GDEL,...,Tornedo Fan,Wall Move,Washing Machine,Water Filter,Weight Scale,Amount,day_of_year,month,day_of_week,day_of_month
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,91,4,4,1
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,239400,92,4,5,2
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,274140,93,4,6,3
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,177000,94,4,0,4
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,106000,95,4,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,117,4,4,26
757,0.0,0.0,3.0,11.0,1.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,582650,118,4,5,27
758,1.0,0.0,2.0,20.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,367700,119,4,6,28
759,0.0,0.0,5.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,451000,120,4,0,29


In [7]:
# products = data.columns.to_list()[:-4]
# sum = 0
# for product in products:
#     print(product)
#     classification_data = data[['day_of_year', 'month', 'day_of_week', product]].copy()

#     # Defining conditions here
#     conditions = [
#         classification_data[product] > 2,
#         classification_data[product] == 2,
#         classification_data[product] == 1,  
#         classification_data[product] == 0  
#     ]

#     # Defining the corresponding outputs for each condition
#     choices = [3, 2, 1, 0]

#     # Apply np.select to assign values based on the conditions
#     classification_data.loc[:, 'Sale'] = np.select(conditions, choices, default=0)

#     X = classification_data[['day_of_year', 'month', 'day_of_week']]
#     y_class = classification_data['Sale']  # Classification target
#     y_reg = classification_data[product]  # Regression target

#     X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size = 0.2, random_state = 42)

#     clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
#     clf.fit(X_train_class, y_train_class)

#     y_pred_class = clf.predict(X_test_class)
#     # print("Classification Report:\n", classification_report(y_test_class, y_pred_class))

#     y_pred_class_all = clf.predict(X)
#     # print("Classification Report on Full Data:\n", classification_report(y_class, y_pred_class_all))

#     X_train_reg = X[y_class == 3]
#     y_train_reg = y_reg[y_class == 3]

#     if (len(X_train_reg) > 0):
#         # Initialize the SGDRegressor with a small learning rate
#         reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

#         # Scaling the data for regression
#         scaler = StandardScaler()
#         X_train_reg_scaled = scaler.fit_transform(X_train_reg)

#         # Train the regression model with SGDRegressor
#         reg.fit(X_train_reg_scaled, y_train_reg)

#         for i in range(len(y_pred_class_all)):
#             if y_pred_class_all[i] == 3:  # If a sale is predicted
#                 y_pred_class_all[i] = reg.predict([X.iloc[i]])[0]  # Predict the quantity

#     print(mean_absolute_error(y_reg, y_pred_class_all))
#     sum += mean_absolute_error(y_reg, y_pred_class_all)

# print(sum)

In [8]:
classification_data = data[['day_of_year', 'month', 'day_of_week', 'day_of_month']].copy()

columns_to_concat = []

products = data.columns.to_list()[:-5]

for product in products:
    # Defining conditions here
    conditions = [
        data[product] > 2,
        data[product] == 2,
        data[product] == 1,
        data[product] == 0
    ]

    # Defining the corresponding outputs for each condition
    choices = [3, 2, 1, 0]

    # Apply np.select and store the result in a separate DataFrame
    classification_column = pd.DataFrame({
        product: np.select(conditions, choices, default=0)
    })

    # Append this column to the list of columns to concatenate
    columns_to_concat.append(classification_column)

# Step 3: Concatenate all columns at once
classification_data = pd.concat([classification_data] + columns_to_concat, axis=1)

classification_data

Unnamed: 0,day_of_year,month,day_of_week,day_of_month,12 C AC,12 inch Fan,14 inch Fan,17 inch Fan,18 A AC,18 C AC,...,Room Heater,Ruti Tawa,Speaker,Stabilizer,Stand Fan,Tornedo Fan,Wall Move,Washing Machine,Water Filter,Weight Scale
0,91,4,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,92,4,5,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,93,4,6,3,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,94,4,0,4,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4,95,4,1,5,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,117,4,4,26,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
757,118,4,5,27,0,0,3,3,1,3,...,0,0,0,0,2,0,0,1,0,0
758,119,4,6,28,1,0,2,3,0,1,...,0,0,0,0,1,0,0,0,0,0
759,120,4,0,29,0,0,3,3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Getting the the product column names
product_columns = classification_data.columns.difference(['day_of_year', 'month', 'day_of_week', 'day_of_month'])

# Setting the dependent and independent variable
X = classification_data[['day_of_year', 'month', 'day_of_week', 'day_of_month']]
y = classification_data[product_columns]  

# Spliting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Using MultiOutputClassifier to handle multiple outputs
from sklearn.multioutput import MultiOutputClassifier
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
clf.fit(X_train, y_train)

In [11]:
start_date = datetime(2022, 4, 1)
dates = [start_date + timedelta(days=i) for i in range(365 * 3)]

df = pd.DataFrame({'Date': dates})

df['day_of_year'] = df['Date'].dt.dayofyear
df['month'] = df['Date'].dt.month
df['day_of_week'] = df['Date'].dt.dayofweek  # Monday=0, Sunday=6
df['day_of_month'] = df['Date'].dt.day

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['day_of_year', 'month', 'day_of_week', 'day_of_month']])

df_scaled = pd.DataFrame(scaled_features, columns=['day_of_year_scaled', 'month_scaled', 'day_of_week_scaled', 'day_of_month_scaled'])
df = pd.concat([df, df_scaled], axis=1)

df

Unnamed: 0,Date,day_of_year,month,day_of_week,day_of_month,day_of_year_scaled,month_scaled,day_of_week_scaled,day_of_month_scaled
0,2022-04-01,91,4,4,1,-0.874648,-0.732147,0.497149,-1.673860
1,2022-04-02,92,4,5,2,-0.865167,-0.732147,0.997036,-1.560137
2,2022-04-03,93,4,6,3,-0.855686,-0.732147,1.496924,-1.446414
3,2022-04-04,94,4,0,4,-0.846205,-0.732147,-1.502402,-1.332690
4,2022-04-05,95,4,1,5,-0.836724,-0.732147,-1.002515,-1.218967
...,...,...,...,...,...,...,...,...,...
1090,2025-03-26,85,3,2,26,-0.931535,-1.022093,-0.502627,1.169220
1091,2025-03-27,86,3,3,27,-0.922054,-1.022093,-0.002739,1.282943
1092,2025-03-28,87,3,4,28,-0.912573,-1.022093,0.497149,1.396666
1093,2025-03-29,88,3,5,29,-0.903091,-1.022093,0.997036,1.510389


In [12]:
# y_train_pred = clf.predict(X_train)

# position = 0
# for product in product_columns:
#     X_train_reg = pd.DataFrame(columns = ['day_of_year', 'month', 'day_of_week'])
#     y_train_reg = pd.Series(dtype = 'float64')
#     i = 0
#     for index in X_train.index:
#         if y_train_pred[i][position] == 3:
#             X_train_reg.loc[index] = [X['day_of_year'][index], X['month'][index], X['day_of_week'][index]]
#             y_train_reg[index] = data[product][index]
#         i += 1
    
#     print(X_train_reg)
#     print(y_train_reg)
#     position += 1

In [13]:
# Predict the classification output for the training data
y_train_pred = clf.predict(X_train)

# Create a dictionary to store regression models for each product
regression_models = {}

# Loop through each product
for position, product in enumerate(product_columns):
    
    # Filter the rows where the predicted label is 3 for the current product
    mask = y_train_pred[:, position] == 3
    
    # Select the relevant rows from X_train based on the mask
    X_train_reg = X_train[mask].copy()
    
    # Create the target series (y_train_reg) for regression, based on the original 'data' DataFrame
    y_train_reg = data.loc[X_train_reg.index, product]
    
    if (len(X_train_reg) > 0):
        # Initialize the SGDRegressor with a small learning rate
        reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

        # Scaling the data for regression
        scaler = StandardScaler()
        X_train_reg_scaled = scaler.fit_transform(X_train_reg)

        # X_scaled = []
        # for i in X_train_reg.index:
        #     scaled_data = df[(df['day_of_year'] == X_train_reg['day_of_year'][i])
        #              & (df['month'] == X_train_reg['month'][i])
        #              & (df['day_of_week'] == X_train_reg['day_of_week'][i])]
        #     scaled_data = scaled_data.to_numpy()
        #     row = [scaled_data[0][4], scaled_data[0][5], scaled_data[0][6]]
        #     X_scaled.append(row)

        merged_df = pd.merge(X_train_reg, df, on=['day_of_year', 'month', 'day_of_week'], how='left')

        # Extract the relevant scaled columns from the merged DataFrame
        X_scaled = merged_df.iloc[:, -3:].to_numpy()   
            
        # Train the regression model with SGDRegressor
        reg.fit(X_train_reg_scaled, y_train_reg)
        
        # Store the regression model
        regression_models[product] = reg

In [14]:
# Saving model to pickle file
with open("model/classifier.pkl", "wb") as file: 
    pickle.dump(clf, file)

In [15]:
# Save the models in one pickle file
with open('model/regressor.pkl', 'wb') as f:
    pickle.dump(regression_models, f)

In [16]:
# Opening saved model
with open("model/classifier.pkl", "rb") as file:
    classifier = pickle.load(file)

# Prediction
y_pred = classifier.predict(X_test)
print(mean_absolute_error(y_test, y_pred))

0.08007864392369414


In [17]:
with open('model/regressor.pkl', 'rb') as f:
    regressor = pickle.load(f)

regressor

{'12 inch Fan': SGDRegressor(random_state=42),
 '14 inch Fan': SGDRegressor(random_state=42),
 '17 inch Fan': SGDRegressor(random_state=42),
 '18 A AC': SGDRegressor(random_state=42),
 '1B6RXXX': SGDRegressor(random_state=42),
 '1D4GDEL': SGDRegressor(random_state=42),
 '1D4GDSH': SGDRegressor(random_state=42),
 '1D5GDEL': SGDRegressor(random_state=42),
 '1F3GDEH': SGDRegressor(random_state=42),
 '1F3GDEL': SGDRegressor(random_state=42),
 '1F3GDSH': SGDRegressor(random_state=42),
 '1F3RXXX': SGDRegressor(random_state=42),
 '24 inch LED': SGDRegressor(random_state=42),
 '2A3GDEL': SGDRegressor(random_state=42),
 '2A3GDSH': SGDRegressor(random_state=42),
 '2A3GDXX': SGDRegressor(random_state=42),
 '2B3GDXX': SGDRegressor(random_state=42),
 '2E0GDEL': SGDRegressor(random_state=42),
 '2T5GDEL': SGDRegressor(random_state=42),
 '32 inch LED': SGDRegressor(random_state=42),
 '3X7GDEL': SGDRegressor(random_state=42),
 'Blender': SGDRegressor(random_state=42),
 'Ceiling Fan': SGDRegressor(rando

In [18]:
y_pred_all = np.array(y_pred).copy() 

y_test_all = pd.DataFrame()

# Loop through each product
for position, product in enumerate(product_columns):
    test_df = pd.DataFrame()

    # Extract the relevant test data for this product
    y_test_all = pd.concat([y_test_all, data.loc[y_test.index, product]], axis=1)
    
    indices_to_update = []
    for i, index in enumerate(X_test.index):
        if y_pred_all[i][position] == 3 and product in regressor:
            test_row = pd.DataFrame({
                'day_of_year': X_test.loc[index, 'day_of_year'], 
                'month': X_test.loc[index, 'month'], 
                'day_of_week': X_test.loc[index, 'day_of_week'],
                'day_of_month': X_test.loc[index, 'day_of_month']
            }, index=[index])
            test_df = pd.concat([test_df, test_row])
            indices_to_update.append(i)

    if not test_df.empty:
        scaler = StandardScaler()
        test_df_scaled = scaler.fit_transform(test_df)

        merged_df = pd.merge(test_df, df, on=['day_of_year', 'month', 'day_of_week', 'day_of_month'], how='left')

        # Extract the relevant scaled columns from the merged DataFrame
        test_X_scaled = merged_df.iloc[:, -3:].to_numpy()  

        results = regressor[product].predict(test_df_scaled)
        print(f"{product}: {results}")

        # Update y_pred_all based on the predictions
        for j, i in enumerate(indices_to_update):
            y_pred_all[i, position] = results[j]

# Print the final y_pred_all to verify updates
print("Final y_pred_all:")
print(y_pred_all)

12 inch Fan: [3.6000891]
14 inch Fan: [6.51864308 5.31869377 4.02471084 4.37469893]
17 inch Fan: [5.51533293 6.38065949 7.88149074 6.09132332 7.24420891 5.16738727
 7.02474397 6.90329842 6.29825507 6.28032691 7.44882746 4.27300775
 7.12276336 6.30452166 7.53123188 7.21093009 6.70494646 7.68441164]
1D5GDEL: [3.29752238]
1F3GDEH: [3.58673349]
1F3GDEL: [4.30216862]
2A3GDXX: [2.88724942]
32 inch LED: [2.968283]
Blender: [2.59744664]
Grinder: [3.1041995]
Iron: [3.11565324 3.53463703]
Rice Cooker: [2.90534732 2.87958287]
Final y_pred_all:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [19]:
mean_absolute_error(y_test_all, y_pred_all)

0.09155640576013603

In [20]:
mean_squared_error(y_test_all, y_pred_all)

0.28954779743875875

In [24]:
root_mean_squared_error(y_test_all, y_pred_all)

0.306082253544127

In [25]:
mean_absolute_percentage_error(y_test_all, y_pred_all)

103382487859294.05