In [315]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_absolute_error

In [316]:
sales_data = pd.read_csv('dataset/Project Dataset/sales_data.csv')
amount_data = pd.read_csv('dataset/Project Dataset/amount_data.csv')

In [317]:
sales_data['Date'] = pd.to_datetime(sales_data['Date'], dayfirst = True)
sales_data = sales_data.sort_values(by = 'Date')
sales_data = sales_data.reset_index().drop(columns=['index'])
sales_data

Unnamed: 0,Date,Model,Quantity
0,2022-04-01,18 F AC,0.0
1,2022-04-01,3X7NEXX,0.0
2,2022-04-01,2T5GDEL,0.0
3,2022-04-01,2T5RRLX-GX,0.0
4,2022-04-01,2T5RRLX-XX,0.0
...,...,...,...
93598,2024-04-30,40 inch LED,0.0
93599,2024-04-30,32 inch LED,0.0
93600,2024-04-30,Multiplug,0.0
93601,2024-04-30,1D4GDEH,1.0


In [318]:
amount_data['Date'] = pd.to_datetime(amount_data['Date'], dayfirst = True)
amount_data = amount_data.sort_values(by = 'Date')
amount_data = amount_data.reset_index().drop(columns=['index'])
amount_data

Unnamed: 0,Date,Amount
0,2022-04-01,0
1,2022-04-02,239400
2,2022-04-03,274140
3,2022-04-04,177000
4,2022-04-05,106000
...,...,...
756,2024-04-26,0
757,2024-04-27,582650
758,2024-04-28,367700
759,2024-04-29,451000


In [319]:
sales_data_matrix = sales_data.pivot_table(index = 'Date', columns = 'Model', values = 'Quantity', aggfunc = 'sum', fill_value = 0).reset_index()
data = pd.merge(sales_data_matrix, amount_data, on = 'Date', how = 'inner')

In [320]:
data['day_of_year'] = data['Date'].dt.dayofyear
data['month'] = data['Date'].dt.month
data['day_of_week'] = data['Date'].dt.dayofweek
data = data.drop(columns='Date')
data

Unnamed: 0,12 C AC,12 inch Fan,14 inch Fan,17 inch Fan,18 A AC,18 C AC,18 F AC,1B3GDEL,1B6GDEH,1B6GDEL,...,Stand Fan,Tornedo Fan,Wall Move,Washing Machine,Water Filter,Weight Scale,Amount,day_of_year,month,day_of_week
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,91,4,4
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,239400,92,4,5
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,274140,93,4,6
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,177000,94,4,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,106000,95,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,117,4,4
757,0.0,0.0,3.0,11.0,1.0,4.0,0.0,0.0,0.0,1.0,...,2.0,0.0,0.0,1.0,0.0,0.0,582650,118,4,5
758,1.0,0.0,2.0,20.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,367700,119,4,6
759,0.0,0.0,5.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,451000,120,4,0


In [321]:
# products = data.columns.to_list()[:-4]
# sum = 0
# for product in products:
#     print(product)
#     classification_data = data[['day_of_year', 'month', 'day_of_week', product]].copy()

#     # Defining conditions here
#     conditions = [
#         classification_data[product] > 2,
#         classification_data[product] == 2,
#         classification_data[product] == 1,  
#         classification_data[product] == 0  
#     ]

#     # Defining the corresponding outputs for each condition
#     choices = [3, 2, 1, 0]

#     # Apply np.select to assign values based on the conditions
#     classification_data.loc[:, 'Sale'] = np.select(conditions, choices, default=0)

#     X = classification_data[['day_of_year', 'month', 'day_of_week']]
#     y_class = classification_data['Sale']  # Classification target
#     y_reg = classification_data[product]  # Regression target

#     X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size = 0.2, random_state = 42)

#     clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
#     clf.fit(X_train_class, y_train_class)

#     y_pred_class = clf.predict(X_test_class)
#     # print("Classification Report:\n", classification_report(y_test_class, y_pred_class))

#     y_pred_class_all = clf.predict(X)
#     # print("Classification Report on Full Data:\n", classification_report(y_class, y_pred_class_all))

#     X_train_reg = X[y_class == 3]
#     y_train_reg = y_reg[y_class == 3]

#     if (len(X_train_reg) > 0):
#         # Initialize the SGDRegressor with a small learning rate
#         reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

#         # Scaling the data for regression
#         scaler = StandardScaler()
#         X_train_reg_scaled = scaler.fit_transform(X_train_reg)

#         # Train the regression model with SGDRegressor
#         reg.fit(X_train_reg_scaled, y_train_reg)

#         for i in range(len(y_pred_class_all)):
#             if y_pred_class_all[i] == 3:  # If a sale is predicted
#                 y_pred_class_all[i] = reg.predict([X.iloc[i]])[0]  # Predict the quantity

#     print(mean_absolute_error(y_reg, y_pred_class_all))
#     sum += mean_absolute_error(y_reg, y_pred_class_all)

# print(sum)

In [322]:
classification_data = data[['day_of_year', 'month', 'day_of_week']].copy()

columns_to_concat = []

products = data.columns.to_list()[:-4]

for product in products:
    # Defining conditions here
    conditions = [
        data[product] > 2,
        data[product] == 2,
        data[product] == 1,
        data[product] == 0
    ]

    # Defining the corresponding outputs for each condition
    choices = [3, 2, 1, 0]

    # Apply np.select and store the result in a separate DataFrame
    classification_column = pd.DataFrame({
        product: np.select(conditions, choices, default=0)
    })

    # Append this column to the list of columns to concatenate
    columns_to_concat.append(classification_column)

# Step 3: Concatenate all columns at once
classification_data = pd.concat([classification_data] + columns_to_concat, axis=1)

classification_data

Unnamed: 0,day_of_year,month,day_of_week,12 C AC,12 inch Fan,14 inch Fan,17 inch Fan,18 A AC,18 C AC,18 F AC,...,Room Heater,Ruti Tawa,Speaker,Stabilizer,Stand Fan,Tornedo Fan,Wall Move,Washing Machine,Water Filter,Weight Scale
0,91,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,92,4,5,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,93,4,6,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,94,4,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,95,4,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,117,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
757,118,4,5,0,0,3,3,1,3,0,...,0,0,0,0,2,0,0,1,0,0
758,119,4,6,1,0,2,3,0,1,0,...,0,0,0,0,1,0,0,0,0,0
759,120,4,0,0,0,3,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [323]:
product_columns = classification_data.columns.difference(['day_of_year', 'month', 'day_of_week'])

X = classification_data[['day_of_year', 'month', 'day_of_week']]
y = classification_data[product_columns]  

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [324]:
# Use MultiOutputClassifier to handle multiple outputs
from sklearn.multioutput import MultiOutputClassifier
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
clf.fit(X_train, y_train)

In [325]:
# Prediction
y_pred = clf.predict(X_test)
print(mean_absolute_error(y_test, y_pred))

0.08719910728519048


In [326]:
# y_train_pred = clf.predict(X_train)

# position = 0
# for product in product_columns:
#     X_train_reg = pd.DataFrame(columns = ['day_of_year', 'month', 'day_of_week'])
#     y_train_reg = pd.Series(dtype = 'float64')
#     i = 0
#     for index in X_train.index:
#         if y_train_pred[i][position] == 3:
#             X_train_reg.loc[index] = [X['day_of_year'][index], X['month'][index], X['day_of_week'][index]]
#             y_train_reg[index] = data[product][index]
#         i += 1
    
#     print(X_train_reg)
#     print(y_train_reg)
#     position += 1

In [327]:
# Predict the classification output for the training data
y_train_pred = clf.predict(X_train)

# Create a dictionary to store regression models for each product
regression_models = {}

# Loop through each product
for position, product in enumerate(product_columns):
    
    # Filter the rows where the predicted label is 3 for the current product
    mask = y_train_pred[:, position] == 3
    
    # Select the relevant rows from X_train based on the mask
    X_train_reg = X_train[mask].copy()
    
    # Create the target series (y_train_reg) for regression, based on the original 'data' DataFrame
    y_train_reg = data.loc[X_train_reg.index, product]
    
    if (len(X_train_reg) > 0):
        # Initialize the SGDRegressor with a small learning rate
        reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

        # Scaling the data for regression
        scaler = StandardScaler()
        X_train_reg_scaled = scaler.fit_transform(X_train_reg)

        # Train the regression model with SGDRegressor
        reg.fit(X_train_reg_scaled, y_train_reg)
        
        # Store the regression model
        regression_models[product] = reg

In [328]:
# Saving model to pickle file
with open("model/classifier.pkl", "wb") as file: 
    pickle.dump(clf, file)

In [329]:
# Save the models in one pickle file
with open('model/regressor.pkl', 'wb') as f:
    pickle.dump(regression_models, f)

In [331]:
# Opening saved model
with open("model/classifier.pkl", "rb") as file:
    clf = pickle.load(file)

# Prediction
y_pred = clf.predict(X_test)
print(mean_absolute_error(y_test, y_pred))

0.08719910728519048


In [332]:
with open('model/regressor.pkl', 'rb') as f:
    loaded_models = pickle.load(f)

loaded_models    

{'12 inch Fan': SGDRegressor(random_state=42),
 '14 inch Fan': SGDRegressor(random_state=42),
 '17 inch Fan': SGDRegressor(random_state=42),
 '18 A AC': SGDRegressor(random_state=42),
 '1B6RXXX': SGDRegressor(random_state=42),
 '1D4GDEL': SGDRegressor(random_state=42),
 '1D4GDSH': SGDRegressor(random_state=42),
 '1D5GDEL': SGDRegressor(random_state=42),
 '1F3GDEH': SGDRegressor(random_state=42),
 '1F3GDEL': SGDRegressor(random_state=42),
 '1F3GDSH': SGDRegressor(random_state=42),
 '1F3RXXX': SGDRegressor(random_state=42),
 '24 inch LED': SGDRegressor(random_state=42),
 '2A3GDEL': SGDRegressor(random_state=42),
 '2A3GDSH': SGDRegressor(random_state=42),
 '2A3GDXX': SGDRegressor(random_state=42),
 '2B3GDXX': SGDRegressor(random_state=42),
 '2E0GDEL': SGDRegressor(random_state=42),
 '2T5GDEL': SGDRegressor(random_state=42),
 '32 inch LED': SGDRegressor(random_state=42),
 '3X7GDEL': SGDRegressor(random_state=42),
 'Blender': SGDRegressor(random_state=42),
 'Ceiling Fan': SGDRegressor(rando