In [1754]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_absolute_error


In [1755]:
sales_data = pd.read_csv('dataset/Project Dataset/sales_data.csv')
amount_data = pd.read_csv('dataset/Project Dataset/amount_data.csv')

In [1756]:
sales_data['Date'] = pd.to_datetime(sales_data['Date'], dayfirst = True)
sales_data = sales_data.sort_values(by = 'Date')
sales_data = sales_data.reset_index().drop(columns=['index'])
sales_data

Unnamed: 0,Date,Model,Quantity
0,2022-04-01,18 F AC,0.0
1,2022-04-01,3X7NEXX,0.0
2,2022-04-01,2T5GDEL,0.0
3,2022-04-01,2T5RRLX-GX,0.0
4,2022-04-01,2T5RRLX-XX,0.0
...,...,...,...
93598,2024-04-30,40 inch LED,0.0
93599,2024-04-30,32 inch LED,0.0
93600,2024-04-30,Multiplug,0.0
93601,2024-04-30,1D4GDEH,1.0


In [1757]:
amount_data['Date'] = pd.to_datetime(amount_data['Date'], dayfirst = True)
amount_data = amount_data.sort_values(by = 'Date')
amount_data = amount_data.reset_index().drop(columns=['index'])
amount_data

Unnamed: 0,Date,Amount
0,2022-04-01,0
1,2022-04-02,239400
2,2022-04-03,274140
3,2022-04-04,177000
4,2022-04-05,106000
...,...,...
756,2024-04-26,0
757,2024-04-27,582650
758,2024-04-28,367700
759,2024-04-29,451000


In [1758]:
sales_data_matrix = sales_data.pivot_table(index = 'Date', columns = 'Model', values = 'Quantity', aggfunc = 'sum', fill_value = 0).reset_index()
sales_data_matrix
data = pd.merge(sales_data_matrix, amount_data, on = 'Date', how = 'inner')

In [1759]:
data['day_of_year'] = data['Date'].dt.dayofyear
data['month'] = data['Date'].dt.month
data['day_of_week'] = data['Date'].dt.dayofweek
data = data.drop(columns='Date')
data

Unnamed: 0,12 C AC,12 inch Fan,14 inch Fan,17 inch Fan,18 A AC,18 C AC,18 F AC,1B3GDEL,1B6GDEH,1B6GDEL,...,Stand Fan,Tornedo Fan,Wall Move,Washing Machine,Water Filter,Weight Scale,Amount,day_of_year,month,day_of_week
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,91,4,4
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,239400,92,4,5
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,274140,93,4,6
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,177000,94,4,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,106000,95,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,117,4,4
757,0.0,0.0,3.0,11.0,1.0,4.0,0.0,0.0,0.0,1.0,...,2.0,0.0,0.0,1.0,0.0,0.0,582650,118,4,5
758,1.0,0.0,2.0,20.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,367700,119,4,6
759,0.0,0.0,5.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,451000,120,4,0


In [1760]:
product = '18 C AC'

classification_data = data[['day_of_year', 'month', 'day_of_week', product]].copy()
# Define conditions
conditions = [
    classification_data[product] > 2,
    classification_data[product] == 2,
    classification_data[product] == 1,  
    classification_data[product] == 0  
]

# Define the corresponding outputs for each condition
choices = [
    3,
    2,
    1,
    0
]

# Apply np.select to assign values based on the conditions
classification_data.loc[:, 'Sale'] = np.select(conditions, choices, default=0)
classification_data

Unnamed: 0,day_of_year,month,day_of_week,18 C AC,Sale
0,91,4,4,0.0,0
1,92,4,5,0.0,0
2,93,4,6,1.0,1
3,94,4,0,0.0,0
4,95,4,1,0.0,0
...,...,...,...,...,...
756,117,4,4,0.0,0
757,118,4,5,4.0,3
758,119,4,6,1.0,1
759,120,4,0,0.0,0


In [1761]:
X = classification_data[['day_of_year', 'month', 'day_of_week']]
y_class = classification_data['Sale']  # Classification target
y_reg = classification_data[product]  # Regression target

In [1762]:
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X, y_class, test_size = 0.2, random_state = 42)

In [1763]:
clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
clf.fit(X_train_class, y_train_class)

In [1764]:
y_pred_class = clf.predict(X_test_class)
print("Classification Report:\n", classification_report(y_test_class, y_pred_class))

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93       138
           1       0.17      0.09      0.12        11
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         3

    accuracy                           0.88       153
   macro avg       0.27      0.26      0.26       153
weighted avg       0.83      0.88      0.85       153



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1765]:
y_pred_class_all = clf.predict(X)
print("Classification Report on Full Data:\n", classification_report(y_class, y_pred_class_all))

Classification Report on Full Data:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       716
           1       0.85      0.74      0.79        38
           2       1.00      0.75      0.86         4
           3       0.00      0.00      0.00         3

    accuracy                           0.98       761
   macro avg       0.71      0.62      0.66       761
weighted avg       0.97      0.98      0.97       761



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1766]:
X_train_reg = X[y_class == 3]
y_train_reg = y_reg[y_class == 3]

In [1767]:
X_train_reg

Unnamed: 0,day_of_year,month,day_of_week
380,106,4,6
618,344,12,6
757,118,4,5


In [1768]:
y_train_reg

380    3.0
618    4.0
757    4.0
Name: 18 C AC, dtype: float64

In [1769]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_train_reg, y_train_reg, test_size = 0.2, random_state = 42)

In [1770]:
# Initialize the SGDRegressor with a small learning rate
reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

# Scaling the data for regression
scaler = StandardScaler()
X_train_reg_scaled = scaler.fit_transform(X_train_reg)
X_test_reg_scaled = scaler.transform(X_test_reg)

# Train the regression model with SGDRegressor
reg.fit(X_train_reg_scaled, y_train_reg)

In [1771]:
for i in range(len(y_pred_class_all)):
    if y_pred_class_all[i] == 3:  # If a sale is predicted
        y_pred_class_all[i] = reg.predict([X.iloc[i]])[0]  # Predict the quantity

print(mean_absolute_error(y_reg, y_pred_class_all))

0.03679369250985545
