In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time
import os

In [2]:
# Seeds for Reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x1d91754fbf0>

In [3]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Data Loading and Feature Engineering
try:
          df = pd.read_csv("ML-Dataset.csv")
except FileNotFoundError:
          print("Error: 'ML-Dataset.csv' not found. Please ensure the file is in the correct path.")
          exit()

In [5]:
TARGET_COLUMN = 'Profit'
y = df[TARGET_COLUMN]
y

0      542.95
1      448.71
2      625.54
3      410.59
4      489.58
        ...  
395    185.63
396    168.84
397    121.04
398    147.64
399     96.81
Name: Profit, Length: 400, dtype: float64

In [7]:
COLUMNS_TO_DROP = [
    'CustomerName', 'CustomerAddress', 'CustomerEmail', 'CustomerPhone',
    'EmployeeName', 'EmployeeEmail', 'EmployeePhone', 'EmployeeHireDate',
    'WarehouseAddress', 'WarehouseName', 'PostalCode',
    'ProductStandardCost', 'ProductListPrice'
]

X = df.drop(COLUMNS_TO_DROP + [TARGET_COLUMN], axis=1)
X

Unnamed: 0,RegionName,CountryName,State,City,EmployeeJobTitle,CategoryName,ProductName,ProductDescription,CustomerCreditLimit,Status,OrderDate,OrderItemQuantity,PerUnitPrice,TotalItemQuantity
0,South America,United States of America,Texas,Southlake,Public Accountant,CPU,Intel Xeon E5-2699 V3 (OEM/Tray),"Speed:2.3GHz,Cores:18,TDP:145W",5000,Shipped,17-Nov-16,132,469.99,122
1,South America,United States of America,Texas,Southlake,Accounting Manager,CPU,Intel Xeon E5-2697 V3,"Speed:2.6GHz,Cores:14",5000,Shipped,20-Feb-17,124,519.99,123
2,South America,United States of America,Texas,Southlake,Administration Assistant,CPU,Intel Xeon E5-2698 V3 (OEM/Tray),Speed:2.3GHz,1200,Canceled,03-Jan-17,92,800.74,123
3,South America,United States of America,Texas,Southlake,President,CPU,Intel Xeon E5-2697 V4,"Cores:18,TDP:145W",2400,Pending,15-Oct-17,128,849.99,124
4,South America,United States of America,Texas,Southlake,Administration Vice President,CPU,Intel Xeon E5-2685 V3 (OEM/Tray),"Speed:2.6GHz,Cores:12",1200,Shipped,09-Apr-17,106,109.99,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,Asia,India,Maharashtra,Bombay,Stock Clerk,Video Card,MSI GTX 1080 TI LIGHTNING Z,Chipset:GeForce GTX 1080 Ti,4900,Shipped,27-Sep-17,32,725.99,107
396,Asia,India,Maharashtra,Bombay,Stock Manager,Video Card,Asus ROG-POSEIDON-GTX1080TI-P11G-GAMING,Chipset:GeForce GTX 1080 Ti,5000,Pending,16-Aug-16,66,798.26,118
397,Asia,India,Maharashtra,Bombay,Stock Manager,Video Card,MSI GTX 1080 TI LIGHTNING X,Chipset:GeForce GTX 1080 Ti,4000,Canceled,27-May-16,82,849.99,118
398,Asia,India,Maharashtra,Bombay,Stock Manager,Video Card,Zotac ZT-P10810A-10P,Chipset:GeForce GTX 1080 Ti,3000,Shipped,27-May-17,157,821.99,95


In [None]:
print(f"Original shape: {df.shape}")

Original shape: (400, 28)


In [9]:
# Feature Engineering (Temporal Data)
X['OrderDate'] = pd.to_datetime(X['OrderDate'], format='%d-%b-%y')
X['Order_Month'] = X['OrderDate'].dt.month
X['Order_Year'] = X['OrderDate'].dt.year
X['Order_Weekday'] = X['OrderDate'].dt.dayofweek
X = X.drop('OrderDate', axis=1)
X

Unnamed: 0,RegionName,CountryName,State,City,EmployeeJobTitle,CategoryName,ProductName,ProductDescription,CustomerCreditLimit,Status,OrderItemQuantity,PerUnitPrice,TotalItemQuantity,Order_Month,Order_Year,Order_Weekday
0,South America,United States of America,Texas,Southlake,Public Accountant,CPU,Intel Xeon E5-2699 V3 (OEM/Tray),"Speed:2.3GHz,Cores:18,TDP:145W",5000,Shipped,132,469.99,122,11,2016,3
1,South America,United States of America,Texas,Southlake,Accounting Manager,CPU,Intel Xeon E5-2697 V3,"Speed:2.6GHz,Cores:14",5000,Shipped,124,519.99,123,2,2017,0
2,South America,United States of America,Texas,Southlake,Administration Assistant,CPU,Intel Xeon E5-2698 V3 (OEM/Tray),Speed:2.3GHz,1200,Canceled,92,800.74,123,1,2017,1
3,South America,United States of America,Texas,Southlake,President,CPU,Intel Xeon E5-2697 V4,"Cores:18,TDP:145W",2400,Pending,128,849.99,124,10,2017,6
4,South America,United States of America,Texas,Southlake,Administration Vice President,CPU,Intel Xeon E5-2685 V3 (OEM/Tray),"Speed:2.6GHz,Cores:12",1200,Shipped,106,109.99,125,4,2017,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,Asia,India,Maharashtra,Bombay,Stock Clerk,Video Card,MSI GTX 1080 TI LIGHTNING Z,Chipset:GeForce GTX 1080 Ti,4900,Shipped,32,725.99,107,9,2017,2
396,Asia,India,Maharashtra,Bombay,Stock Manager,Video Card,Asus ROG-POSEIDON-GTX1080TI-P11G-GAMING,Chipset:GeForce GTX 1080 Ti,5000,Pending,66,798.26,118,8,2016,1
397,Asia,India,Maharashtra,Bombay,Stock Manager,Video Card,MSI GTX 1080 TI LIGHTNING X,Chipset:GeForce GTX 1080 Ti,4000,Canceled,82,849.99,118,5,2016,4
398,Asia,India,Maharashtra,Bombay,Stock Manager,Video Card,Zotac ZT-P10810A-10P,Chipset:GeForce GTX 1080 Ti,3000,Shipped,157,821.99,95,5,2017,5


In [10]:
# Preprocessing using ColumnTransformer (SKLearn)
NUMERICAL_FEATURES = ['CustomerCreditLimit', 'OrderItemQuantity', 'TotalItemQuantity', 'Order_Month', 'Order_Year', 'Order_Weekday']
CATEGORICAL_FEATURES = X.select_dtypes(include=['object']).columns.tolist()

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
          transformers=[
                    ('num', StandardScaler(), NUMERICAL_FEATURES),
                    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CATEGORICAL_FEATURES)
          ],
          remainder = 'passthrough'
)

print("Preprocessing data...")
X_train_np = preprocessor.fit_transform(X_train_raw)
X_test_np = preprocessor.transform(X_test_raw)

Preprocessing data...


In [11]:
# Extracting 20% validation split from training data for callbacks
X_train_data_np, X_val_np, y_train_data_np, y_val_np = train_test_split(
          X_train_np, y_train.values, test_size=0.2, random_state=42
)

In [12]:

input_dim = X_train_np.shape[1]
print(f"Processed feature count (Input Dimension): {input_dim}")

Processed feature count (Input Dimension): 498


In [13]:
X_train_np

array([[ 1.93470588e-01,  1.20495243e+00, -2.06006865e-02, ...,
         1.00000000e+00,  0.00000000e+00,  8.49990000e+02],
       [-8.99981044e-01, -2.74234458e-01, -6.56500237e-02, ...,
         0.00000000e+00,  0.00000000e+00,  8.99990000e+02],
       [-5.78377623e-01, -3.34609433e-01,  1.07559985e+00, ...,
         0.00000000e+00,  0.00000000e+00,  6.68900000e+01],
       ...,
       [-9.64301728e-01,  1.11438997e+00,  7.90287383e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.89999000e+03],
       [ 7.08036062e-01, -3.34609433e-01,  1.31586298e+00, ...,
         0.00000000e+00,  0.00000000e+00,  1.46996000e+03],
       [ 1.93470588e-01, -7.57234259e-01, -1.77752484e+00, ...,
         0.00000000e+00,  1.00000000e+00,  1.46996000e+03]],
      shape=(320, 498))

In [14]:
y_train

3      410.59
18     526.88
202    118.71
250    346.35
274     14.81
        ...  
71     519.86
106    218.37
270      0.00
348      0.00
102    226.48
Name: Profit, Length: 320, dtype: float64

In [15]:
y_train.values

array([4.1059e+02, 5.2688e+02, 1.1871e+02, 3.4635e+02, 1.4810e+01,
       6.3760e+01, 1.4870e+01, 2.0000e+02, 1.6553e+02, 1.1808e+02,
       5.8060e+01, 2.2038e+02, 2.2075e+02, 7.6520e+01, 9.3100e+01,
       3.5000e+02, 4.9720e+01, 4.8692e+02, 1.6900e+01, 5.1551e+02,
       1.7350e+01, 2.2496e+02, 2.3275e+02, 1.6809e+02, 1.2873e+02,
       4.8964e+02, 4.7506e+02, 1.0449e+02, 0.0000e+00, 0.0000e+00,
       2.9647e+02, 4.6196e+02, 2.0000e+02, 4.9157e+02, 2.6500e+02,
       1.3003e+02, 4.1290e+02, 1.3940e+01, 1.9620e+03, 1.7600e+00,
       7.1340e+01, 9.3400e+01, 1.9304e+02, 0.0000e+00, 2.2481e+02,
       1.3006e+02, 0.0000e+00, 5.2688e+02, 1.1454e+02, 2.4380e+01,
       2.5000e+02, 9.1150e+01, 0.0000e+00, 1.6421e+02, 1.8492e+02,
       5.8480e+01, 4.9720e+01, 0.0000e+00, 3.5290e+01, 3.2000e+02,
       1.2720e+02, 1.9496e+02, 2.2500e+02, 1.2478e+02, 1.0218e+02,
       3.8940e+01, 1.1904e+02, 1.7989e+02, 7.8590e+01, 8.7000e+01,
       2.7550e+02, 4.7506e+02, 2.9200e+00, 1.1877e+02, 2.2839e

In [16]:
y_train_data_np

array([6.6980e+01, 0.0000e+00, 2.5982e+02, 9.6810e+01, 1.0547e+02,
       9.3300e+00, 1.4010e+02, 1.0792e+02, 0.0000e+00, 7.4890e+01,
       1.4360e+01, 4.6196e+02, 0.0000e+00, 2.3129e+02, 5.7260e+01,
       4.3360e+01, 1.3886e+02, 1.8670e+01, 3.2000e+02, 4.8327e+02,
       1.4870e+01, 9.1000e+01, 2.9647e+02, 2.3275e+02, 4.2990e+01,
       1.5068e+02, 4.9720e+01, 4.6196e+02, 3.0954e+02, 5.0020e+01,
       8.9630e+01, 6.2430e+01, 1.7600e+00, 0.0000e+00, 2.3940e+02,
       3.1886e+02, 5.1986e+02, 0.0000e+00, 5.0538e+02, 1.1904e+02,
       9.5480e+01, 1.6900e+01, 1.4267e+02, 2.8208e+02, 5.1551e+02,
       2.9835e+02, 2.6850e+02, 1.4764e+02, 1.9200e+02, 1.2050e+01,
       2.9200e+00, 1.0500e+02, 3.5000e+02, 1.2546e+02, 5.8060e+01,
       1.2143e+02, 8.7000e+01, 1.8960e+01, 3.3000e+02, 7.8590e+01,
       9.6630e+01, 1.3940e+01, 1.9596e+02, 4.9720e+01, 6.6520e+01,
       1.7989e+02, 8.4850e+01, 0.0000e+00, 1.8048e+02, 4.4697e+02,
       5.8060e+01, 1.0346e+02, 1.8078e+02, 2.3940e+02, 3.5290e

In [17]:
# PyTorch Data Conversion

# Convert NumPy arrays to PyTorch Tensors
X_train_tensor = torch.tensor(X_train_data_np, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_data_np, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val_np, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_np, dtype=torch.float32).unsqueeze(1)
X_tst_tensor = torch.tensor(X_test_np, dtype=torch.float32)

In [18]:
# Create TensorDatasets and DataLoaders
BATCH_SIZE = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)