In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [2]:
# Mount Google Drive (Colab only) so the files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the prepared train/test CSVs from the mounted Drive folder into DataFrames.
path1, path2 = (
    '/content/drive/MyDrive/VU/ANN/prepared_train_data.csv',
    '/content/drive/MyDrive/VU/ANN/prepared_test_data.csv',
)
train_df, test_df = map(pd.read_csv, (path1, path2))

# Dimensionality reduction:

In [4]:
# Separate the regression target from the features; 'id' is an identifier, not a feature.
Y_train = train_df['sales']
X_train = train_df.drop(columns=['sales', 'id'])

# Features as a float32 tensor with a trailing singleton axis: (rows, features, 1).
X_tensor = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(-1)
print(X_tensor.shape)


torch.Size([3052566, 88, 1])


In [5]:
# Batch size chosen to match the number of rows per store-item series.
BATCH_SIZE = 1713  # each store-item has 1713 entries

# Pair every feature row with its sales value.
sales_tensor = torch.tensor(Y_train.values, dtype=torch.float32)
dataset = TensorDataset(X_tensor, sales_tensor)

# Training loader. NOTE: shuffle=True means batches are random row samples, so
# iteration order does not follow the row order of train_df.
data_loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [6]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with batch normalization on the hidden layers.

    The input is flattened to ``sequence_length * input_channels`` values per
    sample, compressed through 256 -> 128 -> 64 -> ``encoded_dim`` units and
    reconstructed through the mirrored decoder.

    Args:
        input_channels: Size of the second input axis.
        sequence_length: Size of the third input axis.
        encoded_dim: Width of the bottleneck (latent code).

    Layer attribute names and construction order are kept stable so that
    previously saved ``state_dict`` checkpoints still load and seeded
    initialisation is unchanged.
    """

    def __init__(self, input_channels, sequence_length, encoded_dim=16):
        super().__init__()

        flat_dim = sequence_length * input_channels

        # Activation Function
        self.relu = nn.ReLU()

        # Encoder Layers with Batch Normalization
        self.fc1_enc = nn.Linear(flat_dim, 256)
        self.bn1_enc = nn.BatchNorm1d(256)
        self.fc2_enc = nn.Linear(256, 128)
        self.bn2_enc = nn.BatchNorm1d(128)
        self.fc3_enc = nn.Linear(128, 64)
        self.bn3_enc = nn.BatchNorm1d(64)
        self.fc4_enc = nn.Linear(64, encoded_dim)

        # Decoder Layers with Batch Normalization
        self.fc1_dec = nn.Linear(encoded_dim, 64)
        self.bn1_dec = nn.BatchNorm1d(64)
        self.fc2_dec = nn.Linear(64, 128)
        self.bn2_dec = nn.BatchNorm1d(128)
        self.fc3_dec = nn.Linear(128, 256)
        self.bn3_dec = nn.BatchNorm1d(256)
        self.fc4_dec = nn.Linear(256, flat_dim)

    def encode(self, x):
        """Map a batch of inputs to latent codes of shape ``(batch, encoded_dim)``."""
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(x.size(0), -1)
        x = self.relu(self.bn1_enc(self.fc1_enc(x)))
        x = self.relu(self.bn2_enc(self.fc2_enc(x)))
        x = self.relu(self.bn3_enc(self.fc3_enc(x)))
        # No activation on the bottleneck: the code is an unconstrained linear output.
        return self.fc4_enc(x)

    def decode(self, encoded):
        """Reconstruct flattened inputs ``(batch, flat_dim)`` from latent codes."""
        x = self.relu(self.bn1_dec(self.fc1_dec(encoded)))
        x = self.relu(self.bn2_dec(self.fc2_dec(x)))
        x = self.relu(self.bn3_dec(self.fc3_dec(x)))
        return self.fc4_dec(x)

    def forward(self, x):
        """Return the flattened reconstruction of ``x``; shape ``(batch, flat_dim)``."""
        return self.decode(self.encode(x))


In [7]:
# Pick the compute device: CUDA when present, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the model from the feature tensor: axis 1 has 88 entries, axis 2 has 1.
input_channels, sequence_length = X_tensor.shape[1:]

# Build the autoencoder and place its parameters on the chosen device.
model = Autoencoder(input_channels, sequence_length)
model = model.to(device)

In [8]:
### Train the autoencoder

# Reconstruction objective: mean squared error between output and flattened input.
criterion = nn.MSELoss()

# Adam over every model parameter.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Epoch loop; the mean per-batch loss is reported after each epoch.
num_epochs = 8
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for features, _sales in data_loader:  # targets are not needed for reconstruction
        features = features.to(device)
        # The model emits (batch, features), so compare against the flattened input.
        flat_features = features.view(features.size(0), -1)

        optimizer.zero_grad()
        loss = criterion(model(features), flat_features)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {running_loss/len(data_loader):.4f}")

# Persist the trained weights so later cells can reload them.
torch.save(model.state_dict(), 'model_weights.pth')

Epoch [1/8] Loss: 0.0220
Epoch [2/8] Loss: 0.0118
Epoch [3/8] Loss: 0.0099
Epoch [4/8] Loss: 0.0074
Epoch [5/8] Loss: 0.0042
Epoch [6/8] Loss: 0.0028
Epoch [7/8] Loss: 0.0024
Epoch [8/8] Loss: 0.0022


Encode training data:

In [9]:
# Reload the weights written by the training cell and switch to inference mode.
# In eval mode the BatchNorm layers normalize with their running statistics
# instead of per-batch statistics (they are not skipped).
model.load_state_dict(torch.load('model_weights.pth', map_location=device))
model.eval()

# Encode with a NON-shuffled loader. The training loader was built with
# shuffle=True, so iterating it here would yield rows in random order and the
# codes would no longer line up with train_df['id'] / Y_train when they are
# concatenated column-wise afterwards.
encode_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

encoded_batches = []
with torch.no_grad():
    for data, _targets in encode_loader:  # targets are not needed for encoding
        data = data.to(device)
        encoded_batches.append(model.encode(data).cpu())

# Stitch the per-batch codes into one (rows, encoded_dim) tensor, in dataset order.
encoded_data = torch.cat(encoded_batches, dim=0)
assert encoded_data.shape[0] == len(dataset), "encoded row count must match the dataset"

print(encoded_data.shape)

torch.Size([3052566, 16])


Encode test data:

In [10]:
# NOTE(review): this only re-displays the shape of the encoded *training* data;
# no visible cell passes test_df through model.encode — TODO add test encoding.
encoded_data.shape

torch.Size([3052566, 16])

# Storing encoded data:

In [11]:
# encoded_data is already one (rows, encoded_dim) tensor, so convert it in a
# single call; stacking it row by row builds millions of tiny arrays for the
# same result.
encoded_data_array = encoded_data.detach().cpu().numpy()

# Convert numpy array to DataFrame (columns are the integer code positions).
encoded_df = pd.DataFrame(encoded_data_array)

# The column-wise concat below pairs rows purely by position, so the encoded
# rows must be in the same order (and number) as train_df; a length mismatch
# would otherwise be silently padded with NaN.
assert len(encoded_df) == len(train_df), "encoded rows do not match train_df rows"

# Append the 'id' and 'sales' columns next to the encoded features.
encoded_df_plus_sales = pd.concat(
    [
        encoded_df,
        train_df['id'].reset_index(drop=True),
        Y_train.reset_index(drop=True),
    ],
    axis=1,
)

# Save to CSV
encoded_df_plus_sales.to_csv('/content/drive/MyDrive/VU/ANN/encoded_data.csv', index=False)