In [103]:
# Notebook for model training

# This notebook trains and evaluates machine learning models on the preprocessed dataset.

In [104]:
# get the path from config file

import yaml

with open("../../config.yaml", "r") as file:
    config = yaml.safe_load(file)


raw_data_path = "../../" + config["paths"]["raw_data"]
processed_data_path = "../../" + config["paths"]["processed_data"]
learning_rate = float(config["model"]["learning_rate"])
epoch = int(config["model"]["epochs"])

In [105]:
# import the libraries

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


In [106]:
# Load the dataset
df = pd.read_csv(processed_data_path)
df.shape

(284315, 29)

In [107]:
# We will train the model on normal data only (unsupervised learning)
X_train = df.values

In [108]:
# Define the AutoEncoder

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        
        # Encoder part
        self.encoder = nn.Sequential(
            nn.Linear(29, 14),  # Input layer (29 features) -> hidden layer (14 features)
            nn.ReLU(),
            nn.Linear(14, 7),   # Hidden layer -> smaller hidden layer (7 features)
            nn.ReLU(),
            nn.Linear(7, 3),    # Bottleneck layer (compressed representation)
            nn.ReLU()
        )
        
        # Decoder part
        self.decoder = nn.Sequential(
            nn.Linear(3, 7),    # Bottleneck layer -> hidden layer (7 features)
            nn.ReLU(),
            nn.Linear(7, 14),   # Hidden layer -> hidden layer (14 features)
            nn.ReLU(),
            nn.Linear(14, 29),  # Hidden layer -> output layer (29 features)
            nn.Sigmoid()        # To bring the output in range [0,1] (same as input range)
        )
    
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


In [109]:
# Convert data to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)

In [110]:
# Initialize the model, loss function, and optimizer
model = Autoencoder()
criterion = nn.MSELoss()  # Mean Squared Error loss for reconstruction
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [114]:
# Initialize an empty list to store logs
logs = []

# Train the model
for epoch in range(epoch):
    model.train()  # Ensure the model is in training mode

    # Forward pass
    output = model(X_train_tensor) # automatically calls model.forward
    loss = criterion(output, X_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        logs.append({"Epoch": f'[{epoch+1}/{epoch}]', "train_loss": loss.item()})
        print(f'Epoch [{epoch+1}/{epoch}], Loss: {loss.item():.4f}')

# Save logs to CSV
pd.DataFrame(logs).to_csv("../../models/model_logs/autoencoder_training_logs.csv", index=False)
    

Epoch [10/9], Loss: 1.1612
Epoch [20/19], Loss: 1.1333
Epoch [30/29], Loss: 1.1030
Epoch [40/39], Loss: 1.0769


In [None]:
# saving the model

torch.save(model.state_dict(), "../../models/saved_models/autoencoder_model.pth")