In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch.cuda.amp import autocast, GradScaler


In [14]:
# Read the combined House A dataset from the Kaggle input directory.
csv_path = "/kaggle/input/house-a/house_a_combined_dataset.csv"
df = pd.read_csv(csv_path)

In [15]:
# Build the feature matrix from `df`: the 'Hour', 'Resident1' and 'Resident2'
# columns are treated as non-sensor data and left out; every remaining column
# is kept as-is.
non_sensor_columns = ['Hour', 'Resident1', 'Resident2']
sensor_frame = df.drop(columns=non_sensor_columns)
data = sensor_frame.values

In [16]:
# Standardize the feature matrix column by column. Fitting and transforming are
# written as two explicit steps on the same array.
scaler = StandardScaler()
scaler.fit(data)
data = scaler.transform(data)

In [17]:
# Make a float32, C-ordered copy of the feature matrix and expose it as a tensor.
data_float32 = np.array(data, dtype=np.float32, order='C')
data_tensor = torch.from_numpy(data_float32)

In [18]:
# Wrap the tensor in a dataset and serve it in shuffled mini-batches.
batch_size = 1 << 17  # 131072 rows per batch; lower this if GPU memory is tight
dataset = TensorDataset(data_tensor)
data_loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [19]:
import torch
import torch.nn as nn

class LSTM_Autoencoder(nn.Module):
    """LSTM encoder/decoder that reconstructs its input sequence.

    The encoder LSTM maps ``input_size`` features to ``hidden_size``. The
    decoder LSTM consumes the encoder's per-step outputs, starting from the
    encoder's final hidden/cell state, and a linear layer projects every
    decoded step back to ``input_size``.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of both LSTM layers.
        seq_len: Stored on the instance only; ``forward`` does not read it.
    """

    def __init__(self, input_size, hidden_size, seq_len):
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, input_size)
        self.hidden_size = hidden_size
        self.seq_len = seq_len

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor of shape (batch, seq, input_size); both LSTMs are
                built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, seq, input_size) holding the
            reconstruction of ``x``.
        """
        batch = x.size(0)

        # Zero initial states of shape (num_layers=1, batch, hidden_size).
        # new_zeros allocates them directly on x's device and with x's dtype,
        # so there is no host-to-device copy and the model keeps working when
        # it is cast to another floating dtype (e.g. .double()).
        h0 = x.new_zeros(1, batch, self.hidden_size)
        c0 = x.new_zeros(1, batch, self.hidden_size)

        # Encode: per-step outputs plus the final hidden/cell state.
        encoded, (hn, cn) = self.encoder(x, (h0, c0))

        # Decode the encoder outputs, seeded with the encoder's final state.
        decoded, _ = self.decoder(encoded, (hn, cn))

        # Project every decoded step back to the input feature size.
        return self.output_layer(decoded)


In [20]:
# Model hyperparameters
seq_len = 1                  # one time step per sample; raise for multi-step sequences
hidden_size = 64             # LSTM width; tune to the capacity the model needs
input_size = data.shape[-1]  # one input feature per column of `data`

In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU, then
# build the autoencoder and place it on that device.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = LSTM_Autoencoder(input_size=input_size, hidden_size=hidden_size, seq_len=seq_len)
model = model.to(device)

In [22]:
# Wrap the model for multi-GPU data parallelism.
# The isinstance guard keeps this cell idempotent: re-running it must not nest
# one DataParallel wrapper inside another.
if not isinstance(model, nn.DataParallel):
    model = nn.DataParallel(model)

model = model.to(device)

In [23]:
# Adam optimizer over the model parameters, and mean squared error as the
# reconstruction loss.
learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

In [24]:
# Mixed-precision training loop.
#
# The gradient scaler gets its own name (`grad_scaler`) so it does not overwrite
# the fitted StandardScaler that an earlier cell bound to `scaler`.
# Autocast and gradient scaling are only switched on when running on CUDA; on
# CPU both are disabled and the loop trains in plain float32.
use_amp = device.type == 'cuda'
grad_scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

num_epochs = 300  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for (batch_data,) in data_loader:
        # (batch, features) -> (batch, 1, features): each row becomes a
        # length-1 sequence for the batch_first LSTM.
        batch_data = batch_data.to(device).unsqueeze(1)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad(set_to_none=True)

        # Forward pass and loss under autocast for mixed precision.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            output = model(batch_data)
            loss = criterion(output, batch_data)

        # Scale the loss before backprop to avoid float16 gradient underflow,
        # then let the scaler unscale and apply the optimizer step.
        grad_scaler.scale(loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()  # adjust the scale factor for the next iteration

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = running_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


  scaler = GradScaler()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch [1/300], Loss: 0.9786
Epoch [2/300], Loss: 0.8998
Epoch [3/300], Loss: 0.7578
Epoch [4/300], Loss: 0.5766
Epoch [5/300], Loss: 0.4203
Epoch [6/300], Loss: 0.3072
Epoch [7/300], Loss: 0.2293
Epoch [8/300], Loss: 0.1735
Epoch [9/300], Loss: 0.1344
Epoch [10/300], Loss: 0.1073
Epoch [11/300], Loss: 0.0885
Epoch [12/300], Loss: 0.0756
Epoch [13/300], Loss: 0.0658
Epoch [14/300], Loss: 0.0579
Epoch [15/300], Loss: 0.0515
Epoch [16/300], Loss: 0.0460
Epoch [17/300], Loss: 0.0412
Epoch [18/300], Loss: 0.0371
Epoch [19/300], Loss: 0.0336
Epoch [20/300], Loss: 0.0303
Epoch [21/300], Loss: 0.0275
Epoch [22/300], Loss: 0.0251
Epoch [23/300], Loss: 0.0229
Epoch [24/300], Loss: 0.0210
Epoch [25/300], Loss: 0.0193
Epoch [26/300], Loss: 0.0178
Epoch [27/300], Loss: 0.0164
Epoch [28/300], Loss: 0.0152
Epoch [29/300], Loss: 0.0141
Epoch [30/300], Loss: 0.0131
Epoch [31/300], Loss: 0.0122
Epoch [32/300], Loss: 0.0114
Epoch [33/300], Loss: 0.0107
Epoch [34/300], Loss: 0.0100
Epoch [35/300], Loss: 0

In [25]:
import numpy as np
from torch.cuda.amp import autocast

# Score every row with its reconstruction error (per-sample mean squared error).
#
# The training `data_loader` shuffles, so iterating it here would return the
# errors in random order and any later positional assignment back onto `df`
# would flag the wrong rows. A separate loader with shuffle=False keeps the
# i-th error aligned with the i-th row of `dataset`.
eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
use_amp = device.type == 'cuda'  # autocast is only enabled on CUDA

model.eval()
batch_errors = []

with torch.no_grad():
    for (batch_data,) in eval_loader:
        # (batch, features) -> (batch, 1, features), matching the training input.
        batch_data = batch_data.to(device).unsqueeze(1)

        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            output = model(batch_data)

        # Average the squared error over the sequence and feature axes,
        # leaving one value per sample. The output is cast to float32 so the
        # subtraction is explicit about its precision.
        error = torch.mean((batch_data - output.float()) ** 2, dim=[1, 2])
        batch_errors.append(error.cpu().numpy())

# One float per dataset row, in dataset order.
if batch_errors:
    reconstruction_errors = np.concatenate(batch_errors)
else:
    reconstruction_errors = np.empty(0, dtype=np.float32)

  with autocast():


In [26]:
# Flag every sample whose reconstruction error lies strictly above the
# 95th percentile of all errors.
errors_array = np.array(reconstruction_errors)
threshold = np.percentile(reconstruction_errors, 95)
anomalies = np.greater(errors_array, threshold)

In [33]:
# Number of samples flagged as anomalous (True entries of the boolean mask).
total_anomalies = anomalies.sum()
total_anomalies

129600

In [28]:
# Number of samples that were not flagged (False entries of the boolean mask).
(~anomalies).sum()

2462400

In [29]:
# Attach the boolean mask to the frame as an 'Anomaly' column, row by row in
# the frame's existing index order.
anomaly_column = pd.Series(anomalies, index=df.index)
df["Anomaly"] = anomaly_column

In [30]:
# Write the frame to the Kaggle working directory as CSV.
output_path = "/kaggle/working/house_a_detected_anomalies.csv"
df.to_csv(output_path)

In [31]:
# Total number of rows in the frame.
df.shape[0]

2592000

In [34]:
# Share of rows flagged as anomalous, expressed as a percentage of all rows.
anomaly_percentage = (total_anomalies / len(df)) * 100
print(f"Percentage Anomalies = {anomaly_percentage}%")

Percentage Anomalies = 5.0%
