In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch.cuda.amp import autocast, GradScaler


In [2]:
# Read the combined CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/house-b-csv/house_b_combined_dataset.csv")

In [3]:
# Build the feature matrix from `df`: drop the 'Hour', 'Resident1' and
# 'Resident2' columns (treated here as non-sensor columns) and keep the
# remaining columns as a NumPy array.
non_sensor_columns = ['Hour', 'Resident1', 'Resident2']
data = df.drop(columns=non_sensor_columns).to_numpy()

In [4]:
# Standardize every feature column (zero mean, unit variance).
# The fitted scaler stays bound to `scaler`; `data` is rebound to the
# standardized matrix.
scaler = StandardScaler()
scaler.fit(data)
data = scaler.transform(data)

In [5]:
# Copy the NumPy feature matrix into a float32 torch tensor
# (torch.tensor always copies its input).
data_tensor = torch.tensor(data, dtype=torch.float32)

In [6]:
# Wrap the tensor in a Dataset and serve it as large, shuffled mini-batches.
dataset = TensorDataset(data_tensor)
batch_size = 1 << 17  # 131,072 rows per batch; lower this if GPU memory runs out
data_loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [7]:
import torch
import torch.nn as nn

class LSTM_Autoencoder(nn.Module):
    """Single-layer LSTM encoder/decoder with a linear read-out.

    The encoder LSTM maps ``input_size`` features to ``hidden_size`` units.
    The decoder LSTM consumes the encoder's per-step outputs, starting from the
    encoder's final hidden/cell state. A linear layer then projects every
    decoder step back to ``input_size``, so the output has the same shape as
    the input.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of both LSTM layers.
        seq_len: Stored as ``self.seq_len``; ``forward`` does not read it.
    """

    def __init__(self, input_size, hidden_size, seq_len):
        super().__init__()
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        # Sub-modules are created in encoder -> decoder -> output order.
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        """Reconstruct ``x``, a batch-first ``(batch, seq, input_size)`` tensor."""
        # One LSTM layer, one direction -> state tensors are (1, batch, hidden).
        state_shape = (1, x.size(0), self.hidden_size)
        zero_state = (
            torch.zeros(state_shape, device=x.device),
            torch.zeros(state_shape, device=x.device),
        )

        # Encode from an all-zero initial state.
        latent_seq, final_state = self.encoder(x, zero_state)

        # Decode the encoder outputs, seeded with the encoder's final state.
        decoder_out, _ = self.decoder(latent_seq, final_state)

        # Project each decoder step back to the input feature dimension.
        return self.output_layer(decoder_out)


In [8]:
# Model hyperparameters
seq_len = 1  # each sample is fed as a single time step; raise this for multi-step sequences
hidden_size = 64  # width of the LSTM layers; tune to the desired model capacity
input_size = data.shape[1]  # one input feature per column of `data`

In [9]:
# Use the GPU when CUDA is available, otherwise the CPU, and build the model there.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = LSTM_Autoencoder(input_size, hidden_size, seq_len)
model = model.to(device)

In [10]:
# Wrap the model in DataParallel so batches can be split across the visible GPUs.
# The isinstance guard keeps this cell idempotent: re-running it must not
# stack a second DataParallel wrapper around an already wrapped model.
if not isinstance(model, nn.DataParallel):
    model = nn.DataParallel(model)

model = model.to(device)

In [11]:
# Adam over all model parameters, trained against a mean-squared-error
# reconstruction objective.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [12]:
# Mixed-precision training loop.
#
# The gradient scaler is bound to `grad_scaler`, not `scaler`, so it does not
# overwrite the fitted StandardScaler that `scaler` already refers to.
# Autocast and loss scaling are only enabled on CUDA; on CPU both become no-ops.
# The device-generic `torch.amp` API replaces the deprecated `torch.cuda.amp` one.
use_amp = device.type == 'cuda'
grad_scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

num_epochs = 300  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for (batch_data,) in data_loader:
        # (batch, features) -> (batch, 1, features): one time step per sample.
        batch_data = batch_data.to(device).unsqueeze(1)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss under autocast.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            output = model(batch_data)
            loss = criterion(output, batch_data)

        # Scale the loss before backward to avoid fp16 gradient underflow.
        grad_scaler.scale(loss).backward()

        # Unscale the gradients and step; the step is skipped on inf/NaN.
        grad_scaler.step(optimizer)
        grad_scaler.update()  # adapt the scale factor for the next iteration

        running_loss += loss.item()

    # Mean of the per-batch mean losses for this epoch.
    avg_loss = running_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


  scaler = GradScaler()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch [1/300], Loss: 0.9816
Epoch [2/300], Loss: 0.9058
Epoch [3/300], Loss: 0.7740
Epoch [4/300], Loss: 0.6337
Epoch [5/300], Loss: 0.5102
Epoch [6/300], Loss: 0.4235
Epoch [7/300], Loss: 0.3637
Epoch [8/300], Loss: 0.3151
Epoch [9/300], Loss: 0.2763
Epoch [10/300], Loss: 0.2456
Epoch [11/300], Loss: 0.2215
Epoch [12/300], Loss: 0.2001
Epoch [13/300], Loss: 0.1812
Epoch [14/300], Loss: 0.1642
Epoch [15/300], Loss: 0.1491
Epoch [16/300], Loss: 0.1354
Epoch [17/300], Loss: 0.1236
Epoch [18/300], Loss: 0.1142
Epoch [19/300], Loss: 0.1056
Epoch [20/300], Loss: 0.0982
Epoch [21/300], Loss: 0.0916
Epoch [22/300], Loss: 0.0854
Epoch [23/300], Loss: 0.0799
Epoch [24/300], Loss: 0.0749
Epoch [25/300], Loss: 0.0700
Epoch [26/300], Loss: 0.0657
Epoch [27/300], Loss: 0.0618
Epoch [28/300], Loss: 0.0581
Epoch [29/300], Loss: 0.0546
Epoch [30/300], Loss: 0.0513
Epoch [31/300], Loss: 0.0484
Epoch [32/300], Loss: 0.0457
Epoch [33/300], Loss: 0.0432
Epoch [34/300], Loss: 0.0408
Epoch [35/300], Loss: 0

In [13]:
import numpy as np
from torch.cuda.amp import autocast

# Compute one reconstruction error per row of the dataset, in dataset order.
model.eval()
reconstruction_errors = []

# `data_loader` shuffles, so iterating it yields errors in random row order.
# The errors are later attached to `df` by position, so score through a
# non-shuffled loader to keep error i aligned with row i.
eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
amp_enabled = device.type == 'cuda'  # autocast only applies on CUDA

with torch.no_grad():
    for (batch_data,) in eval_loader:
        # (batch, features) -> (batch, 1, features), matching the training input.
        batch_data = batch_data.to(device).unsqueeze(1)

        with torch.amp.autocast(device_type=device.type, enabled=amp_enabled):
            output = model(batch_data)

        # Per-sample mean squared error over the (step, feature) axes, in float32.
        error = torch.mean((batch_data - output.float()) ** 2, dim=[1, 2]).cpu().numpy()
        reconstruction_errors.extend(error)

  with autocast():


In [14]:
# Flag every sample whose reconstruction error exceeds the 95th percentile
# of all reconstruction errors.
errors_array = np.array(reconstruction_errors)
threshold = np.percentile(errors_array, 95)
anomalies = errors_array > threshold

In [15]:
# Number of flagged samples: summing a boolean mask counts its True entries.
total_anomalies = anomalies.sum()
total_anomalies

129597

In [16]:
# Number of samples that were not flagged (count of False entries in the mask).
(~anomalies).sum()

2462403

In [17]:
# Attach the boolean flags to the frame as a new "Anomaly" column.
# NOTE: a NumPy array is assigned by position, so `anomalies[i]` must describe
# row i of `df`. That only holds if the reconstruction errors were computed in
# the dataset's original (unshuffled) row order.
df["Anomaly"] = anomalies

In [18]:
# Write the frame to the Kaggle working directory.
# to_csv is called with its default index=True, so the row index is written as
# the first, unnamed column of the file.
df.to_csv("/kaggle/working/house_b_detected_anomalies.csv")

In [19]:
# Total number of rows in the frame.
df.shape[0]

2592000

In [20]:
# Share of rows flagged as anomalous, as a percentage of all rows in `df`.
anomaly_pct = (total_anomalies / len(df)) * 100
print(f"Percentage Anomalies = {anomaly_pct}%")

Percentage Anomalies = 4.999884259259259%
