In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch.cuda.amp import autocast, GradScaler

In [2]:
# Load the combined House A dataset (name taken from the file path) into `df`.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview look
# like row indices written out by earlier to_csv calls - TODO confirm.
# NOTE(review): absolute Kaggle path; consider a configurable data directory.
df = pd.read_csv("/kaggle/input/house-a/house_a_combined_dataset.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,photocell_wardrobe,photocell_couch,ir_tv_receiver,force_couch_1,force_couch_2,distance_chair_1,distance_chair_2,photocell_fridge,photocell_kitchen_drawer,...,contact_shower_door,sonar_hall,sonar_kitchen,distance_tap,distance_water_closet,temperature_kitchen,force_bed,Resident1,Resident2,Hour
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,17,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,17,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,17,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,17,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591995,86395,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,12,2,23
2591996,86396,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,12,2,23
2591997,86397,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,12,2,23
2591998,86398,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,12,2,23


In [5]:
# Tag every row with its 1-based week number.
# Assumes one row per second in chronological order (86400 rows per day), the
# same assumption the original slice-based version relied on.
# Computed from the row position in one vectorised step, so it works for any
# number of days (no hard-coded 30-day horizon) and no boundary row is written
# twice. Stored as float to match the dtype the column had before.
SECONDS_PER_DAY = 86400
DAYS_PER_WEEK = 7
row_position = np.arange(len(df))
df["Week"] = (row_position // (DAYS_PER_WEEK * SECONDS_PER_DAY) + 1).astype(float)

In [6]:
# Tag every row with a 1-based day-of-week counter (1..7, repeating).
# Day 1 is simply the first recorded day, not necessarily a Monday.
# Assumes one row per second in chronological order (86400 rows per day).
# Derived from the row position so a trailing partial day is labelled too
# instead of being left as NaN. Stored as float to match the previous dtype.
rows_per_day = 86400
day_number = np.arange(len(df)) // rows_per_day
df["Day Of Week"] = (day_number % 7 + 1).astype(float)
df

Unnamed: 0.1,Unnamed: 0,photocell_wardrobe,photocell_couch,ir_tv_receiver,force_couch_1,force_couch_2,distance_chair_1,distance_chair_2,photocell_fridge,photocell_kitchen_drawer,...,sonar_kitchen,distance_tap,distance_water_closet,temperature_kitchen,force_bed,Resident1,Resident2,Hour,Week,Day Of Week
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12,17,0,1.0,1.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12,17,0,1.0,1.0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12,17,0,1.0,1.0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12,17,0,1.0,1.0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12,17,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591995,86395,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,12,2,23,5.0,2.0
2591996,86396,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,12,2,23,5.0,2.0
2591997,86397,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,12,2,23,5.0,2.0
2591998,86398,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,12,2,23,5.0,2.0


In [8]:
# Build the model input matrix from `df`.
# Only the index-like columns are removed. Resident1/Resident2, Hour, Week and
# Day Of Week are kept and therefore become model features alongside the sensors.
# NOTE(review): the recorded output of this cell lists 25 columns and no
# 'Unnamed: 0.1', although no visible cell removes it - presumably a cell that
# is no longer in the notebook did. Dropping both names here (and ignoring any
# that are already gone) gives the same 25 columns on a fresh Run All.
index_like_columns = ['Unnamed: 0.1', 'Unnamed: 0']
data = df.drop(columns=index_like_columns, errors='ignore')
print(data.columns)
data = data.values

Index(['photocell_wardrobe', 'photocell_couch', 'ir_tv_receiver',
       'force_couch_1', 'force_couch_2', 'distance_chair_1',
       'distance_chair_2', 'photocell_fridge', 'photocell_kitchen_drawer',
       'photocell_wardrobe_2', 'photocell_bathroom_cabinet',
       'contact_house_door', 'contact_bathroom_door', 'contact_shower_door',
       'sonar_hall', 'sonar_kitchen', 'distance_tap', 'distance_water_closet',
       'temperature_kitchen', 'force_bed', 'Resident1', 'Resident2', 'Hour',
       'Week', 'Day Of Week'],
      dtype='object')


In [9]:
# Standardise every column of `data` to zero mean / unit variance.
# The fitted scaler is kept in `sc` because the same scaling must be applied to
# any sample scored later.
sc = StandardScaler()
sc.fit(data)
data = sc.transform(data)

In [10]:
from joblib import dump

# Persist the fitted scaler so the identical scaling can be re-applied to new
# samples at inference time (it is loaded again further down the notebook).
dump(sc, "/kaggle/working/standardscaler_ha.joblib")

['/kaggle/working/standardscaler_ha.joblib']

In [11]:
# Convert the standardised matrix to a float32 tensor
# (one row per sample, one column per feature).
data_tensor = torch.tensor(data, dtype=torch.float32)

In [29]:
# Create Dataset and DataLoader for batch processing
batch_size = 2**17  # 131072 rows per batch; adjust based on GPU memory availability
dataset = TensorDataset(data_tensor)
# shuffle=True: batches are drawn in random order, so anything collected while
# iterating this loader is NOT aligned with the row order of `df`.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [30]:
import torch
import torch.nn as nn

class LSTM_Autoencoder(nn.Module):
    """LSTM encoder/decoder that is trained to reconstruct its own input.

    The encoder LSTM maps ``input_size`` features to ``hidden_size`` per time
    step. The decoder LSTM consumes the encoder's per-step outputs, seeded with
    the encoder's final hidden/cell state. A linear layer then projects each
    step back to ``input_size``.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of both LSTM layers.
        seq_len: Stored on the instance for reference; ``forward`` does not
            read it and accepts any sequence length.
    """

    def __init__(self, input_size, hidden_size, seq_len):
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, input_size)
        self.hidden_size = hidden_size
        self.seq_len = seq_len

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor of shape (batch, seq, input_size) (``batch_first=True``).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Zero initial states, created directly on the input's device and dtype
        # so no host-to-device copy is needed on every call.
        state_shape = (1, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        # Encode: per-step outputs plus the final (hidden, cell) state
        encoded, (hn, cn) = self.encoder(x, (h0, c0))

        # Decode the encoder outputs, starting from the encoder's final state
        decoded, _ = self.decoder(encoded, (hn, cn))

        # Project every step back to the input feature space
        return self.output_layer(decoded)


In [31]:
# Model Hyperparameters
input_size = data.shape[1]  # Number of columns in `data` (25 in the listing above: sensors plus Resident1/2, Hour, Week, Day Of Week)
hidden_size = 256  # Width of both LSTM layers; adjust based on the model complexity you need
seq_len = 1  # Each row is fed as a length-1 sequence; the value is stored on the model but its forward pass never reads it

In [32]:
# Initialize the model and move it to the GPU when CUDA is available
# (otherwise it stays on the CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM_Autoencoder(input_size=input_size, hidden_size=hidden_size, seq_len=seq_len).to(device)

In [33]:
# Wrap the model in DataParallel so batches can be split across GPUs.
# Guarded so that re-running this cell does not nest a second wrapper around
# the first, which would add another "module." level to every state_dict key.
if not isinstance(model, nn.DataParallel):
    model = nn.DataParallel(model)

model = model.to(device)

In [34]:
# Loss and Optimizer
criterion = nn.MSELoss()  # Mean squared error between the input and its reconstruction
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam over all model parameters

In [35]:
# Mixed-precision training loop for the autoencoder (the target is the input).
# torch.amp replaces the deprecated torch.cuda.amp entry points; AMP is only
# switched on when the model actually runs on a CUDA device.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

num_epochs = 100  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for (batch_data,) in data_loader:
        # (batch, features) -> (batch, 1, features): each row is a length-1 sequence
        batch_data = batch_data.to(device).unsqueeze(1)

        # Start every step from clean gradients
        optimizer.zero_grad()

        # Forward pass and loss under autocast for mixed precision
        with torch.amp.autocast("cuda", enabled=use_amp):
            output = model(batch_data)
            loss = criterion(output, batch_data)

        # Scale the loss to prevent fp16 gradient underflow, then backpropagate
        scaler.scale(loss).backward()

        # Unscale and apply the update, then adapt the scale factor
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = running_loss / len(data_loader)
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


  scaler = GradScaler()
  with autocast():


Epoch [5/100], Loss: 0.0283
Epoch [10/100], Loss: 0.0040
Epoch [15/100], Loss: 0.0027
Epoch [20/100], Loss: 0.0021
Epoch [25/100], Loss: 0.0016
Epoch [30/100], Loss: 0.0012
Epoch [35/100], Loss: 0.0009
Epoch [40/100], Loss: 0.0007
Epoch [45/100], Loss: 0.0005
Epoch [50/100], Loss: 0.0004
Epoch [55/100], Loss: 0.0004
Epoch [60/100], Loss: 0.0003
Epoch [65/100], Loss: 0.0003
Epoch [70/100], Loss: 0.0002
Epoch [75/100], Loss: 0.0002
Epoch [80/100], Loss: 0.0002
Epoch [85/100], Loss: 0.0002
Epoch [90/100], Loss: 0.0001
Epoch [95/100], Loss: 0.0001
Epoch [100/100], Loss: 0.0001


In [36]:
import numpy as np
from torch.cuda.amp import autocast

# Score every row with its reconstruction error.
# The pass must visit the rows in their original order because the resulting
# flags are later attached to `df` by position; the training loader shuffles,
# so a separate unshuffled loader is used here.
eval_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
use_amp = device.type == "cuda"

model.eval()
batch_errors = []

with torch.no_grad():
    for (batch_data,) in eval_loader:
        # (batch, features) -> (batch, 1, features), as in training
        batch_data = batch_data.to(device).unsqueeze(1)

        with torch.amp.autocast("cuda", enabled=use_amp):
            output = model(batch_data)

        # Per-sample mean squared error over the sequence and feature axes,
        # computed in float32
        sample_error = torch.mean((batch_data - output.float()) ** 2, dim=[1, 2])
        batch_errors.append(sample_error.cpu().numpy())

# One float32 error per row of the dataset, in dataset order
reconstruction_errors = (
    np.concatenate(batch_errors) if batch_errors else np.empty(0, dtype=np.float32)
)

  with autocast():


In [37]:
# Flag the samples whose reconstruction error lies above the 95th percentile
# of all errors.
errors_array = np.array(reconstruction_errors)
threshold = np.percentile(errors_array, 95)

anomalies = errors_array > threshold

In [38]:
# Number of flagged samples (True entries of the boolean mask)
total_anomalies = anomalies.sum()
total_anomalies

129581

In [39]:
# Number of samples that were not flagged
(~anomalies).sum()

2462419

In [40]:
# Attach the flags to the source rows. The assignment is positional, so
# anomalies[i] must describe row i of `df`.
# NOTE(review): this only holds if `reconstruction_errors` was collected in
# `df` row order (i.e. from an unshuffled pass over the data) - confirm.
df["Anomaly"] = anomalies

In [41]:
# Write the frame (including the added Week, Day Of Week and Anomaly columns)
# to the working directory; index=False leaves the row index out of the file.
df.to_csv("/kaggle/working/house_a_detected_anomalies.csv",index=False)

In [42]:
len(df)

2592000

In [43]:
# Share of rows flagged. The threshold is the 95th percentile of these same
# errors, so roughly 5% is flagged by construction rather than as a finding.
print(f"Percentage Anomalies = {(total_anomalies/len(df))*100}%")

Percentage Anomalies = 4.999266975308642%


In [44]:
# Save the weights of the underlying network rather than of the DataParallel
# wrapper, so the checkpoint keys carry no "module." prefix and load directly
# into a plain LSTM_Autoencoder.
model_to_save = model.module if isinstance(model, nn.DataParallel) else model
torch.save(model_to_save.state_dict(), "/kaggle/working/lstm_autoencoder_ha.pth")

# Keep the per-sample reconstruction errors so the threshold can be recomputed later
np.save("/kaggle/working/reconstruction_errors_ha.npy", reconstruction_errors)

<h1>Testing the trained model on custom data</h1>


In [45]:
import torch
# Remove "module." prefix if present
from collections import OrderedDict


class LSTM_Autoencoder(nn.Module):
    """LSTM encoder/decoder used to reconstruct its input at inference time.

    The attribute names (``encoder``, ``decoder``, ``output_layer``) determine
    the state_dict keys, so they must match the network the checkpoint was
    saved from.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of both LSTM layers.
        seq_len: Stored on the instance for reference; ``forward`` does not
            read it and accepts any sequence length.
    """

    def __init__(self, input_size, hidden_size, seq_len):
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, input_size)
        self.hidden_size = hidden_size
        self.seq_len = seq_len

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: Tensor of shape (batch, seq, input_size) (``batch_first=True``).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Zero initial states, created directly on the input's device and dtype
        # so no host-to-device copy is needed on every call.
        state_shape = (1, x.size(0), self.hidden_size)
        h1 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c1 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        # Encode: per-step outputs plus the final (hidden, cell) state
        encoded, (hn, cn) = self.encoder(x, (h1, c1))

        # Decode the encoder outputs, starting from the encoder's final state
        decoded, _ = self.decoder(encoded, (hn, cn))

        # Project every step back to the input feature space
        return self.output_layer(decoded)

# Initialize the model (ensure the parameters match the saved model)
input_size = 25  # Must equal the number of columns the scaler/model were trained on (25 in the training column listing)
hidden_size = 256  # Must equal the hidden size used for training
seq_len = 1      # Stored on the model only; its forward pass does not read it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: this rebinds `model` to a fresh, untrained network; the weights are
# loaded into it in the next cell.
model = LSTM_Autoencoder(input_size=input_size, hidden_size=hidden_size, seq_len=seq_len).to(device)

In [46]:
# Load the saved weights. weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state_dict holds.
state_dict = torch.load(
    "/kaggle/working/lstm_autoencoder_ha.pth",
    map_location=device,
    weights_only=True,
)

# Checkpoints saved from an nn.DataParallel wrapper prefix every key with
# "module.". Strip it only where it is a leading prefix (repeatedly, in case
# the model was wrapped more than once) and leave other keys untouched.
prefix = "module."
new_state_dict = {}
for key, value in state_dict.items():
    new_key = key
    while new_key.startswith(prefix):
        new_key = new_key[len(prefix):]
    new_state_dict[new_key] = value

# Load the cleaned state_dict into the model (strict: keys must match exactly)
model.load_state_dict(new_state_dict)

# Set the model to evaluation mode
model.eval()


  state_dict = torch.load("/kaggle/working/lstm_autoencoder_ha.pth", map_location=device)


LSTM_Autoencoder(
  (encoder): LSTM(25, 256, batch_first=True)
  (decoder): LSTM(256, 256, batch_first=True)
  (output_layer): Linear(in_features=256, out_features=25, bias=True)
)

In [47]:
# Reload the per-sample reconstruction errors saved earlier and rebuild the
# same decision threshold from them.
reconstruction_errors = np.load("/kaggle/working/reconstruction_errors_ha.npy")

# 95th percentile of the saved errors: samples above it count as anomalies
threshold = np.percentile(reconstruction_errors, 95)
anomalies = reconstruction_errors > threshold

total_anomalies = anomalies.sum()
print(f"Total anomalies: {total_anomalies}")

Total anomalies: 129581


In [48]:
threshold

0.00016964174574241042

In [49]:
from joblib import load

# Load the fitted StandardScaler saved earlier in this notebook.
# NOTE(review): joblib files are pickle-based; only load files you produced
# yourself.
sc = load('/kaggle/working/standardscaler_ha.joblib')
sc

In [51]:
import numpy as np
import torch

# Replace with actual new data: one row per sample, 25 values per row in the
# same column order the scaler was fitted on.
new_data = np.array([[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]])

# Transform new data using the StandardScaler fitted on the training data
new_data = sc.transform(new_data)

# Convert to tensor and insert a length-1 sequence axis after the batch axis,
# exactly as the training loop does, so that several rows are scored as
# independent samples rather than as one multi-step sequence.
new_data_tensor = torch.tensor(new_data, dtype=torch.float32).unsqueeze(1).to(device)
# Shape: (batch_size=n_rows, sequence_length=1, input_size=25)

# Forward pass through the model
model.eval()
with torch.no_grad():
    new_data_reconstructed = model(new_data_tensor)
    # Mean squared reconstruction error over everything that was passed in
    reconstruction_error = torch.mean((new_data_tensor - new_data_reconstructed) ** 2).item()

# Compare the error with the threshold
# NOTE(review): the threshold was derived from errors computed under autocast
# during evaluation, while this pass runs without it - confirm the two are
# comparable.
is_anomaly = reconstruction_error > threshold
print(f"Reconstruction Error: {reconstruction_error}, Anomaly: {is_anomaly}")


Reconstruction Error: 100.18390655517578, Anomaly: True
