In [None]:
from torch.utils.data import Dataset
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Subset
from sklearn.model_selection import train_test_split
import torch.multiprocessing as mp
import pandas as pd
import zipfile
import os
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset archive stored there can be read by the next cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Unpack the dataset archive from the mounted Drive into /content.
archive = zipfile.ZipFile('/content/drive/MyDrive/bgu-i-know-what-you-did-last-measurement-time.zip', 'r')
try:
    archive.extractall('/content')
finally:
    # Always release the file handle, even if extraction fails.
    archive.close()

**Data Set Implementation**

In [None]:
class SequenceDataSet(Dataset):
    """Dataset of measurement sequences listed in an index CSV.

    The index CSV must provide a ``Sequence_Path`` column (path of a per-sample
    CSV file) and a ``Label`` column. A sample CSV either has exactly three
    columns, which are used as-is, or has a ``measurement type`` column, in
    which case only the ``'acceleration [m/s/s]'`` rows are kept and that
    column is dropped.

    Every sample is returned as a ``float32`` tensor with exactly ``max_len``
    rows: shorter sequences are zero-padded at the end, longer ones are
    truncated.

    Args:
        csv_file: Path to the index CSV.
        train: When true, Gaussian noise with standard deviation ``noise_std``
            is added to the real (un-padded) rows of every sample.
        max_len: Number of time steps in every returned sample.
        noise_std: Standard deviation of the augmentation noise.
    """

    def __init__(self, csv_file, train=True, max_len=4000, noise_std=0.01):
        self.sequences = pd.read_csv(csv_file)
        self.data = self.sequences['Sequence_Path']
        self.targets = self.sequences['Label']
        self.noise_std = noise_std
        self.padding = max_len  # fixed output length, in time steps
        self.train = train

    def __len__(self):
        """Return the number of samples listed in the index CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(tensor, label)``."""
        # Positional lookup, so idx is a row number whatever the Series index is.
        sequence_path = self.data.iloc[idx]
        label = self.targets.iloc[idx]
        sequence = pd.read_csv(sequence_path)

        if len(sequence.columns) != 3:
            # Mixed-measurement file: keep the acceleration rows only.
            sequence = sequence[sequence['measurement type'] == 'acceleration [m/s/s]']
            sequence = sequence.drop(columns='measurement type')

        # Convert once to float32 so noise, padding and output share one dtype.
        seq_tens = torch.tensor(sequence.values, dtype=torch.float32)

        # Over-long sequences are cut instead of producing a negative pad size.
        seq_tens = seq_tens[:self.padding]

        if self.train:
            # Noise is added before padding, so padded rows stay exactly zero.
            seq_tens = seq_tens + torch.randn_like(seq_tens) * self.noise_std

        missing = self.padding - seq_tens.shape[0]
        if missing > 0:
            tail = torch.zeros((missing, seq_tens.size(1)), dtype=seq_tens.dtype)
            seq_tens = torch.cat([seq_tens, tail], dim=0)

        return seq_tens, label

**Data Loaders**

In [None]:
# Two views over the same index file: noise augmentation is enabled only for
# the samples used for training, so validation loss is measured on clean data.
dataset = SequenceDataSet('/content/train_data.csv')
eval_dataset = SequenceDataSet('/content/train_data.csv', train=False)

# Fraction of the samples held out for validation.
val_size = 0.2

# A fixed random_state keeps the split identical across runs.
train_indices, val_indices = train_test_split(list(range(len(dataset))), test_size=val_size, random_state=42)

train_dataset = Subset(dataset, train_indices)
val_dataset = Subset(eval_dataset, val_indices)

batch_size = 32
shuffle_train = True
shuffle_val = False
num_workers = 2  # declared for reference; not passed to the loaders below

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle_train)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle_val)

**Train and Test Functions**

In [None]:
def train_step(model, dataloader, optimizer, criterion, device='cuda'):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to train; it is moved to ``device`` and put in train mode.
        dataloader: Iterable of ``(sequence, target)`` tensor batches that
            supports ``len()``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Callable ``criterion(logits, target)`` returning a scalar
            loss tensor.
        device: Device the model and every batch are moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches,
        or ``0.0`` when the dataloader yields no batches.
    """
    model.to(device)
    model.train()
    running_loss = 0.0
    for sequence, target in dataloader:
        sequence, target = sequence.to(device), target.to(device)
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        logits = model(sequence)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    num_batches = len(dataloader)
    return running_loss / num_batches if num_batches else 0.0
def test(model, dataloader, criterion, device='cuda'):
    avg_acc, avg_loss = 0, 0
    model.to(device)
    for seq,target in dataloader:
        seq,target = seq.to(device),target.to(device)
        logits = model(seq)
        loss = criterion(logits, target)
        avg_loss += loss.item()


    return avg_loss / len(dataloader)

**GRU Model**

In [None]:
class GRUClassifier(nn.Module):
    """Stacked GRU followed by a linear layer applied to the last time step.

    Args:
        inputdim: Number of features per time step.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of output logits.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout value forwarded to ``nn.GRU``.

    ``forward`` takes a batch-first tensor ``(batch, time, inputdim)`` and
    returns ``(batch, output_dim)`` logits.
    """

    def __init__(self, inputdim, hidden_dim, output_dim, num_layers, dropout=0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Sized from the constructor argument, not from a notebook-level global.
        self.gru = nn.GRU(inputdim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        # Zero initial hidden state, created directly on the input's device.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)
        out, _ = self.gru(x, h0)
        # Classify from the GRU output at the final time step.
        return self.fc(out[:, -1, :])

**Simple CNN Model**

In [None]:
class CNN1D(nn.Module):
    """Three 1-D convolutions with ReLU, one max-pool and a linear classifier.

    Args:
        input_dim: Number of input channels (features per time step).
        output_dim: Number of output logits.
        kernel_size: Kernel size shared by the three convolutions.
        stride: Stride shared by the three convolutions.
        padding: Padding shared by the three convolutions.
        seq_len: Number of time steps in every input sequence; it fixes the
            size of the final linear layer.

    ``forward`` takes ``(batch, seq_len, input_dim)`` and returns
    ``(batch, output_dim)`` logits.
    """

    def __init__(self, input_dim, output_dim, kernel_size=3, stride=1, padding=1, seq_len=4000):
        super().__init__()
        self.conv1d = nn.Conv1d(in_channels=input_dim, out_channels=64, kernel_size=kernel_size, stride=stride, padding=padding)
        self.relu = nn.ReLU()
        self.conv1d2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, stride=stride, padding=padding)
        self.conv1d3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=kernel_size, stride=stride, padding=padding)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # Track the sequence length through the three convolutions and the
        # pool so the linear layer matches for any kernel/stride/padding.
        conv_len = seq_len
        for _ in range(3):
            conv_len = (conv_len + 2 * padding - kernel_size) // stride + 1
        self.fc1 = nn.Linear(256 * (conv_len // 2), output_dim)

    def forward(self, x):
        # Conv1d expects (batch, channels, time); inputs arrive as (batch, time, channels).
        x = torch.transpose(x, 1, 2)
        x = self.relu(self.conv1d(x))
        x = self.relu(self.conv1d2(x))
        x = self.relu(self.conv1d3(x))
        x = self.pool(x)
        x = torch.flatten(x, 1)  # one flat feature vector per sample
        return self.fc1(x)

**Improved CNN**

In [None]:
class CNN1DIMP(nn.Module):
    """1-D CNN with batch normalisation after each convolution and dropout before the classifier.

    Args:
        input_dim: Number of input channels (features per time step).
        output_dim: Number of output logits.
        kernel_size: Kernel size shared by the three convolutions.
        stride: Stride shared by the three convolutions.
        padding: Padding shared by the three convolutions.
        dropout_prob: Dropout probability applied to the flattened features.
        seq_len: Number of time steps in every input sequence; it fixes the
            size of the final linear layer.

    ``forward`` takes ``(batch, seq_len, input_dim)`` and returns
    ``(batch, output_dim)`` logits.
    """

    def __init__(self, input_dim, output_dim, kernel_size=3, stride=1, padding=1, dropout_prob=0.5, seq_len=4000):
        super().__init__()
        self.conv1d = nn.Conv1d(in_channels=input_dim, out_channels=64, kernel_size=kernel_size, stride=stride, padding=padding)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv1d2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, stride=stride, padding=padding)
        self.bn2 = nn.BatchNorm1d(128)
        self.conv1d3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=kernel_size, stride=stride, padding=padding)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # Track the sequence length through the three convolutions and the
        # pool so the linear layer matches for any kernel/stride/padding.
        conv_len = seq_len
        for _ in range(3):
            conv_len = (conv_len + 2 * padding - kernel_size) // stride + 1
        self.fc1 = nn.Linear(256 * (conv_len // 2), output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        # Conv1d expects (batch, channels, time); inputs arrive as (batch, time, channels).
        x = torch.transpose(x, 1, 2)
        x = F.relu(self.bn1(self.conv1d(x)))
        x = F.relu(self.bn2(self.conv1d2(x)))
        x = F.relu(self.bn3(self.conv1d3(x)))
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        return self.fc1(x)

**Self-Supervised**

**Auto Encoder**

In [None]:
# Minimal autoencoder: one linear layer down to the latent space, one back up.
class Autoencoder(nn.Module):
    """Single-layer encoder/decoder pair.

    The encoder is ``Linear(input_size, latent_size)`` followed by ReLU; the
    decoder is ``Linear(latent_size, input_size)`` followed by Sigmoid.
    ``forward`` returns the decoder output for the encoded input.
    """

    def __init__(self, input_size, latent_size):
        super().__init__()
        # Encoder layers are created first, then the decoder layers.
        compress = nn.Linear(input_size, latent_size)
        self.encoder = nn.Sequential(compress, nn.ReLU())
        expand = nn.Linear(latent_size, input_size)
        # Sigmoid on the output layer, used as the reconstruction activation.
        self.decoder = nn.Sequential(expand, nn.Sigmoid())

    def forward(self, x):
        return self.decoder(self.encoder(x))

**Simple GRU Training**

In [None]:
# Hyperparameters of the baseline GRU classifier.
input_dim = 3
hidden_dim = 32
output_dim = 18
num_layers = 5
num_epochs = 10
learning_rate = 0.001

# Fall back to the CPU when no GPU runtime is attached instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GRUClassifier(input_dim, hidden_dim, output_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_loss, v_loss = [], []

for epoch in range(num_epochs):
    # One optimisation pass over the training set.
    epoch_loss = train_step(model, train_dataloader, optimizer, criterion, device=device)
    # Measure both splits in eval mode; no_grad avoids building autograd graphs.
    model.eval()
    with torch.no_grad():
        loss = test(model, train_dataloader, criterion, device=device)
        val_loss = test(model, val_dataloader, criterion, device=device)
    train_loss.append(loss)
    v_loss.append(val_loss)
    print(f'Epoch [{epoch+1}], Training Loss: {loss:.4f}, Validation Loss: {val_loss:.4f}')

**Learning Curve for simple GRU**

In [None]:
# One x value per recorded epoch, so the plot follows whatever num_epochs was used.
epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, v_loss, label='Validation Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.show()

**Simple CNN Training**

In [None]:

num_epochs = 10
learning_rate = 0.001
# Length of every padded sample (SequenceDataSet pads to 4000 rows); the CNN's
# linear layer is sized for this length.
seq_len = 4000

# Fall back to the CPU when no GPU runtime is attached instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CNN1D(3,18)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_loss, v_loss = [], []

for epoch in range(num_epochs):
    # One optimisation pass over the training set.
    epoch_loss = train_step(model, train_dataloader, optimizer, criterion, device=device)
    # Measure both splits in eval mode; no_grad avoids building autograd graphs.
    model.eval()
    with torch.no_grad():
        loss = test(model, train_dataloader, criterion, device=device)
        val_loss = test(model, val_dataloader, criterion, device=device)
    train_loss.append(loss)
    v_loss.append(val_loss)
    print(f'Epoch [{epoch+1}], Training Loss: {loss:.4f}, Validation Loss: {val_loss:.4f}')

**Learning Curve for simple CNN**

In [None]:
# One x value per recorded epoch, so the plot follows whatever num_epochs was used.
epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, v_loss, label='Validation Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.show()

**Improved GRU Training**

In [None]:
# Same GRU architecture as the baseline, trained longer with a decaying learning rate.
input_dim = 3
hidden_dim = 32
output_dim = 18
num_layers = 5
num_epochs = 20
learning_rate = 0.001

# Fall back to the CPU when no GPU runtime is attached instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GRUClassifier(input_dim, hidden_dim, output_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.99 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)

train_loss, v_loss = [], []

for epoch in range(num_epochs):
    # One optimisation pass over the training set.
    epoch_loss = train_step(model, train_dataloader, optimizer, criterion, device=device)
    # Measure both splits in eval mode; no_grad avoids building autograd graphs.
    model.eval()
    with torch.no_grad():
        loss = test(model, train_dataloader, criterion, device=device)
        val_loss = test(model, val_dataloader, criterion, device=device)
    train_loss.append(loss)
    v_loss.append(val_loss)
    scheduler.step()
    print(f'Epoch [{epoch+1}], Training Loss: {loss:.4f}, Validation Loss: {val_loss:.4f}')

**Learning Curve for Improved GRU**

In [None]:
# One x value per recorded epoch, so the plot follows whatever num_epochs was used.
epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, v_loss, label='Validation Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.show()

**Improved CNN Training**

In [None]:
num_epochs = 10
learning_rate = 0.001
# Length of every padded sample (SequenceDataSet pads to 4000 rows); the CNN's
# linear layer is sized for this length.
seq_len = 4000

# Fall back to the CPU when no GPU runtime is attached instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CNN1DIMP(3,18)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_loss, v_loss = [], []

for epoch in range(num_epochs):
    # One optimisation pass over the training set.
    epoch_loss = train_step(model, train_dataloader, optimizer, criterion, device=device)
    # Measure both splits in eval mode; no_grad avoids building autograd graphs.
    model.eval()
    with torch.no_grad():
        loss = test(model, train_dataloader, criterion, device=device)
        val_loss = test(model, val_dataloader, criterion, device=device)
    train_loss.append(loss)
    v_loss.append(val_loss)
    print(f'Epoch [{epoch+1}], Training Loss: {loss:.4f}, Validation Loss: {val_loss:.4f}')

**Learning Curve For Improved CNN**

In [None]:
# One x value per recorded epoch, so the plot follows whatever num_epochs was used.
epochs = range(1, len(train_loss) + 1)

plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, v_loss, label='Validation Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.show()

**Self Supervised Training**

In [None]:
# Define hyperparameters
input_size = 3  # Dimensionality of each time step in the sequence
latent_size = 2  # Dimensionality of the latent space
batch_size = 32
learning_rate = 0.001
num_epochs = 10

# Fall back to the CPU when no GPU runtime is attached instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Create the datasets: 80% of the samples for training, the rest for validation.
dataset = SequenceDataSet('/content/train_data.csv')
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# A seeded generator keeps the split identical on every run.
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42)
)

# Create data loaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize model, loss function, and optimizer
model = Autoencoder(input_size, latent_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)

# Training loop: the reconstruction target is the input batch itself.
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch, _ in train_dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon_batch = model(batch)
        loss = criterion(recon_batch, batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Each accumulated value is already a per-batch mean, so average over batches.
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss / max(len(train_dataloader), 1)}")

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch, _ in val_dataloader:
            batch = batch.to(device)
            recon_batch = model(batch)
            val_loss += criterion(recon_batch, batch).item()

    # Report the validation loss itself (not the training loss).
    print(f"Epoch {epoch + 1}, Vall Loss: {val_loss / max(len(val_dataloader), 1)}")


# Save the encoder part of the trained autoencoder
torch.save(model.encoder.state_dict(), 'encoder.pth')

**Tuning the Classifier**

In [None]:
# Define hyperparameters for classifier training
num_classes = 18  # Number of classes for classification
classifier_learning_rate = 0.001
classifier_num_epochs = 10

# Fall back to the CPU when no GPU runtime is attached instead of failing on 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Rebuild the encoder architecture and load the weights saved after autoencoder training.
pretrained_encoder = nn.Sequential(
    nn.Linear(input_size, latent_size),
    nn.ReLU()
)
pretrained_encoder.load_state_dict(torch.load('encoder.pth', map_location=device))
pretrained_encoder.to(device)
pretrained_encoder.eval()

# The GRU consumes the encoder output, so each time step has latent_size features.
# Keep the notebook-level input_dim in line with that width for this cell.
input_dim = latent_size

# Initialize classifier model, loss function, and optimizer.
# NOTE(review): the original referenced `GRUModel`, which is not defined in this
# notebook; GRUClassifier with the same sizes (hidden 32, 5 layers) is used instead.
classifier_model = GRUClassifier(latent_size, 32, num_classes, 5)
classifier_criterion = nn.CrossEntropyLoss()
classifier_optimizer = optim.Adam(classifier_model.parameters(), lr=classifier_learning_rate)
val_loss_lst, train_loss_lst = [], []

# Training loop for classifier
classifier_model.to(device)
for epoch in range(classifier_num_epochs):
    total_loss = 0.0
    classifier_model.train()
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        # The encoder is a fixed feature extractor: only the classifier is optimised.
        with torch.no_grad():
            encoded = pretrained_encoder(inputs)
        classifier_optimizer.zero_grad()
        outputs = classifier_model(encoded)
        loss = classifier_criterion(outputs, labels)
        loss.backward()
        classifier_optimizer.step()
        total_loss += loss.item()
    # Each accumulated value is already a per-batch mean, so average over batches.
    epoch_train_loss = total_loss / max(len(train_dataloader), 1)
    train_loss_lst.append(epoch_train_loss)
    print(f"Classifier Epoch {epoch + 1}, Train Loss: {epoch_train_loss}")

    classifier_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            encoded = pretrained_encoder(inputs)
            outputs = classifier_model(encoded)
            # .item() stores plain floats instead of device tensors.
            val_loss += classifier_criterion(outputs, labels).item()
    epoch_val_loss = val_loss / max(len(val_dataloader), 1)
    val_loss_lst.append(epoch_val_loss)

    print(f"Epoch {epoch + 1}, Vall Loss: {epoch_val_loss}")

**Function for detecting good and bad classifications of the model**

In [None]:
input_dim = 3
hidden_dim = 32
output_dim = 18  # Number of classes
num_layers = 5

# `device` was never defined in this notebook; choose it here and fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GRUClassifier(input_dim, hidden_dim, output_dim, num_layers,0)

# Load the state dictionary from the .pth file
# NOTE(review): 'BasicGRU.pth' is not written by any cell visible in this notebook —
# confirm the checkpoint exists before running.
state_dict = torch.load('BasicGRU.pth', map_location=device)

# Load the state dictionary into the model
model.load_state_dict(state_dict)

model.to(device)
model.eval()

# bad:  [target, prediction, probability of the true class] for confident mistakes
# good: [target, probability of the true class] for confident hits
bad = []
good = []

# Iterate over the validation loader
with torch.no_grad():
    for inputs, targets in val_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        probabilities = F.softmax(outputs, dim=1)
        predictions = probabilities.argmax(dim=1)
        # Walk the actual batch size so the final, smaller batch is not skipped.
        for i in range(len(targets)):
            true_class_prob = probabilities[i][int(targets[i])]
            if predictions[i] != targets[i]:
                # Wrong prediction with almost no probability on the true class.
                if true_class_prob < 0.01:
                    bad.append([targets[i], predictions[i], true_class_prob])
            elif true_class_prob > 0.93:
                # Correct prediction made with high confidence.
                good.append([targets[i], true_class_prob])

**Plot Good Classification**

In [None]:
# Per-label tally of the confidently correct predictions; labels run from 0 to 17.
value_counts = dict.fromkeys(range(18), 0)

# The first element of every entry is the target label tensor.
for entry in good:
    value_counts[entry[0].item()] += 1

# Split the tally into x positions and bar heights.
values, counts = list(value_counts), list(value_counts.values())

# One bar per label
plt.bar(values, counts)
plt.title('Bar Plot for Each Value for Good Classification')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(range(18))  # one tick per label, 0 to 17
plt.show()

**Plot Bad Classification**

In [None]:
# Per-label tally of the confidently wrong predictions; labels run from 0 to 17.
value_counts = dict.fromkeys(range(18), 0)

# The first element of every entry is the target label tensor.
for entry in bad:
    value_counts[entry[0].item()] += 1

# Split the tally into x positions and bar heights.
values, counts = list(value_counts), list(value_counts.values())

# One bar per label
plt.bar(values, counts)
plt.title('Bar Plot for Each Label for Bad Classification')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(range(18))  # one tick per label, 0 to 17
plt.show()

**Classical ML**

**Data Set**

In [None]:
class SequenceDataSetML(Dataset):
    """Dataset that turns each sequence into a flat list of summary statistics.

    The index CSV must provide ``Sequence_Path`` and ``Label`` columns. A
    sample CSV either has exactly three columns, which are used as-is, or has
    a ``measurement type`` column, in which case only the
    ``'acceleration [m/s/s]'`` rows are kept and that column is dropped.

    Each sample is returned as ``(features, label)`` where ``features`` is a
    Python list: the sequence length, followed — for each of up to 10
    consecutive chunks of the sequence — by the mean and standard deviation of
    every column (mean then std, column by column).

    Args:
        csv_file: Path to the index CSV.
        train: When true (the default), Gaussian noise with standard deviation
            ``noise_std`` is added before the statistics are computed. Pass
            ``False`` for deterministic features.
        noise_std: Standard deviation of that noise.
    """

    def __init__(self, csv_file, train=True, noise_std=0.01):
        self.sequences = pd.read_csv(csv_file)
        self.data = self.sequences['Sequence_Path']
        self.targets = self.sequences['Label']
        self.noise_std = noise_std
        self.train = train

    def __len__(self):
        """Return the number of samples listed in the index CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(feature_list, label)``."""
        # Positional lookup, so idx is a row number whatever the Series index is.
        sequence_path = self.data.iloc[idx]
        label = self.targets.iloc[idx]
        sequence = pd.read_csv(sequence_path)

        if len(sequence.columns) != 3:
            # Mixed-measurement file: keep the acceleration rows only.
            sequence = sequence[sequence['measurement type'] == 'acceleration [m/s/s]']
            sequence = sequence.drop(columns='measurement type')

        # Explicit float dtype so integer-valued CSVs also support noise/mean/std.
        seq_tens = torch.tensor(sequence.values, dtype=torch.float64)

        if self.train:
            seq_tens = seq_tens + torch.randn_like(seq_tens) * self.noise_std

        seq_len = len(seq_tens)
        subtensor_stats = [seq_len]
        for subtensor in torch.chunk(seq_tens, 10, dim=0):
            means = subtensor.mean(dim=0).tolist()
            stds = subtensor.std(dim=0).tolist()
            # Interleave mean and std for every column of the chunk.
            for column_mean, column_std in zip(means, stds):
                subtensor_stats.append(column_mean)
                subtensor_stats.append(column_std)

        return subtensor_stats, label

**Data Loader**

In [None]:
# Feature dataset for the classical-ML experiments.
# NOTE(review): this path differs from the '/content/train_data.csv' used by the
# earlier cells — confirm the file exists on a fresh runtime.
dataset = SequenceDataSetML('/content/train_data (1).csv')

# Share of the samples held out for validation
val_size = 0.2

# Reproducible index split (fixed random_state)
sample_ids = list(range(len(dataset)))
train_indices, val_indices = train_test_split(sample_ids, test_size=val_size, random_state=42)

# Views over the same dataset restricted to each index set
train_dataset, val_dataset = Subset(dataset, train_indices), Subset(dataset, val_indices)

# Loader settings
batch_size = 32
shuffle_train = True
shuffle_val = False
num_workers = 2

# Batched iterators over the two splits
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=shuffle_train)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=shuffle_val)

**Feature Extraction**

In [None]:
features_list_train, labels_list_train = [], []
features_list_val, labels_list_val = [], []

# Each batch of features arrives as one entry per feature position, hence the
# transpose to get one row per sample.
for sequence_features, target in train_dataloader:
    features_list_train.append(np.array(sequence_features).T)  # Convert features to NumPy array
    labels_list_train.append(np.array(target).T)  # Convert labels to NumPy array

# Keep every batch, including the final (possibly smaller) one, so the arrays
# cover the whole split.
features_array_train = np.concatenate(features_list_train, axis=0)
labels_array_train = np.concatenate(labels_list_train, axis=0)

for sequence_features, target in val_dataloader:
    features_list_val.append(np.array(sequence_features).T)  # Convert features to NumPy array
    labels_list_val.append(np.array(target).T)  # Convert labels to NumPy array

features_array_val = np.concatenate(features_list_val, axis=0)
labels_array_val = np.concatenate(labels_list_val, axis=0)

**Logistic Regression Model**

In [None]:
# Standardise the features, then fit an L2-regularised logistic regression.
pipeline = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', solver='liblinear', multi_class='auto'))

pipeline.fit(features_array_train, labels_array_train)
y_hat_train = pipeline.predict(features_array_train)
y_hat_val = pipeline.predict(features_array_val)

# Count the hits, then normalise by the number of samples actually scored
# (the label arrays), not by the size of the underlying dataset.
true_train = (y_hat_train == labels_array_train).sum()
true_val = (y_hat_val == labels_array_val).sum()

print(f'Training Acc: {true_train/len(labels_array_train):.4f}, Validation Acc: {true_val/len(labels_array_val):.4f}')


**Random Forest Model**

In [None]:
# Random forest with 100 trees; the fixed random_state makes the fit repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(features_array_train, labels_array_train)
y_hat_train = model.predict(features_array_train)
y_hat_val = model.predict(features_array_val)

# Count the hits, then normalise by the number of samples actually scored
# (the label arrays), not by the size of the underlying dataset.
true_train = (y_hat_train == labels_array_train).sum()
true_val = (y_hat_val == labels_array_val).sum()

print(f'Training Acc: {true_train/len(labels_array_train):.4f}, Validation Acc: {true_val/len(labels_array_val):.4f}')


**XGB Model**

In [None]:
# `xgb` is used by this notebook but not imported by its import cell; bring it in here.
import xgboost as xgb

# Gradient-boosted trees with the library's default settings.
model = xgb.XGBClassifier()

model.fit(features_array_train, labels_array_train)
y_hat_train = model.predict(features_array_train)
y_hat_val = model.predict(features_array_val)

# Count the hits, then normalise by the number of samples actually scored
# (the label arrays), not by the size of the underlying dataset.
true_train = (y_hat_train == labels_array_train).sum()
true_val = (y_hat_val == labels_array_val).sum()

print(f'Training Acc: {true_train/len(labels_array_train):.4f}, Validation Acc: {true_val/len(labels_array_val):.4f}')
