# TOM pattern recognition : Deep Time-Series Clustering

```
Author: Gcinizwe Dlamini
```
<hr>

```
This notebook contains the following main sections:
  1. Retrieve the data
  2. Set the subsequence (window) size m
  3. Define the autoencoder model
  4. Train the model
  5. Cluster the latent space

Main libraries used :     
- torch
```

## Import libraries

In [None]:
#!g1.1
import torch 
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [None]:
#!g1.1
from utils import *
import numpy as np
from scipy import stats
import plotly.express as px
import pandas as pd
from sklearn.cluster import KMeans

## Retrieve dataset


In [None]:
#!g1.1
# Load the series for the three change metrics. get_data is not defined in this notebook;
# presumably it comes from the star-import of utils - TODO confirm.
# Later cells treat the result as a mapping: repo name -> record holding each metric
# plus a 'time_stamps' entry.
all_data = get_data(target_metrics=['total_removed', 'total_added', 'total_changed'])

In [None]:
#!g1.1
# Sanity check of np.diff: consecutive differences of 1..5, i.e. four ones.
np.diff([1,2,3,4,5])

In [None]:
#!g1.1
# How many gaps between consecutive sorted timestamps of this repo equal the most common gap.
# Gaps are expressed in hours (assumes 'time_stamps' holds numpy datetime64 values - TODO confirm).
# Field 1 of the ModeResult is the count; np.atleast_1d keeps the lookup working whether
# SciPy returns a length-1 array (older releases) or a plain scalar (1.11 and later).
np.atleast_1d(
    stats.mode(
        np.diff(np.sort(all_data.get('0xPrateek/Stardox').get('time_stamps'))) / np.timedelta64(1, 'h')
    )[1]
)[0]

## Create Subsequences & Scale data

In [None]:
#!g1.1
def _mode_value(values):
    """Return the modal value of a 1-D sequence as a scalar on any SciPy version."""
    # SciPy < 1.11 yields a length-1 array for 1-D input, newer releases a scalar;
    # np.atleast_1d makes both cases indexable.
    return np.atleast_1d(stats.mode(values).mode)[0]


def calc_seq_stats(T, t_stamps=None):
    """Summarise one window of values and the gaps between its timestamps.

    Parameters
    ----------
    T : array-like
        1-D sequence of values of a single metric.
    t_stamps : array-like
        Timestamps of the window (values whose differences can be divided by
        ``np.timedelta64``). Required despite the ``None`` default, and must
        hold at least two entries so that at least one gap exists.

    Returns
    -------
    numpy.ndarray
        Ten values, in this order: mean, std, mode, min, max of ``T``, then
        min, max, mode, mean, std of the gaps (in hours) between consecutive
        sorted timestamps.

    Raises
    ------
    ValueError
        If ``t_stamps`` is missing or contains fewer than two timestamps.
    """
    if t_stamps is None or np.asarray(t_stamps).size < 2:
        raise ValueError("calc_seq_stats needs at least two timestamps in t_stamps")

    # Gaps between consecutive timestamps, converted to (fractional) hours.
    gaps_h = np.diff(np.sort(t_stamps)) / np.timedelta64(1, 'h')

    value_stats = [np.mean(T), np.std(T), _mode_value(T), np.min(T), np.max(T)]
    gap_stats = [gaps_h.min(), gaps_h.max(), _mode_value(gaps_h), gaps_h.mean(), gaps_h.std()]
    return np.array(value_stats + gap_stats)

In [None]:
#!g1.1
# Length of every sliding window; reused below when generating and reshaping the subsequences.
window_size = 10

def generate_subsequences(all_data, window_size = 10,
                          metrics=('total_removed', 'total_added', 'total_changed')):
    """Cut every repository's metric series into overlapping fixed-length windows.

    Parameters
    ----------
    all_data : mapping
        Repo key -> record. Each record must provide one equal-length sequence
        per entry of ``metrics`` and a sliceable ``'time_stamps'`` entry.
    window_size : int
        Number of consecutive samples per window.
    metrics : sequence of str
        Record keys stacked as the channel axis, in this order. Defaults to
        the three metrics this function was originally hard-wired to.

    Returns
    -------
    X : numpy.ndarray
        Windows stacked as ``(n_windows, len(metrics), window_size)``; an empty
        1-D array when no series is long enough.
    indx_repo_map : dict
        Window index in ``X`` -> repo key the window was cut from.
    Stats : list
        One entry per window: a list with one ``calc_seq_stats`` result per metric.
    """
    X = []
    Stats = []
    indx_repo_map = {}
    for key, val in all_data.items():
        temp = np.stack([val[metric] for metric in metrics])
        tstamp = val.get('time_stamps')
        # NOTE(review): windows start at index 1 and stop while `end` is still
        # strictly inside the series, so neither the first nor the last sample
        # of a repo is ever part of a window. Kept as-is to leave the training
        # set unchanged - confirm whether dropping the last sample is intended.
        for start in range(1, temp.shape[1] - window_size):
            end = start + window_size
            # The new window's index equals the number of windows collected so far.
            indx_repo_map[len(X)] = key
            X.append(temp[:, start:end])
            Stats.append([calc_seq_stats(temp[j, start:end], tstamp[start:end])
                          for j in range(temp.shape[0])])

    return np.array(X), indx_repo_map, Stats

# Build the sliding windows (plus the window -> repo lookup and per-window stats).
xtrain, indx_repo_map, S = generate_subsequences(all_data, window_size=window_size)

# Flatten to rows of length window_size so that MinMaxScaler rescales each window
# position (column) independently, then restore the (n_windows, 3, window_size) layout.
scaler = MinMaxScaler().fit(xtrain.reshape(-1, window_size))
xtrain_std = scaler.transform(xtrain.reshape(-1, window_size)).reshape(-1, 3, window_size)

In [None]:
#!g1.1
# Statistics of the first window: one row per metric, ten statistics per row
# (the order is defined by calc_seq_stats).
np.array(S)[0]

## Create Tensor Dataset and Loader

In [None]:
#!g1.1
# Mini-batch size used by the loader below.
batch_size = 128

# Scaled windows as a float32 tensor.
x_tensor = torch.from_numpy(xtrain_std).float()

# Autoencoder setup: every sample serves as both input and reconstruction target.
dataset = torch.utils.data.TensorDataset(x_tensor, x_tensor)
trainloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

In [None]:
#!g1.1
# Expected layout: (n_windows, 3, window_size), as produced by the reshape after scaling.
x_tensor.shape

## Define model Architecture 

In [None]:
#!g1.1
class Simple1DModel(torch.nn.Module):
    """Small 1-D convolutional autoencoder with a dense latent vector.

    With ``L = window_size`` and ``C = n_channels`` the shapes are:

    * encoder: ``(B, C, L)`` -> Conv1d(k=3) -> ``(B, C, L-2)`` -> Flatten ->
      Linear -> ``(B, L)`` (the latent code has ``L`` entries)
    * decoder: ``(B, 1, L)`` -> ConvTranspose1d(k=3) -> ``(B, C, L+2)`` ->
      Conv1d(k=3) -> ``(B, C, L)``

    Parameters
    ----------
    window_size : int
        Length of the input windows. The default of 10 reproduces the original
        fixed ``Linear(24, 10)`` layer.
    n_channels : int
        Number of input (and reconstructed) channels; 3 by default.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than the convolution kernel (3).
    """
    def __init__(self, window_size=10, n_channels=3):
        super().__init__()
        if window_size < 3:
            raise ValueError("window_size must be at least 3 (the convolution kernel size)")

        # An unpadded kernel-3 convolution shortens the sequence by 2.
        flat_features = n_channels * (window_size - 2)
        self.encoder = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=n_channels, out_channels=n_channels, kernel_size=3, stride=1),
            torch.nn.Tanh(),
            torch.nn.Flatten(),
            torch.nn.Linear(flat_features, window_size)
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Tanh(),
            # Lengthens the sequence by 2; the following convolution shortens it back.
            torch.nn.ConvTranspose1d(in_channels=1, out_channels=n_channels, kernel_size=3, stride=1),
            torch.nn.Tanh(),
            torch.nn.Conv1d(in_channels=n_channels, out_channels=n_channels, kernel_size=3, stride=1)
        )

    def forward(self, x):
        """Encode ``x`` of shape ``(B, C, L)`` and return its reconstruction of the same shape."""
        encoded = self.encoder(x)
        # Re-introduce a channel axis so the latent vector can be fed to the 1-D decoder.
        x = encoded.unsqueeze(1)
        decoded = self.decoder(x)

        return decoded

## Unsupervised Data-driven Automotive Diagnostics with Improved Deep Temporal Clustering


In [None]:
#!g1.1
# Shape probe for the conv/pool autoencoder layout, run on one random sample of
# shape (batch=1, channels=3, length=10).
x_in = torch.randn(1, 3, 10)
# Encoder prototype: 'same'-padded convolutions keep the length, each MaxPool1d(2)
# halves it (rounding down), so the length goes 10 -> 5 -> 2 and z is (1, 1, 2).
layer = torch.nn.Sequential(torch.nn.Conv1d(in_channels=3, out_channels=15, kernel_size=3, stride=1, padding='same'),
                            torch.nn.LeakyReLU(),
                            torch.nn.MaxPool1d(kernel_size=2),
                            torch.nn.Conv1d(in_channels=15, out_channels=1, kernel_size=3, stride=1, padding='same'),
                            torch.nn.LeakyReLU(),
                            torch.nn.MaxPool1d(kernel_size=2)
                           )

z = layer(x_in)
print(z.shape)

# Decoder prototype; it is only constructed here, never applied in this cell.
d = torch.nn.Sequential(torch.nn.Upsample(scale_factor=2),
                      torch.nn.Conv1d(in_channels=1, out_channels=15, kernel_size=3, stride=1, padding='same'),
                      torch.nn.LeakyReLU(),
                      torch.nn.Upsample(scale_factor=2.5),
                      torch.nn.Conv1d(in_channels=15, out_channels=3, kernel_size=3, stride=1, padding='same'),
                      torch.nn.LeakyReLU()
                     )

# Instead of calling d(z), the decoder stages are replayed one by one with freshly
# created layers (activations omitted) to print the shape after each stage:
# length 2 -> 4 (x2 upsample) -> 4 (conv) -> 10 (x2.5 upsample) -> 10 (conv).
o1 = torch.nn.Upsample(scale_factor=2)(z)
print(o1.shape)
o2 = torch.nn.Conv1d(in_channels=1, out_channels=15, kernel_size=3, stride=1, padding='same')(o1)
print(o2.shape)
o3 = torch.nn.Upsample(scale_factor=2.5)(o2)
print(o3.shape)
o4 = torch.nn.Conv1d(in_channels=15, out_channels=3, kernel_size=3, stride=1, padding='same')(o3)
print(o4.shape)

In [None]:
#!g1.1
class Model2(torch.nn.Module):
    """Convolutional autoencoder built from conv/max-pool and upsample/conv stages.

    With ``L = window_size`` and ``C = n_channels``:

    * encoder: ``(B, C, L)`` -> conv(15) -> pool -> conv(1) -> pool ->
      ``(B, 1, (L // 2) // 2)``
    * decoder: x2 upsample -> conv(15) -> upsample to exactly ``L`` -> conv(C)
      -> LeakyReLU -> ``(B, C, L)``

    The last upsampling stage targets ``size=window_size`` rather than a fixed
    ``scale_factor=2.5``; for the default window of 10 this is the same 4 -> 10
    step, and it restores the input length for other window sizes as well.

    Parameters
    ----------
    window_size : int
        Length of the input windows (default 10).
    n_channels : int
        Number of input (and reconstructed) channels (default 3).

    Raises
    ------
    ValueError
        If ``window_size`` is below 4, where two poolings leave nothing to encode.
    """
    def __init__(self, window_size=10, n_channels=3):
        super().__init__()
        if window_size < 4:
            raise ValueError("window_size must be at least 4 to survive two MaxPool1d(2) stages")

        self.encoder = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=n_channels, out_channels=15, kernel_size=3, stride=1, padding='same'),
            torch.nn.LeakyReLU(),
            torch.nn.MaxPool1d(kernel_size=2),
            torch.nn.Conv1d(in_channels=15, out_channels=1, kernel_size=3, stride=1, padding='same'),
            torch.nn.LeakyReLU(),
            torch.nn.MaxPool1d(kernel_size=2)
        )
        self.decoder = torch.nn.Sequential(
            torch.nn.Upsample(scale_factor=2),
            torch.nn.Conv1d(in_channels=1, out_channels=15, kernel_size=3, stride=1, padding='same'),
            torch.nn.LeakyReLU(),
            # Resize straight to the input length instead of relying on a scale
            # factor that only matches window_size == 10.
            torch.nn.Upsample(size=window_size),
            torch.nn.Conv1d(in_channels=15, out_channels=n_channels, kernel_size=3, stride=1, padding='same'),
            torch.nn.LeakyReLU()
        )

    def forward(self, x):
        """Return the reconstruction of ``x``; input and output are both ``(B, C, L)``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)

        return decoded

## Define training parameters

In [None]:
#!g1.1
# Number of passes over the training data.
n_epochs = 10

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Model2 is the network being trained; Simple1DModel().to(device) is the alternative.
model = Model2()
model = model.to(device)

# Mean-squared reconstruction error, minimised with Adam.
criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

## Train Model

In [None]:
#!g1.1
import warnings
# Silence every warning emitted from this point on.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence warnings;
# consider narrowing it to the specific category that is noisy.
warnings.filterwarnings("ignore") 

In [None]:
#!g1.1
# Train the autoencoder and record the mean batch loss of every epoch.
model.train()
train_loss_hist = []
for epoch in range(1, n_epochs+1):
    running_loss = 0.0
    for i, (data, label) in enumerate(trainloader, 0):
        # Each batch is an (input, target) pair; for the autoencoder both hold the same windows.
        inputs, labels = data.to(device), label.to(device)

        # Gradients accumulate across backward() calls, so reset them for every batch.
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 0:
            # Report the running mean so the figure is comparable to the epoch
            # loss below; the raw sum grows with the number of batches seen.
            print(f'Batch : {i+1} , Loss : {running_loss/(i+1):.10f}')

    # Mean loss per batch over the whole epoch.
    epoch_loss = running_loss/len(trainloader)
    print(f'Epoch : {epoch} , Loss : {epoch_loss:.10f}')
    train_loss_hist.append(epoch_loss)
        

## Train Loss plot

In [None]:
#!g1.1
# Line-and-marker chart of the mean loss per epoch (x is the 0-based epoch index).
fig = (
    px.line(x=np.arange(len(train_loss_hist)), y=train_loss_hist)
    .update_traces(mode="markers+lines")
    .update_layout(title="Train loss Plot")
)
fig.show()

## Cluster the latent space of the autoencoder (AE)

In [None]:
#!g1.1

# Encode every window with the trained encoder and cluster the latent codes.
model.eval()
# Inference only: no_grad avoids building an autograd graph for the whole dataset.
with torch.no_grad():
    encoded_data = model.encoder(x_tensor.to(device)).cpu().numpy()

# KMeans expects a 2-D (n_samples, n_features) array. Model2's encoder ends in a
# pooling layer and returns (n_samples, 1, length), so flatten everything after the
# sample axis; an already 2-D latent (e.g. from Simple1DModel) is left unchanged.
encoded_data = encoded_data.reshape(len(encoded_data), -1)

# Fixed seed so that cluster assignments are reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_labels = kmeans.fit_predict(encoded_data)

# Cluster ids together with the number of windows assigned to each.
np.unique(cluster_labels,return_counts=True)

In [None]:
#!g1.1
# Elbow method: fit K-Means for k = 1..9 and collect the inertia
# (within-cluster sum of squared distances) of each fit.
K = range(1,10)
Sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k).fit(encoded_data)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
#!g1.1
# Inertia against the number of clusters; the bend of the curve suggests a suitable k.
fig = (
    px.line(x=K, y=Sum_of_squared_distances)
    .update_traces(mode="markers+lines")
    .update_layout(title="Elbow Method")
)
fig.show()

In [None]:
#!g1.1
# Raw inertia values behind the elbow plot, ordered by k = 1..9.
Sum_of_squared_distances

In [None]:
#!g1.1
