# TOM pattern recognition : Deep Time-Series Clustering

```
Author: Gcinizwe Dlamini
```
<hr>

```
The notebook contains the following main sections : 
  1. Retrieve the data
  2. Set subsequence size m
  3. Define Auto encoder model 
  4. Train model 

Main libraries used :     
- torch
```

## Import libraries

In [26]:
import torch 
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Training device : {device}')

Training device : cpu


In [2]:
from utils import get_data
import numpy as np
from scipy import stats
import plotly.express as px
import pandas as pd
from sklearn.cluster import KMeans

## Retrieve dataset


In [3]:
all_data = get_data(target_metrics=['total_removed', 'total_added', 'total_changed'])

total filtered repos : 464
Max commits in data : 1838


## Create Subsequences & Scale data

In [6]:
def calc_seq_stats(T, t_stamps=None):
    # get timestamp stats
    sorted_t_stamps_diff = np.diff(np.sort(t_stamps))/np.timedelta64(1, 'h')
    tstamps_min, tstamps_max, tstamps_mode = sorted_t_stamps_diff.min(), sorted_t_stamps_diff.max(), stats.mode(sorted_t_stamps_diff)[0][0]
    tstamps_mean, tstamps_std = sorted_t_stamps_diff.mean(), sorted_t_stamps_diff.std()
    
    return np.array([np.mean(T), np.std(T), stats.mode(T)[0][0], np.min(T), np.max(T),
                     tstamps_min, tstamps_max, tstamps_mode, tstamps_mean, tstamps_std])

In [7]:
window_size = 10

def generate_subsequences(all_data, window_size = 10):
    X = []
    Stats = []
    indx_repo_map = {}
    i = 0
    for key, val in all_data.items():
        temp = np.stack([val[metric] for metric in ['total_removed', 'total_added', 'total_changed']])
        tstamp = val.get('time_stamps')
        start = 1
        end = start + window_size
        while temp.shape[1] > end :
            X.append(temp[:,start:end])
            res = [calc_seq_stats(temp[j,start:end], tstamp[start:end]) for j in range(temp.shape[0])]
            Stats.append(res)
            start += 1
            end += 1
            indx_repo_map[i] = key
            i += 1
    
    return np.array(X), indx_repo_map, Stats

xtrain, indx_repo_map, S = generate_subsequences(all_data, window_size=window_size)

scaler = MinMaxScaler()
xtrain_std = scaler.fit_transform(xtrain.reshape(-1, window_size)).reshape(-1,3, window_size)

## Create Tensor Dataset and Loader

In [8]:
batch_size = 128 
x_tensor = torch.from_numpy(xtrain_std).float()

# Create dataset and Data Loader
dataset = torch.utils.data.TensorDataset(x_tensor,x_tensor)
trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

## Define model Architecture 

In [None]:
#!g1.1
class Simple1DModel(torch.nn.Module):
    def __init__(self):
        super(Simple1DModel, self).__init__()
        self.encoder = torch.nn.Sequential(torch.nn.Conv1d(in_channels=3, out_channels=3, kernel_size=3, stride=1), 
                                           torch.nn.Tanh(),
                                           torch.nn.Flatten(),
                                           torch.nn.Linear(24, 10)
                                          )
        self.decoder = torch.nn.Sequential(torch.nn.Tanh(),
                                           torch.nn.ConvTranspose1d(in_channels=1, out_channels=3, kernel_size=3, stride=1),
                                           torch.nn.Tanh(),
                                           #torch.nn.BatchNorm1d(3),
                                           torch.nn.Conv1d(in_channels=3, out_channels=3, kernel_size=3, stride=1)
                                          )
    def forward(self, x):
        encoded = self.encoder(x)
        x = encoded.unsqueeze(1)
        decoded = self.decoder(x)
        
        return decoded

## Unsupervised Data-driven Automotive Diagnostics with Improved Deep Temporal Clustering

DeepCluster: A Deep Convolutional Auto-encoder with Embedded Clustering

The proposed approach embeds clustering algorithm Kmeans into a DCAE framework which is jointly optimized and trained in a fully unsupervised manner. The methods alternately learn effective feature representation and cluster assignment through DCAE.


In [18]:
class DCAE_Kmeans(torch.nn.Module):
    def __init__(self):
        super(DCAE_Kmeans, self).__init__()
        self.encoder = torch.nn.Sequential(torch.nn.Conv1d(in_channels=3, out_channels=15, kernel_size=3, stride=1, padding='same'), 
                                           torch.nn.LeakyReLU(),
                                           torch.nn.MaxPool1d(kernel_size=2),
                                           torch.nn.Conv1d(in_channels=15, out_channels=1, kernel_size=3, stride=1, padding='same'),
                                           torch.nn.LeakyReLU(),
                                           torch.nn.MaxPool1d(kernel_size=2)
                                          )
        self.decoder = torch.nn.Sequential(torch.nn.Upsample(scale_factor=2),
                                           torch.nn.Conv1d(in_channels=1, out_channels=15, kernel_size=3, stride=1, padding='same'),
                                           torch.nn.LeakyReLU(),
                                           torch.nn.Upsample(scale_factor=2.5),
                                           torch.nn.Conv1d(in_channels=15, out_channels=3, kernel_size=3, stride=1, padding='same'),
                                           torch.nn.LeakyReLU()
                                          )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        
        return decoded, encoded.squeeze()

## DCAE - Kmeans Loss function

In [19]:
from sklearn.cluster import KMeans
def clustering_loss(latent_represntation, Lambda=0.3):
    loss = KMeans().fit_transform(latent_represntation).sum()
    
    return Lambda * np.mean(loss**2)

In [24]:
n_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = DCAE_Kmeans().to(device)

DAEcriterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

cpu


In [25]:
model.train()
train_loss_hist = []
for epoch in range(1, n_epochs+1):
    running_loss = 0.0
    for i, (data, label) in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data.to(device), label.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass
        outputs, enc_data = model(inputs)
        
        # Calculate the loss
        DAEloss = DAEcriterion(outputs, labels)
        kmensloss = clustering_loss(enc_data.cpu().detach().numpy())
        
        loss = DAEloss + kmensloss
        
        # backward pass
        loss.backward()
        
        # optimize
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        #if i % 100 == 0:
            #print(f'Batch : {i+1} , Loss : {running_loss:.10f}')
    print(f'Epoch : {epoch} , Loss : {running_loss/len(trainloader):.10f}')
    train_loss_hist.append(running_loss/len(trainloader))
   

Epoch : 1 , Loss : 0.0072275710
Epoch : 2 , Loss : 0.0008240599
Epoch : 3 , Loss : 0.0012509177
Epoch : 4 , Loss : 0.0011560170
Epoch : 5 , Loss : 0.0010756677
Epoch : 6 , Loss : 0.0020573153
Epoch : 7 , Loss : 0.0023036032
Epoch : 8 , Loss : 0.0020752278
Epoch : 9 , Loss : 0.0025694313
Epoch : 10 , Loss : 0.0029757300


## Define training parameters

In [None]:
#!g1.1
n_epochs = 10
model = Simple1DModel().to(device)

criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

## Train Model

In [12]:
#!g1.1
import warnings
warnings.filterwarnings("ignore") 

In [None]:
#!g1.1
model.train()
train_loss_hist = []
for epoch in range(1, n_epochs+1):
    running_loss = 0.0
    for i, (data, label) in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data.to(device), label.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 0:
            print(f'Batch : {i+1} , Loss : {running_loss:.10f}')
    print(f'Epoch : {epoch} , Loss : {running_loss/len(trainloader):.10f}')
    train_loss_hist.append(running_loss/len(trainloader))
        

## Train Loss plot

In [None]:
#!g1.1
fig = px.line(x=np.arange(len(train_loss_hist)), y=train_loss_hist)
fig.update_traces(mode="markers+lines")
fig.update_layout(title="Train loss Plot")
fig.show()

## Cluster the latent space of AE

In [None]:
#!g1.1

model.eval()
encoded_data = model.encoder(x_tensor.to(device)).cpu().detach().numpy()

kmeans = KMeans(3)
cluster_labels = kmeans.fit_predict(encoded_data)

np.unique(cluster_labels,return_counts=True)

In [None]:
#!g1.1
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(encoded_data)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
#!g1.1
fig = px.line(x=K, y=Sum_of_squared_distances)
fig.update_traces(mode="markers+lines")
fig.update_layout(title="Elbow Method")
fig.show()

In [None]:
#!g1.1
Sum_of_squared_distances

In [None]:
#!g1.1
