# TOM pattern recognition from time series data 

```
Author: Gcinizwe Dlamini
```
<hr>

```
The notebook contains the following main sections : 
  1. Data preprocessing
  2. EDA
  3. Profile matrix calculation
  4. Conclusions

Main libraries used :     
- stumpy
- matrixprofile
- plotly (for visualization)
```

## 1. Install the necessary libraries

In [None]:
# Uncomment to install the dependencies into the active kernel's environment.
# %pip install scipy==1.5
# %pip install stumpy
# %pip install matrixprofile

## 2. Import libraries

In [None]:
import numpy as np
import pandas as pd
import zipfile

# for data visualization
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# For data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

# For matrix profile calculations
import stumpy

## 3. Read the data

In [None]:
# Columns kept from the commits CSV; `commit_datetime` becomes the index.
cols_to_select = ['commit_datetime', 'full_name', 'total_files', 'total_added','total_removed','total_changed']

# Open the archive and the CSV member through context managers so both
# handles are closed as soon as the frame has been read (the previous
# version left the ZipFile and the member stream open).
with zipfile.ZipFile('../Data/sample_data.zip') as zf:
    with zf.open('commits_data.csv') as commits_csv:
        repository_hist_df = pd.read_csv(
            commits_csv,
            usecols=cols_to_select,
            parse_dates=['commit_datetime'],
            index_col='commit_datetime',
        )
repository_hist_df.head()

In [None]:
# Distinct repository names present in the loaded commit history.
repository_hist_df['full_name'].unique()

In [None]:
# NOTE: this rebinds `repository_hist_df`, replacing the frame that the
# earlier cell read from the zip archive.
cols_to_select = [
    'commit_datetime', 'full_name',
    'total_files', 'total_added', 'total_removed', 'total_changed',
]
data_path = '../Data/repositories.csv'
repository_hist_df = pd.read_csv(
    data_path,
    usecols=cols_to_select,
    parse_dates=['commit_datetime'],
    index_col='commit_datetime',
)
repository_hist_df.head()

## 4. Sample one repository for visualisation and experimentation

In [None]:
repo_to_select = 'spring-projects/spring-boot'

# Short names T1..T4 for the four per-commit counters.
col_mapping = {
    'total_files': 'T1',
    'total_added': 'T2',
    'total_removed': 'T3',
    'total_changed': 'T4',
}

# Keep only the chosen repository, order its commits by the datetime index,
# then drop the now-constant name column and apply the short column names.
df = (
    repository_hist_df
    .query('full_name == @repo_to_select')
    .sort_index()
    .drop('full_name', axis=1)
    .rename(col_mapping, axis=1)
)

# Replace the datetime index with a positional commit number 0..n-1.
df.index = np.arange(len(df))
df.head()

## 5. Scale the data for easy calculation of distance

In [None]:
# Fit a MinMaxScaler on every column of `df` and write the transformed
# values back into the same columns.
scaler = MinMaxScaler()
feature_cols = list(df.columns)
scaled_values = scaler.fit_transform(df[feature_cols].values)
df[feature_cols] = scaled_values
df.head()

In [None]:
# One stacked panel per column, sharing the x axis with no vertical gap.
n_series = df.shape[1]
fig, axs = plt.subplots(
    n_series, 1,
    sharex=True,
    figsize=(15, 10),
    gridspec_kw={'hspace': 0},
)
plt.suptitle(f'{repo_to_select}')

# Panels are filled by name (T1, T2, ...), independent of column order in `df`.
for position, ax in enumerate(axs, start=1):
    series_name = f'T{position}'
    ax.set_ylabel(series_name, fontsize='20')
    ax.set_xlabel('Commit number', fontsize ='20')
    ax.plot(df[series_name])
# plt.savefig('multiPlot.eps')
plt.show()

## 6. Calculation of matrix profiles

In [None]:
def calc_profile_matrix(df, m=30, tot_motifs=3):
    """Compute a 1-dimensional matrix profile for every column of ``df``.

    Each column is cast to float64 and passed to ``stumpy.stump`` with window
    length ``m``. For every column, the rounded minimum of the first column of
    the result is printed.

    Args:
        df: DataFrame whose columns are the individual time series.
        m: Subsequence (window) length handed to ``stumpy.stump``.
        tot_motifs: Number of lowest-valued positions to keep per column.

    Returns:
        A ``(mps, motifs_idx)`` tuple of dicts keyed by column name. ``mps``
        holds the array returned by ``stumpy.stump``; ``motifs_idx`` holds the
        ``tot_motifs`` positions with the smallest values in that array's
        first column. The positions come straight from an argsort, so they
        may be adjacent to one another.
    """
    profiles = {}
    motif_positions = {}

    for dim_name in df.columns:
        series = df[dim_name].astype(np.float64)
        stump_result = stumpy.stump(series, m)
        distances = stump_result[:, 0]  # first column of the stump output

        motif_distance = np.round(distances.min(), 1)

        profiles[dim_name] = stump_result
        motif_positions[dim_name] = np.argsort(distances)[:tot_motifs]
        print(f'The motif pair matrix profile value in {dim_name} is {motif_distance}')

    return profiles, motif_positions

## 7. Plotting the Motifs

In [None]:
def plot_data(df, mps, motifs_idx, m):
    """Plot every series above its matrix profile and highlight the motifs.

    The figure has ``2 * df.shape[1]`` stacked axes sharing the x axis: the
    first half shows each column of ``df``, the second half shows the first
    column of the matching entry of ``mps``. For every position in
    ``motifs_idx[column]`` the ``m`` samples starting there are redrawn in red
    and a dashed vertical line marks the start. The figure is saved as
    ``All_data_{m}.eps`` in the working directory, shown, and then closed.

    Args:
        df: DataFrame whose columns are the plotted time series.
        mps: Dict mapping column name to a 2-D array; column 0 is plotted.
        motifs_idx: Dict mapping column name to an iterable of start positions.
        m: Window length used for the highlighted segments and the file name.
    """
    n_dims = df.shape[1]
    fig, axs = plt.subplots(
        n_dims * 2, 1,
        sharex=True,
        figsize=(25, 15),
        gridspec_kw={'hspace': 0},
    )

    for k, dim_name in enumerate(df.columns):
        series_ax = axs[k]
        profile_ax = axs[k + n_dims]

        series_ax.set_ylabel(dim_name, fontsize='20')
        series_ax.plot(df[dim_name].iloc[:])
        series_ax.set_yticks([])

        profile_ax.plot(mps[dim_name][:, 0], c='green', linewidth=2)
        # Label the profile after the column it belongs to (T2 -> P2) instead
        # of its position in the loop, so series and profile labels still
        # match when the columns are not ordered T1, T2, ...
        profile_ax.set_ylabel(str(dim_name).replace('T', 'P'), fontsize='20')
        profile_ax.set_yticks([])

        for idx in motifs_idx[dim_name]:
            series_ax.plot(df[dim_name].iloc[idx:idx + m], c='red', linewidth=2)
            series_ax.axvline(x=idx, linestyle="dashed", c='black')

    plt.savefig(f'All_data_{m}.eps')
    plt.show()
    # Release the figure explicitly: this function is called once per window
    # size, and open figures otherwise accumulate on non-inline backends.
    plt.close(fig)

## Finding more than two motifs per time series

In [None]:
# For each window length, compute the per-column matrix profiles, keep the
# four lowest-valued positions per column, and plot the result.
window_sizes = [5, 7, 10, 14, 30]
for m in window_sizes:
    mps, motifs_idx = calc_profile_matrix(df, m, tot_motifs=4)
    plot_data(df, mps, motifs_idx, m)

## 8. Multi-dimensional Matrix Profiles

In [None]:
# Subsequence (window) length used for the multi-dimensional matrix profile.
m = 7
# NOTE: rebinds `mps`, which the previous section used for a dict of
# 1-dimensional profiles. Here `mps` and `indices` are the two arrays returned
# by stumpy.mstump; the cells below index both as [k, position] (see the
# stumpy documentation for their exact definition).
mps, indices = stumpy.mstump(df, m)

The “$k$-dimensional motif” can then be found by locating the lowest value in the corresponding row of the multi-dimensional matrix profile, `mps`:

In [None]:
# Position of the minimum of every row of `mps` ...
motifs_idx = mps.argmin(axis=1)
# ... and, for each row, the entry of `indices` stored at that position.
row_numbers = np.arange(motifs_idx.shape[0])
nn_idx = indices[row_numbers, motifs_idx]

In [None]:
n_dims = mps.shape[0]
highlight_k = 2  # row of `mps` whose motif / neighbour positions get the dashed lines
dashed_line = dict(linestyle="dashed", c='black')
marker_style = dict(marker="v", markersize=10, color='black')

# Upper half of the figure: one panel per series; lower half: one per row of `mps`.
fig, axs = plt.subplots(
    n_dims * 2, 1,
    sharex=True,
    figsize=(25, 10),
    gridspec_kw={'hspace': 0},
)

for k, dim_name in enumerate(df.columns):
    series_ax, profile_ax = axs[k], axs[k + n_dims]
    motif_start, neighbour_start = motifs_idx[k], nn_idx[k]

    series_ax.set_ylabel(dim_name, fontsize='20')
    series_ax.plot(df[dim_name])
    series_ax.set_xlabel('Time', fontsize ='20')

    profile_ax.set_ylabel(dim_name.replace('T', 'P'), fontsize='20')
    profile_ax.plot(mps[k], c='orange')
    profile_ax.set_xlabel('Time', fontsize ='20')

    # The same two dashed markers are drawn on every panel.
    for ax in (series_ax, profile_ax):
        ax.axvline(x=motifs_idx[highlight_k], **dashed_line)
        ax.axvline(x=nn_idx[highlight_k], **dashed_line)

    # Redraw the m-sample windows at this row's motif and neighbour in red.
    series_ax.plot(df[dim_name].iloc[motif_start : motif_start + m], c='red', linewidth=5)
    series_ax.plot(df[dim_name].iloc[neighbour_start : neighbour_start + m], c='red', linewidth=4)

    # Triangles sit one unit above the profile value so they do not cover the curve.
    profile_ax.plot(motif_start, mps[k, motif_start] + 1, **marker_style)
    profile_ax.plot(neighbour_start, mps[k, neighbour_start] + 1, **marker_style)

plt.show()

## Choosing the value of $k$ for multi-dimensional motifs

One approach is to turn this into a classic elbow/knee-finding problem: plot the minimum matrix profile value of each dimension against $k$ and look for the “turning point”.

In [None]:
# Value of every row of `mps` at that row's own minimum position, plotted
# against the zero-based row number k.
k_values = range(mps.shape[0])
min_profile_values = mps[k_values, motifs_idx]
plt.plot(min_profile_values, c='red', linewidth='4')
plt.xlabel('k (zero-based)', fontsize='20')
plt.ylabel('Matrix Profile Value', fontsize='20')
plt.xticks(k_values)
plt.show()

# Matrixprofile lib approach 

In [None]:
# Reload the commit history and select a different repository for this section.
cols_to_select = [
    'commit_datetime', 'full_name',
    'total_files', 'total_added', 'total_removed', 'total_changed',
]
data_path = '../Data/repositories.csv'
repository_hist_df = pd.read_csv(
    data_path,
    usecols=cols_to_select,
    parse_dates=['commit_datetime'],
    index_col='commit_datetime',
)

repo_to_select = 'brianchandotcom/liferay-portal'
col_mapping = {
    'total_files': 'T1',
    'total_added': 'T2',
    'total_removed': 'T3',
    'total_changed': 'T4',
}

# Filter to the chosen repository, sort by the datetime index, drop the
# name column and apply the short T1..T4 names. The datetime index is kept.
df = (
    repository_hist_df
    .query('full_name == @repo_to_select')
    .sort_index()
    .drop('full_name', axis=1)
    .rename(col_mapping, axis=1)
)
df.head()