# TOM pattern recognition from time series data 

```
Author: Gcinizwe Dlamini
```
<hr>

```
The notebook contains the following main sections:
  1. Retrieve the data
  2. Set subsequence size m
  3. Find consensus pattern
  4. Match consensus pattern in all repositories
  

Main libraries used:
- stumpy
```

## Import utils and used libraries

In [None]:
from utils import *
import numpy as np
import plotly.express as px
import pandas as pd

## Read data 
* Full repository data
* Issues data 

Print the data statistics

In [None]:
# Metrics for which patterns are extracted further down in the notebook.
target_metrics = [
    "total_removed",
    "total_added",
    "total_changed",
]

# Load the data for the selected metrics, reporting progress around the call.
print("Reading Data...")
all_data = get_data(target_metrics=target_metrics)
print("Finished reading data...")

In [None]:
# Display how many entries were loaded into all_data.
len(all_data)

In [None]:
def get_patterns(target_metric, all_data):
    """Extract patterns for one metric and save their timestamps as JSON.

    For every repository in ``all_data`` the ``target_metric`` series is
    restricted to the entries whose timestamp is strictly earlier than one
    year before the repository's latest timestamp. Series that keep more than
    10 points are handed to ``approach1``. For every subsequence key and
    repository it reports, each pattern start index is translated into up to
    five formatted timestamps, and the nested result is written to
    ``../results/{target_metric}_result.json``.

    Args:
        target_metric: Key of the metric series inside each repository entry.
        all_data: Mapping of repository name to a mapping holding a
            ``'time_stamps'`` sequence and one sequence per metric.
            NOTE(review): assumes the sequences are array-like (element-wise
            comparison and integer-array indexing) and the timestamps are
            datetime-like objects -- confirm against ``get_data``.

    Returns:
        None. Nothing is written when no repository has enough data or when
        ``approach1`` returns an empty result.
    """
    Ts = []
    projects_names_map = {}
    for repo_name, repo_data in all_data.items():
        repo_timestamps = repo_data.get('time_stamps')
        last_commit = max(repo_timestamps)
        # Only Feb 29 has no counterpart one year earlier (replace(year=...)
        # would raise ValueError), so only that date is moved back a day.
        if last_commit.month == 2 and last_commit.day == 29:
            last_commit = last_commit.replace(day=28)
        cutoff = last_commit.replace(year=last_commit.year - 1)

        # Positions of the samples recorded strictly before the cutoff.
        mask = np.where(cutoff > repo_timestamps)[0]
        res = repo_data.get(target_metric)[mask]
        if len(res) > 10:
            # Remember which repository sits at this position in Ts.
            projects_names_map[len(Ts)] = repo_name
            Ts.append(res)

    # Without any usable series there is nothing to search for.
    if not Ts:
        return None

    p = len(Ts) * 0.25
    z = 4
    resultR, allPatterns = approach1(Ts, p, z, d=0.3333)

    if len(resultR) == 0:
        return None

    overall_result = {}
    for m, sub_data in allPatterns.items():
        subsequence_result = {}
        for ts_index, patterns_found in sub_data['patterns'].items():
            repo_name = projects_names_map[ts_index]
            repo_timestamps = all_data[repo_name]['time_stamps']
            # NOTE(review): the window length is fixed at 5 regardless of m,
            # and `start` indexes the unfiltered timestamps although the
            # patterns were presumably found on the filtered series -- confirm
            # both are intended.
            subsequence_result[repo_name] = {
                f'position {n}': [
                    datetime.strftime(ts, '%Y-%m-%d %H:%M:%S')
                    for ts in repo_timestamps[start:start + 5]
                ]
                for n, start in enumerate(patterns_found, 1)
            }
        overall_result[str(m)] = subsequence_result

    print('Saving result ...')
    with open(f'../results/{target_metric}_result.json', 'w', encoding='utf-8') as fp:
        json.dump(overall_result, fp)

In [None]:
# Run the pattern extraction once per metric; get_patterns writes a JSON
# result file for each metric for which patterns are found.
for metric in target_metrics:
    print(f'getting patterns for [{metric}]')
    get_patterns(metric, all_data)