# TOM pattern recognition from time series data 

```
Author: Gcinizwe Dlamini
```
<hr>

```
The notebook contains the following main sections:
  1. Retrieve the data
  2. Set subsequence size m
  3. Find consensus pattern
  4. Match consensus pattern in all repositories
  

Main libraries used:
- stumpy
```

## Import utils and used libraries

In [1]:
#!c1.8
# %pip install -r ../requirements.txt

In [2]:
#!c1.8
from utils import *
import numpy as np
import plotly.express as px
import pandas as pd
from datetime import datetime

## Read data 
* Full repository data
* Issues data 

Print the data statistics

In [10]:
def approach1(Ts, p, z, d=0.3333):
    """Find consensus patterns of several lengths and collect their matches.

    For every subsequence length ``m`` in 5..14 the consensus pattern of
    ``Ts`` is obtained from ``consensus_motif``. The pattern is then searched
    for in every series with ``search_pattern`` at nine progressively tighter
    distance thresholds (``radius / 1`` ... ``radius / 9``).

    Args:
        Ts: Sequence of time series; each element must support slicing.
        p: Exclusive upper bound on the number of matching series for the
            consensus pattern to be added to the returned set.
        z: Exclusive lower bound on the number of matches inside a single
            series for that series to count (the upper bound is fixed at 6).
        d: Unused; kept so existing callers keep working.

    Returns:
        A ``(R, all_patterns)`` tuple. ``R`` is the set of consensus patterns
        (as tuples) for which, at some threshold, the number of matching
        series ``k`` satisfied ``len(Ts) * 0.05 < k < p``. ``all_patterns``
        maps ``m`` -> ``'threshold-<value>'`` -> a dict with the keys
        ``'consensus_pattern'``, ``'patterns'`` (series index -> column 1 of
        the ``search_pattern`` result) and ``'percentage'``, for every
        threshold at which between 5% and 20% of the series matched.
    """
    R = set()
    all_patterns = {}
    n_series = len(Ts)
    if n_series == 0:
        # Nothing to analyse; avoid calling consensus_motif on empty input.
        return R, all_patterns

    for m in np.arange(5, 15):
        print(f'subsequence : {m}')
        central_radius, central_Ts_idx, central_subseq_idx = consensus_motif(Ts, m)
        consensus_pattern = Ts[central_Ts_idx][central_subseq_idx:central_subseq_idx+m]

        print(f'Consensus pattern : {*consensus_pattern,}')
        print(f'Radius {central_radius}')
        for divisor in range(1, 10):
            threshold = central_radius / divisor
            match_collection = {}
            for i, series in enumerate(Ts):
                distance_profile = search_pattern(series, consensus_pattern, max_distance=threshold)
                if distance_profile is None:
                    continue
                if z < len(distance_profile) < 6:
                    # NOTE(review): column 1 is presumably the match start
                    # index (get_patterns uses it as a slice offset) - confirm
                    # against search_pattern.
                    match_collection[i] = distance_profile[:, 1]

            # Number of series that contain an acceptable number of matches.
            k = len(match_collection)
            if k > 0:
                pr = k / n_series
                if 0.05 < pr < 0.20:
                    print(f'threshold:{round(threshold,4)}, percentage: {round(pr,4)}')
                    # setdefault keeps the thresholds already recorded for
                    # this m; re-creating the dict here would discard them.
                    all_patterns.setdefault(m, {})[f'threshold-{round(threshold,4)}'] = {
                        'consensus_pattern': consensus_pattern,
                        'patterns': match_collection,
                        'percentage': round(pr, 4),
                    }

            if n_series * 0.05 < k < p:
                print(f'adding : {*consensus_pattern,}')
                R.add(tuple(consensus_pattern))

    return R, all_patterns

def get_patterns(target_metric, all_data):
    """Mine consensus patterns for one metric and save them as JSON.

    Every repository whose ``target_metric`` series has more than 15 points
    is passed to ``approach1``. For each subsequence length and threshold
    reported by ``approach1`` the matched positions are converted into lists
    of formatted time stamps, and the result is written to
    ``../results/<target_metric>_result.json``.

    Args:
        target_metric: Key of the series to analyse inside each repository
            entry of ``all_data``.
        all_data: Mapping of repository name -> dict holding at least the
            ``target_metric`` series and a ``'time_stamps'`` sequence whose
            items provide ``strftime``.

    Returns:
        None. Nothing is written when no repository qualifies or when
        ``approach1`` reports no consensus pattern.
    """
    # Local import so the function does not depend on `json` leaking in
    # through `from utils import *`.
    import json

    Ts = []
    projects_names_map = {}
    for repo_name, repo_data in all_data.items():
        res = repo_data.get(target_metric)
        # Skip repositories without this metric or with too short a series.
        if res is not None and len(res) > 15:
            projects_names_map[len(Ts)] = repo_name
            Ts.append(res)

    if not Ts:
        return None

    p = len(Ts)*0.25
    z = 4
    resultR, allPatterns = approach1(Ts, p, z, d=0.3333)

    if len(resultR) == 0:
        return None

    overall_result = {}
    for m, sub_data in allPatterns.items():
        subsequence_result = {}
        for thrshld, thrshld_sub_data in sub_data.items():
            thrshld_result = {}
            for i, patterns_found in thrshld_sub_data['patterns'].items():
                repo_name = projects_names_map.get(i)
                time_stamps = all_data[repo_name]['time_stamps']
                # One entry per match: the time stamps covered by the match.
                thrshld_result[repo_name] = {
                    f'position {n}': [
                        stamp.strftime('%Y-%m-%d %H:%M:%S')
                        for stamp in time_stamps[start:start+m]
                    ]
                    for n, start in enumerate(patterns_found, 1)
                }
            # Summary fields live next to the per-repository entries.
            thrshld_result['percentage'] = thrshld_sub_data['percentage']
            thrshld_result['consensus_pattern'] = thrshld_sub_data['consensus_pattern'].tolist()
            subsequence_result[thrshld] = thrshld_result
        overall_result[str(m)] = subsequence_result

    print('Saving result ...')
    with open(f'../results/{target_metric}_result.json', 'w', encoding='utf-8') as fp:
        json.dump(overall_result, fp)

In [11]:
#!c1.8
# Load every metric once, then mine and save the patterns metric by metric.
target_metrics = ('total_removed', 'total_added', 'total_changed')
all_data = get_data(target_metrics=list(target_metrics))
print('Finished reading data...')
for metric in target_metrics:
    print(f'getting patterns for [{metric}]')
    get_patterns(metric, all_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group.sort_index(inplace=True)


total filtered repos : 464
Max commits in data : 1838
Finished reading data...
getting patterns for [total_removed]
subsequence : 5
Consensus pattern : (1.0, 0.0, 2.0, 1.0, 23.0)
Radius 0.31775781986202517
threshold:0.3178, percentage: 0.0819
adding : (1.0, 0.0, 2.0, 1.0, 23.0)
threshold:0.1589, percentage: 0.0582
adding : (1.0, 0.0, 2.0, 1.0, 23.0)
subsequence : 6
Consensus pattern : (29.0, 1.0, 3.0, 2.0, 2.0, 0.0)
Radius 0.497952205599897
threshold:0.498, percentage: 0.0948
adding : (29.0, 1.0, 3.0, 2.0, 2.0, 0.0)
threshold:0.249, percentage: 0.056
adding : (29.0, 1.0, 3.0, 2.0, 2.0, 0.0)
subsequence : 7
Consensus pattern : (76.0, 2.0, 2.0, 0.0, 6.0, 0.0, 0.0)
Radius 0.6612591119892433
threshold:0.6613, percentage: 0.0991
adding : (76.0, 2.0, 2.0, 0.0, 6.0, 0.0, 0.0)
threshold:0.3306, percentage: 0.0711
adding : (76.0, 2.0, 2.0, 0.0, 6.0, 0.0, 0.0)
threshold:0.2204, percentage: 0.0625
adding : (76.0, 2.0, 2.0, 0.0, 6.0, 0.0, 0.0)
subsequence : 8
Consensus pattern : (240.0, 30.0, 4.0,