# TOM pattern recognition: Extracting pull request data from found patterns


**Author**: Gcinizwe Dlamini

<hr>

## Import libraries

In [1]:
import pandas as pd
import numpy as np

import json, zipfile

## 1. Read pull request data

In [2]:
# CSV member inside the zip archive and the only columns needed from it.
target_file = 'tom_pull_info.csv'
cols_to_select = ['repo_fullname', 'state', 'title', "created_at_ext", "updated_at_ext"]

# Context managers close both the archive and the member stream as soon as the
# DataFrame is loaded, so no file handle stays open for the rest of the session.
with zipfile.ZipFile('../Data/tom_sample_data.zip') as zf, zf.open(target_file) as csv_member:
    pull_rq_data = pd.read_csv(
        csv_member,
        usecols=cols_to_select,
        parse_dates=["created_at_ext", "updated_at_ext"],
    )

In [3]:
# Set of repository names that have pull request data, for O(1) membership tests.
repos_in_pr = set(pull_rq_data["repo_fullname"])

## 2. Read found patterns data

In [4]:
def read_found_pattens(data_path='../results/total_added_result.json'):
    """Load a found-patterns result file and return its parsed JSON content.

    Parameters
    ----------
    data_path : str
        Path of the JSON file to read.

    Returns
    -------
    The object decoded by ``json.load`` from the file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which may not be UTF-8 and would then mangle non-ASCII text.
    with open(data_path, encoding='utf-8') as json_file:
        return json.load(json_file)

## 3. Match pull request data to found patterns

In [5]:
def get_pr_in_patterns(target_metric):
    """Match pull requests to the found patterns of one metric and dump them to JSON.

    Reads ``../results/{target_metric}_result.json`` and, for every repository
    that is also present in ``repos_in_pr``, collects the pull requests whose
    ``created_at_ext`` lies strictly between a pattern's first timestamp and
    its last timestamp plus 7 days. The matches are written to
    ``../results/pr_{target_metric}_result.json`` with the structure
    ``{key: {repo_name: {pr_title: 'YYYY-mm-dd HH:MM:SS'}}}``.

    Only the first entry found under each top-level key is used. Because
    titles are used as dictionary keys, pull requests of one repository that
    share a title collapse into a single entry (the last one collected wins).

    Parameters
    ----------
    target_metric : str
        Prefix of the result file name, e.g. ``'total_added'``.
    """
    # Read target metric data (found patterns)
    data = read_found_pattens(data_path=f'../results/{target_metric}_result.json')

    result = {}
    for key, by_threshold in data.items():
        # A key without any threshold entry has nothing to match; skip it
        # instead of failing on the "first entry" lookup below.
        if not by_threshold:
            continue
        thr = next(iter(by_threshold))

        temp = {}
        for repo_name, patterns in by_threshold[thr].items():
            if repo_name not in repos_in_pr:
                continue

            # Restrict to this repository once; every pattern reuses the slice.
            repo_prs = pull_rq_data[pull_rq_data['repo_fullname'] == repo_name]
            created = repo_prs['created_at_ext']

            pr_list = []
            pr_create_dates = []
            for ttstamps in patterns.values():
                # A pattern without timestamps defines no time window.
                if not ttstamps:
                    continue
                start = pd.Timestamp(ttstamps[0])
                end = pd.Timestamp(ttstamps[-1]) + pd.Timedelta(days=7)

                # Keep pull requests created after the pattern start and
                # before the pattern end + 7 days (both bounds exclusive).
                found = repo_prs[(created > start) & (created < end)]

                pr_list += found['title'].tolist()
                pr_create_dates += [
                    created_at.strftime('%Y-%m-%d %H:%M:%S')
                    for created_at in found['created_at_ext']
                ]

            if pr_list:
                temp[repo_name] = dict(zip(pr_list, pr_create_dates))

        if temp:
            result[key] = temp

    # Dump results to JSON
    with open(f'../results/pr_{target_metric}_result.json', 'w', encoding='utf-8') as fp:
        json.dump(result, fp)

In [6]:
# Write one pr_<metric>_result.json file for each metric.
for target_metric in (
    'total_added',
    'total_changed',
    'total_removed',
):
    get_pr_in_patterns(target_metric)