In [16]:
import numpy as np
import pandas as pd
import git
import os
import re

from tqdm import tqdm

DATA_PATH = 'data'

In [17]:
patterns_df = pd.read_csv(os.path.join(DATA_PATH, 'patterns.csv'))
patterns_df.head()

Unnamed: 0,Pattern size,Pattern ID,Pattern frequency,Pattern repos,Pattern commits,Sample ID,Repository,Commit hash,Datetime
0,36,2668,3,1,1,23314597,https://github.com/deepmipt/DeepPavlov.git,0601468959e72806a2e55dfdb015c90d0d2926b5,07.08.2018 22:28:46
1,36,2668,3,1,1,23314595,https://github.com/deepmipt/DeepPavlov.git,0601468959e72806a2e55dfdb015c90d0d2926b5,07.08.2018 22:28:46
2,36,2668,3,1,1,23314596,https://github.com/deepmipt/DeepPavlov.git,0601468959e72806a2e55dfdb015c90d0d2926b5,07.08.2018 22:28:46
3,36,271,3,1,1,4228010,https://github.com/OpenMined/PySyft.git,641e3fe14ff5b3d94b88929079fc5a3b8f2df8d1,25.04.2019 12:38:10
4,36,271,3,1,1,4228007,https://github.com/OpenMined/PySyft.git,641e3fe14ff5b3d94b88929079fc5a3b8f2df8d1,25.04.2019 12:38:10


In [18]:
repo_urls = list(patterns_df['Repository'].unique())
patterns_df['Repository'].value_counts()

https://github.com/sagemath/sage.git                         2239
https://github.com/biolab/orange3.git                        1896
https://github.com/scikit-learn/scikit-learn.git             1420
https://github.com/matplotlib/matplotlib.git                 1409
https://github.com/quodlibet/quodlibet.git                   1167
                                                             ... 
https://github.com/clips/pattern.git                           10
https://github.com/MechanicalSoup/MechanicalSoup.git            9
https://github.com/alfredfrancis/ai-chatbot-framework.git       8
https://github.com/fredrik-johansson/mpmath.git                 1
https://github.com/pydata/numexpr.git                           1
Name: Repository, Length: 87, dtype: int64

In [None]:
%%time

FIX_MSG_PATTERN = r'fix|error|bug|issue|mistake|incorrect|fault|defect|flaw'

def check_if_bugfix(row):
    msg = commit_msg_by_hash.get(row['Commit hash'], None)
    return msg is not None and re.search(FIX_MSG_PATTERN, msg, re.IGNORECASE) is not None

fixes_dfs = []

for url in tqdm(repo_urls):
    repo_name = url.rpartition('/')[2][:-4]
    try:
        repo = git.Repo(os.path.join(DATA_PATH, 'repos', repo_name))
        print(f'Repository "{repo_name}" succesfully opened')
    except Exception:
        repo = git.Repo.clone_from(url, os.path.join(DATA_PATH, 'repos', repo_name))
        print(f'Repository "{repo_name}" succesfully cloned')
    
    commit_msg_by_hash = {}
    for commit in repo.iter_commits():
        commit_msg_by_hash[commit.binsha.hex()] = commit.message
    print(f'Messages by commit hash extracted')
    
    current_df = patterns_df[patterns_df['Repository'] == url]
    current_fixes_df = current_df[current_df.apply(check_if_bugfix, axis=1)]
    fixes_dfs.append(current_fixes_df)
    
fixes_df = pd.concat(fixes_dfs)
fixes_df.to_csv(os.path.join(DATA_PATH, 'fix_patterns.csv'))

### Postprocessing


In [51]:
fix_patterns_df = pd.read_csv(os.path.join(DATA_PATH, 'fix_patterns.csv'), index_col=0)
fix_patterns_df.shape, patterns_df.shape

((7960, 9), (28481, 9))

In [52]:
pids = patterns_df['Pattern ID'].unique()
total_fix_pids = []

for pid in tqdm(pids):
    all_cnt = patterns_df[patterns_df['Pattern ID'] == pid].shape[0]
    fix_cnt = fix_patterns_df[patterns_df['Pattern ID'] == pid].shape[0]
    if all_cnt == fix_cnt:
        total_fix_pids.append(pid)

  
100%|██████████| 5690/5690 [00:14<00:00, 403.12it/s]


In [64]:
total_fix_patterns_df = fix_patterns_df[fix_patterns_df['Pattern ID'].isin(set(total_fix_pids))]
total_fix_patterns_df.to_csv(os.path.join(DATA_PATH, 'absolute_fix_patterns.csv'))

In [65]:
patterns_df['Pattern ID'].nunique(), fix_patterns_df['Pattern ID'].nunique(), total_fix_patterns_df['Pattern ID'].nunique()

(5690, 2046, 1248)