## Git repository accessor

In [995]:
import os
import git
from git import GitCommandError
import time
from itertools import chain
import re

In [996]:
class Repo(object):
    """
    Accessor around a GitPython repository.

    It collects per-hunk diff records (dicts ready to be stored as MongoDB
    documents) and traces the lines removed by a commit back to the commits
    that git blame reports for them.
    """
    # strftime pattern used for the 'date' field of every record
    _TIME_FORMAT = '%y-%m-%d-%H:%M'

    def __init__(self, repo_path):
        """
        @param repo_path is the path handed to git.Repo
        """
        self._repo = git.Repo(repo_path)

    @classmethod
    def _format_commit_info(cls, file, diff, hash_, date, message, committer, label=None):
        """
        _format_commit_info creates a dict from one piece of commit log
        information, in the shape inserted as a MongoDB document.

        @param file is a filepath in the commit log
        @param diff is a difference position in the commit log
        @param hash_ is a hash string indicating a commit
        @param date is a committed date
        @param message is a commit message
        @param committer is the name of the person who created this commit
        @param label is the Bug/Non_Bug label of the commit
        @return dict type object
        """
        return {
            'file': file,
            'diff': diff,
            'hash': hash_,
            'date': date,
            'message': message,
            'committer': committer,
            'label': label,
        }

    def filter_commit_by_message(self, keyword):
        """
        Return every commit whose message contains keyword.

        The comparison ignores case: both the keyword and the commit message
        are upper-cased first, so 'fix' and 'FIX' select the same commits.

        @param keyword is the text searched for in the commit message
        @return list of commit objects yielded by the repository
        """
        needle = keyword.upper()
        return [
            c
            for c in self._repo.iter_commits()
            if needle in c.message.upper()
        ]

    def find_commit(self, commit_id):
        """
        Look up a single commit.

        @param commit_id is the revision handed to the repository's commit()
        @return the commit object
        """
        return self._repo.commit(commit_id)

    def find_blame(self, commit_id, filepath):
        """
        Blame filepath as of the first parent of commit_id.

        @param commit_id is the commit whose first parent is blamed
        @param filepath is the file to blame
        @return the blame result, or an empty list when the commit has no
                parent or git reports a command error
        """
        try:
            parents = self.find_commit(commit_id).parents
            if not parents:
                # a root commit has no earlier state to blame
                return []
            return self._repo.blame(parents[0], filepath)
        except GitCommandError:
            return []

    def split_diff_block(self, diff_list, splitter=r'@@.*?@@'):
        """
        Split the lines of a patch into blocks.

        A new block starts at every line that matches splitter at its
        beginning. Lines in front of the first matching line are dropped.
        When no line matches at all, diff_list itself is yielded once
        (this includes an empty diff_list).

        @param diff_list is a list of patch lines
        @param splitter is the regular expression marking a block start
        @return generator of line lists
        """
        is_block_start = re.compile(splitter).match
        start = None
        for num, line in enumerate(diff_list):
            if is_block_start(line):
                if start is not None:
                    yield diff_list[start:num]
                start = num
        if start is None:
            yield diff_list
        else:
            yield diff_list[start:]

    def create_diff_info(self, commit_id, filter_fn=None, convertor_fn=None,
                       label=None, coding='utf-8', splitter='\n', delimiter='\n'):
        """
        Create the list of diff records, formatted by _format_commit_info(),
        for the change between the first parent of commit_id and commit_id.
        One record is produced per diff block of every accepted file.

        @param commit_id is the commit to describe
        @param filter_fn is an optional predicate on the file path; files for
               which it returns a false value are skipped
        @param convertor_fn is an optional function applied to the line list
               of each block (see convert_diff_info)
        @param label is stored unchanged in every record
        @param coding is the encoding used to decode the patch bytes
        @param splitter is the separator used to cut the patch into lines
        @param delimiter is the separator used to join a block's lines again
        @return list of dicts; empty when the commit has no parent
        """
        commit = self.find_commit(commit_id)
        if not commit.parents:
            # a root commit has no parent to diff against
            return []
        parent = commit.parents[0]
        # the date is the same for every record of this commit
        date = time.strftime(self._TIME_FORMAT, time.gmtime(commit.committed_date))

        records = []
        for d in parent.diff(commit, create_patch=True):
            if d.b_path is None:
                continue
            if filter_fn is not None and not filter_fn(d.b_path):
                continue
            patch_lines = d.diff.decode(coding).split(splitter)
            for block in self.split_diff_block(patch_lines):
                records.append(self._format_commit_info(
                    file=d.b_path,
                    diff=delimiter.join(self.convert_diff_info(block, convertor_fn)),
                    hash_=commit.hexsha,
                    date=date,
                    message=commit.message,
                    committer=commit.committer.name,
                    label=label
                ))
        return records

    def convert_diff_info(self, diff, convertor_fn=None):
        """
        Return diff converted by convertor_fn, or diff itself when no
        convertor_fn is given.
        """
        if convertor_fn is None:
            return diff
        return convertor_fn(diff)

    def find_previous_commit(self, commit_id, filepath, diffs, regex=r'^@@ -([0-9]*),([0-9]*).*?@@'):
        """
        Find the blamed commits around the position replaced by each diff.

        For every diff whose text matches regex, group 1 is read as the start
        line and group 2 as the line count of the old side. Every blame entry
        that does not end before start and does not begin after
        start + count is yielded. Diffs that do not match regex are skipped.

        @param commit_id is the commit whose first parent is blamed
        @param filepath is the blamed file
        @param diffs is an iterable of diff block strings
        @param regex extracts the old start line and line count
        @return generator of (commit, lines) pairs taken from the blame
        """
        blame_list = self.find_blame(commit_id, filepath)
        header = re.compile(regex)
        for diff in diffs:
            matching = header.search(diff)
            if matching is None:
                continue
            start = int(matching.group(1))
            end = start + int(matching.group(2))
            tcl = 0  # number of blamed lines in front of the current entry
            for commit, lines in blame_list:
                if tcl + len(lines) >= start and tcl <= end:
                    yield (commit, lines)
                tcl += len(lines)

    def find_bug_commit(self, commit_list, key, splitter='\n'):
        """
        Search the commits that introduced the lines removed by commit key.

        @param commit_list is a dict {hash: {filepath: [diff string, ...]}}
        @param key is the hash whose files are inspected
        @param splitter is the separator used to cut a diff into lines
        @return generator; for every diff with at least one hit it yields the
                de-duplicated list of blamed commits that contain one of the
                removed lines
        """
        for filepath, diffs in commit_list[key].items():
            # build each line set once instead of once per removed line
            prev_commits = [
                (commit, set(lines))
                for commit, lines in self.find_previous_commit(key, filepath, diffs)
            ]
            for diff in diffs:
                # removed lines without their leading '-'; a bare '-' is skipped
                bug_lines = [
                    l[1:]
                    for l in diff.split(splitter)
                    if l.startswith('-') and l != '-'
                ]
                commits = {
                    commit
                    for bug_line in bug_lines
                    for commit, line_set in prev_commits
                    if bug_line in line_set
                }
                if commits:
                    yield list(commits)

## How to use
1. Clone the GitHub project into ~k5user/jupyter/ai_working/repos
1. Create a Repo instance
1. Extract non-bug commit information
1. Extract bug commit information
1. Save the commit information to MongoDB

In [997]:
# Load the repository: wrap the RxJava clone in the Repo accessor.
# The path is relative, so it is resolved against the current working directory.
repo = Repo('../repos/RxJava/')

In [976]:
# search non_bug_commit
# The keyword is written in upper case because filter_commit_by_message
# compares it against the upper-cased commit message.
commit_list = repo.filter_commit_by_message('FIX')
# one list of per-hunk records for every matching commit; only .java files
# whose path does not contain 'test' or 'Test' are kept
non_bug_list = [
    repo.create_diff_info(
        commit,
        filter_fn=lambda x: x.endswith('.java') and re.match(r'^.*(test|Test).*$', x) is None,
        #convertor_fn=lambda x: [e for e in x if not e.startswith('-')],
        label='non_bug'
    )
    for commit in commit_list
]
# flatten the per-commit lists into one list of records
non_bug_commits = list(chain.from_iterable(non_bug_list))

In [977]:
# search bug_commit
def create_commit_data(commits):
    """
    Group flat diff records by commit hash and file.

    This function is used to reduce the processing overhead of
    find_bug_commit: all diffs of one file of one commit end up together.

    commits is traversed only once, so any iterable (including a generator)
    is accepted.

    @param commits is an iterable of dicts with 'hash', 'file' and 'diff' keys
    @return dict of the form {hash: {file: [diff, ...]}}
    """
    commit_list = {}
    for c in commits:
        files = commit_list.setdefault(c['hash'], {})
        files.setdefault(c['file'], []).append(c['diff'])
    return commit_list


# regroup the fix-commit records as {hash: {file: [diff, ...]}}
bug_commit_dict = create_commit_data(non_bug_commits)

# for every fix commit: the groups of earlier commits blamed for its removed lines
bug_commit_list_no_refactor = [
    list(repo.find_bug_commit(bug_commit_dict, commit_hash))
    for commit_hash in bug_commit_dict
]

# two levels of nesting (per fix commit, per diff) are flattened, then de-duplicated
bug_commit_list = list(
    set(chain.from_iterable(chain.from_iterable(bug_commit_list_no_refactor)))
)

# per-hunk records of every blamed commit, restricted to non-test .java files
bug_list = [
    repo.create_diff_info(
        blamed_commit,
        label='bug',
        # convertor_fn=lambda x: [e for e in x if not e.startswith('-')] would drop removed lines
        filter_fn=lambda x: x.endswith('.java') and re.match('^.*(test|Test).*$', x) is None,
    )
    for blamed_commit in bug_commit_list
]

bug_commits = list(chain.from_iterable(bug_list))

## MongoDB accessor

In [907]:
from pymongo import MongoClient

In [951]:
class CommitDB(object):
    """
    Small wrapper around a single MongoDB collection that stores commit
    records and refuses documents carrying unexpected keys.
    """
    # the complete set of keys a document may carry
    _DOCUMENT_KEYS = {
        '_id', 'file', 'diff', 'hash', 'date',
        'message', 'committer', 'label'
    }

    @classmethod
    def check_correct_document(cls, document):
        """
        Check that document is a dict whose keys all belong to _DOCUMENT_KEYS.

        Missing keys are accepted; only unknown keys are rejected.

        @param document is the candidate document
        @return True when the document may be stored, otherwise False
        """
        if not isinstance(document, dict):
            return False
        return document.keys() <= cls._DOCUMENT_KEYS

    def __init__(self, host, ip, db_name, collection_name):
        """
        @param host is the first argument handed to MongoClient
        @param ip is the second argument handed to MongoClient
               (NOTE(review): despite its name this is used as the port,
               e.g. 27017 -- the name is kept for compatibility)
        @param db_name is the database name
        @param collection_name is the collection name inside that database
        """
        self._client = MongoClient(host, ip)
        self._db = self._client[db_name]
        self._collection = self._db[collection_name]

    def insert(self, document):
        """
        Insert one document.

        @return the result of insert_one, or None when the document is
                rejected by check_correct_document
        """
        if not self.check_correct_document(document):
            return None
        return self._collection.insert_one(document)

    def insert_all(self, documents):
        """
        Insert several documents in one call, all or nothing.

        @param documents is any iterable of documents; it is copied into a
               list first so that a generator is not used up by validation
        @return the result of insert_many, or None when there is nothing to
                insert or at least one document is rejected
        """
        documents = list(documents)
        if not documents:
            # insert_many does not accept an empty batch
            return None
        if not all(self.check_correct_document(doc) for doc in documents):
            return None
        return self._collection.insert_many(documents)

    def get(self, **kw):
        """
        Query the collection, using the keyword arguments as the filter.

        @return whatever the collection's find() returns
        """
        return self._collection.find(kw)

In [964]:
# Connect to MongoDB with MongoClient('localhost', 27017) and select
# database 'test', collection 'test_dataset'.
db = CommitDB('localhost', 27017, 'test', 'test_dataset')

In [None]:
# Insert commit information: store the first 100 non-bug records.
# insert_all returns None without inserting anything if any record is rejected.
db.insert_all(non_bug_commits[:100])

In [None]:
# get() without keyword arguments queries with an empty filter;
# print every document the query returns.
for doc in db.get():
    print(doc)

In [986]:
# Sanity check: number of 'bug' records and the diff text of the record at index 20.
print(len(bug_commits))
print(bug_commits[20]['diff'])

40353
@@ -34,7 +34,7 @@ import rx.observables.Notification;
 import rx.observables.Observable;
 import rx.observables.Observer;
 import rx.observables.Subscription;
-import rx.util.FunctionLanguageAdaptor;
+import rx.util.functions.FunctionLanguageAdaptor;
 
 public class JRubyAdaptor implements FunctionLanguageAdaptor {
 



In [1006]:
# Sanity check: number of 'non_bug' records and the diff text of the record at index 20.
print(len(non_bug_commits))
print(non_bug_commits[20]['diff'])

12275
@@ -220,6 +217,7 @@ public final class FlowableGroupBy<T, K, V> extends AbstractFlowableWithUpstream
                     evictedGroups.clear();
                 }
                 done = true;
+                finished = true;
                 drain();
             }
         }


In [1004]:
# Count the commits whose upper-cased message contains 'BUG'.
len(repo.filter_commit_by_message('BUG'))# + len(repo.filter_commit_by_message('fix'))

169

In [1001]:
# Count the commits whose upper-cased message contains 'CLEAN'.
len(repo.filter_commit_by_message('CLEAN'))

163