# Setup

In [206]:
import numpy as np
import pandas as pd
import os
import math
import re
import copy
from pathlib import Path
from grepfunc import grep
from collections import Counter

# Specify git executable file for GitPython in Jupyter Notebook (In IDE, it can still work without this line.)
os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = "/usr/bin/git"

import git
from git import RemoteProgress

from git import Repo
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [207]:
class Progress(RemoteProgress):
    """Progress handler that echoes the current progress line to stdout.

    It is passed as the ``progress`` argument in the (commented-out)
    ``Repo.clone_from`` call further down in this notebook.
    """

    def update(self, op_code, cur_count, max_count=None, message=''):
        """Print ``self._cur_line``.

        ``op_code``, ``cur_count``, ``max_count`` and ``message`` are accepted
        to match the callback signature but are not used.
        """
        current_line = self._cur_line
        print(current_line)

In [244]:
# Repositories available for analysis. Each entry holds the remote URL, the
# local checkout directory and the commit used as the fixing commit below.
repos = {
    "bc-java": {
        "remote": "https://github.com/bcgit/bc-java",
        "local": "repo/bc-java",
        "commit": "9bc10bbaa9620d691c58e2b37f31f0d31ceea61f",
    },
    "spark": {
        "remote": "https://github.com/apache/spark",
        "local": "repo/spark",
        "commit": "8efc6e986554ae66eab93cd64a9035d716adbab0",
    },
    "cxf": {
        "remote": "https://github.com/apache/cxf",
        "local": "repo/cxf",
        "commit": "d9e2a6e7260ea12efa5355ffdfbf0b2415bccd14",
    },
}

# Key into `repos` selecting the repository analysed by the following cells.
name = "bc-java"

In [245]:
# Pull the settings of the selected repository out of the `repos` table.
selected = repos[name]
remote_link = selected["remote"]
local_link = selected["local"]
fixing_commit = selected["commit"]

# Uncomment to clone
# Repo.clone_from(remote_link, local_link, progress=Progress())

# Open the local checkout; it must already exist at `local_link`.
repo = Repo(local_link)


# Exercise 3

In [246]:
# Find, for every line deleted by the fixing commit, the commit that last
# touched that line in the parent revision (one entry per deleted line).
parent_rev = fixing_commit + "^"
commit_data = repo.commit(fixing_commit).diff(parent_rev)
files = []
deletion_commits = []

# Matches the old-file side of a "-U0" hunk header ("@@ -start[,count] ...").
# The negative lookahead skips hunks that delete nothing ("-start,0").
# Raw string so that "\d" is not treated as a (deprecated) string escape.
hunk_pattern = re.compile(r'(?<=@@ -)(?!\d+,0)\d+(,\d+)?')

for file in commit_data:
    diff_data = repo.git.diff(
        "-U0", "--ignore-blank-lines", parent_rev, fixing_commit, "--", file.b_path
    ).splitlines()
    for diff_line in diff_data:
        match = hunk_pattern.search(diff_line)
        if not match:
            continue
        ranges = [int(x) for x in match.group().split(',')]
        start = ranges[0]
        # A hunk header without an explicit count covers exactly one line.
        count = ranges[1] if len(ranges) > 1 else 1
        # Blame the deleted range as it existed in the parent revision.
        # The previous call, repo.blame("-L ..", path, parent_rev), handed the
        # "-L" option over as the revision and the parent rev as the truthy
        # `incremental` flag, so the blame did not run on the parent revision.
        blame_data = repo.blame_incremental(
            parent_rev, file.b_path, L="{},+{}".format(start, count)
        )
        for entry in blame_data:
            # A blame entry can span several consecutive lines; record the
            # commit once per line so every deleted line carries one vote.
            deletion_commits.extend([entry.commit] * len(entry.linenos))

for commit in deletion_commits:
    print(commit)

In [247]:
# For each file in the fixing commit's diff, gather the line numbers that
# blame attributes to the fixing commit itself (i.e. the affected lines).
diff_data = repo.commit(fixing_commit).diff(fixing_commit + "^")
files = [
    (
        file.b_path,
        [
            lineno
            for entry in repo.blame_incremental(fixing_commit, file.b_path)
            if str(entry.commit) == fixing_commit
            for lineno in entry.linenos
        ],
    )
    for file in diff_data
]

print(files)

[('core/src/main/java/org/bouncycastle/crypto/modes/GCMBlockCipher.java', [46, 180, 461, 528, 529, 530, 531, 532, 533])]


In [248]:
#find all line numbers that have been added by the fixing commit
#parse file into array of lines and add an empty string to the start so index matches with line number (readability)

# for each line in target_lines
# identify all lines within scope as target or old

    #mark target line as 'touched' so it won't be iterated on again
    #from target line iterate up
        #depth counter = 0
        #if the next line is a target line, mark as 'touched' in target_lines
        #else add commit of line to commits array
        #if you reach a '}' depth counter += 1
        #if you reach a '{' depth counter -= 1
        #if depth counter < 0
            #you have reached the start of the innermost scope
    #from target line iterate down
        #depth counter = 0
        #if the next line is a target line, mark as 'touched' in target_lines
        #else add commit of line to commits array
        #if you reach a '{' depth counter += 1
        #if you reach a '}' depth counter -= 1
        #if depth counter < 0
            #you have reached the end of the innermost scope
            
    #remove all same_scope_target_lines from target_lines
    #record blame commit for each line in same_scope_old_lines

#now you should have all the commit for every line that belongs to the innermost scope of a target line
#show most common commit/s
    

In [249]:
# Vote for the most likely originating commit (stored in VCC).
# Votes come from the blame of the deleted lines (deletion_commits) and from
# the blame of every line that shares the innermost brace scope with a line
# added by the fixing commit.

# Normalise every vote to a hex-sha string. deletion_commits may hold Commit
# objects while the scope scan records strings; mixing the two splits the
# votes for one commit across two Counter keys and defeats the
# `!= fixing_commit` filter below.
commits = [str(c) for c in deletion_commits]
inscope_lines = []


def _collect_scope(lines, start, step, pending_targets, commits, inscope_lines):
    """Walk from `start` (exclusive) until the enclosing brace scope is left.

    lines           -- list of (text, commit_sha) tuples, index == line number
                       (index 0 is padding and is never visited)
    start           -- line number of the target line the walk begins at
    step            -- -1 to walk upwards, +1 to walk downwards
    pending_targets -- set of target line numbers not yet handled; any that
                       are met during the walk are removed from it
    commits         -- list that receives the commit sha of each visited line
                       that is not a pending target
    inscope_lines   -- list that receives the text of those same lines
    """
    # Walking up, a '}' opens a nested scope and a '{' closes it; walking
    # down it is the other way round.
    enter, leave = ('}', '{') if step < 0 else ('{', '}')
    depth = 0
    idx = start + step
    while 0 < idx < len(lines) and depth >= 0:
        text, sha = lines[idx]
        if idx in pending_targets:
            pending_targets.discard(idx)
        else:
            commits.append(sha)
            inscope_lines.append(text)
        depth += text.count(enter)
        depth -= text.count(leave)
        idx += step


for path, target_lines in files:
    # Pad index 0 so that list indices equal 1-based line numbers.
    lines = [("", "notACommit")]
    for c, l in repo.blame(fixing_commit, path):
        for line in l:
            lines.append((line, str(c)))

    pending_targets = set(target_lines)
    for target_line in target_lines:
        # Skip targets already swallowed by an earlier target's scope.
        if target_line not in pending_targets:
            continue
        pending_targets.discard(target_line)
        # search upwards for opening scope
        _collect_scope(lines, target_line, -1, pending_targets, commits, inscope_lines)
        # search downwards for closing scope
        _collect_scope(lines, target_line, +1, pending_targets, commits, inscope_lines)

# removing any fixing_commits in case they were picked up
commits = [c for c in commits if c != fixing_commit]
# counting occurrences of commits
counter = Counter(commits)

# Pick the commit with the highest vote count. On a tie the commit seen last
# wins, which is what the previous loop-based selection did. VCC stays ""
# when there are no votes at all.
VCC = ""
if counter:
    top_votes = max(counter.values())
    VCC = [sha for sha, votes in counter.items() if votes == top_votes][-1]

print(VCC)

158b54fbacb8926df0e74086e95900924c862b6e


# Exercise 2

In [252]:
# From this point on, `fixing_commit` refers to the commit selected as VCC.
fixing_commit = VCC
# Fetch the subject (%s) and body (%b) of that commit, without the patch (-s).
show_output = repo.git.show("--pretty=format:%s\n%b", "-s", fixing_commit)
show_data = show_output.splitlines()
for text in show_data:
    print(text)

first cut of code


In [253]:
#Gets how many files were affected in the fixing commit
diff_data = repo.commit(fixing_commit).diff(fixing_commit + "^")
print(len(diff_data))

GitCommandError: Cmd('/usr/bin/git') failed due to: exit code(128)
  cmdline: /usr/bin/git diff-tree 158b54fbacb8926df0e74086e95900924c862b6e 158b54fbacb8926df0e74086e95900924c862b6e^ -r --abbrev=40 --full-index -M --raw --no-color

In [22]:
# Gets how many directories were affected in the commit: counts the lines
# that `git show --dirstat=files` prints for it (empty --pretty format, so no
# commit header text is requested).
dirstat_output = repo.git.show("--pretty=format:", "--dirstat=files", fixing_commit)
diff_data = dirstat_output.splitlines()
directory_count = len(diff_data)
print(directory_count)

2


In [23]:
# Gets how many lines were deleted.
# Each numstat row is "<added> <deleted> <path>". Binary files report "-"
# instead of numbers, and blank rows carry no fields at all; both are skipped
# rather than being fed to int().
stats = repo.git.show("--pretty=format:", "--numstat", fixing_commit).splitlines()
deleted = 0
for line in stats:
    fields = line.split()
    if len(fields) >= 2 and fields[1].isdigit():
        deleted += int(fields[1])
print(deleted)

27


In [20]:
# Gets how many lines were added.
# Each numstat row is "<added> <deleted> <path>". Binary files report "-"
# instead of numbers, and blank rows carry no fields at all; both are skipped
# rather than being fed to int().
stats = repo.git.show("--pretty=format:", "--numstat", fixing_commit).splitlines()
added = 0
for line in stats:
    fields = line.split()
    if len(fields) >= 2 and fields[0].isdigit():
        added += int(fields[0])
print(added)

9


In [22]:
# Gets how many lines were deleted excluding comments and blank lines.

# Id of git's empty tree (SHA-1 repositories); used as the "before" side when
# the commit has no parent, because "<sha>^" cannot be resolved then.
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
parents = repo.commit(fixing_commit).parents
base_rev = parents[0].hexsha if parents else EMPTY_TREE_SHA

diff_data = repo.git.diff("-U0", "--ignore-blank-lines", base_rev, fixing_commit).splitlines()
# Raw string so that "\-" is not treated as a (deprecated) string escape.
deleted_lines = grep(diff_data, r"^\-")
deleted_wo = 0
for line in deleted_lines:
    # "--- a/path" / "--- /dev/null" are file headers, not deleted content.
    if line.startswith("--- "):
        continue
    # Strip the diff marker before inspecting the content. Tokenising on
    # whitespace (as before) dropped unindented lines such as "-}" and treated
    # "-a //b" as a comment.
    content = line[1:].strip()
    # NOTE(review): only "//" line comments are excluded; block comments
    # ("/* ... */") are still counted as code.
    if content and not content.startswith("//"):
        deleted_wo += 1
print(deleted_wo)

0


In [31]:
# Gets how many lines were added excluding comments and blank lines.

# Id of git's empty tree (SHA-1 repositories); used as the "before" side when
# the commit has no parent, because "<sha>^" cannot be resolved then.
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
parents = repo.commit(fixing_commit).parents
base_rev = parents[0].hexsha if parents else EMPTY_TREE_SHA

diff_data = repo.git.diff("-U0", "--ignore-blank-lines", base_rev, fixing_commit).splitlines()
# Raw string so that "\+" is not treated as a (deprecated) string escape.
added_lines = grep(diff_data, r"^\+")
added_wo = 0
for line in added_lines:
    # "+++ b/path" / "+++ /dev/null" are file headers, not added content.
    if line.startswith("+++ "):
        continue
    # Strip the diff marker before inspecting the content. Tokenising on
    # whitespace (as before) dropped unindented lines such as "+}" and treated
    # "+a //b" as a comment.
    content = line[1:].strip()
    # NOTE(review): only "//" line comments are excluded; block comments
    # ("/* ... */") are still counted as code.
    if content and not content.startswith("//"):
        added_wo += 1
print(added_wo)

8


In [14]:
# Days between the commit under analysis and the previous change of each file.
commit_obj = repo.commit(fixing_commit)
# A root commit has no parent to diff against; use the empty tree instead.
base = commit_obj.parents[0] if commit_obj.parents else git.NULL_TREE
diff_data = commit_obj.diff(base)
for file in diff_data:
    # Committer timestamps (%ct, seconds since the epoch), newest first.
    timestamps = repo.git.log("--pretty=format:%ct", fixing_commit, "--", file.b_path).splitlines()
    if len(timestamps) < 2:
        # The file has no earlier change (e.g. it was introduced by this commit).
        print("n/a " + file.b_path)
        continue
    # Subtract the integer timestamps first so that float rounding cannot push
    # an exact number of days over the next integer before ceil().
    elapsed_seconds = int(timestamps[0]) - int(timestamps[1])
    print(str(math.ceil(elapsed_seconds / 86400)) + " " + file.b_path)

68 core/src/main/java/org/bouncycastle/crypto/modes/GCMBlockCipher.java


In [15]:
# Number of times each file has been previously changed (following renames).
commit_obj = repo.commit(fixing_commit)
parents = commit_obj.parents
# A root commit has no parent to diff against; use the empty tree instead.
diff_data = commit_obj.diff(parents[0] if parents else git.NULL_TREE)
for file in diff_data:
    if parents:
        # History of the file as seen from the parent, i.e. before this commit.
        commits = repo.git.log(
            "--follow", "--pretty=format:%ct", parents[0].hexsha, "--", file.b_path
        ).splitlines()
    else:
        # Nothing precedes a root commit.
        commits = []
    print(str(len(commits)) + " " + file.b_path)

10 core/src/main/java/org/bouncycastle/crypto/modes/GCMBlockCipher.java


In [10]:
# List of authors who have modified each file.
commit_obj = repo.commit(fixing_commit)
# A root commit has no parent to diff against; use the empty tree instead.
base = commit_obj.parents[0] if commit_obj.parents else git.NULL_TREE
diff_data = commit_obj.diff(base)

# Union of the authors of all files; the per-author commit count cell reads it.
authors = set()
for file in diff_data:
    print(file.b_path + ":")
    # %an is the author name. The previous %cn printed committer names, which
    # do not match the "--author=" filter used for the per-author counts.
    # "--" keeps git from trying to read the path as a revision.
    file_authors = set(
        repo.git.log("--pretty=format:%an", "-s", "--", file.b_path).splitlines()
    )
    authors |= file_authors
    # Sorted so the printed order is the same on every run.
    for author in sorted(file_authors):
        print(author)

ValueError: SHA b'b3aba60f32022bf57728800fd9e9a21ac7198813' could not be resolved, git returned: b'b3aba60f32022bf57728800fd9e9a21ac7198813 missing'

In [9]:
# Number of commits in the repository per author, most active first.
# Requires `authors` from the authors cell.
tuples = []
# Sorted so that authors with equal counts come out in a stable order.
for author in sorted(authors):
    log = repo.git.log("--pretty=format:%h", "--author=" + author)
    # `log` is a single string with one abbreviated hash per line; count the
    # lines. len(log) would count characters instead of commits.
    tuples.append((len(log.splitlines()), author))
tuples.sort(key=lambda x: x[0], reverse=True)
# Unpack into named variables rather than shadowing the builtin `tuple`.
for commit_count, author_name in tuples:
    print(author_name + ": " + str(commit_count))

NameError: name 'authors' is not defined