In [1]:
import os
import pandas as pd
import json
import git
from tqdm import tqdm
import ast
import networkx as nx

# Root folder that holds one sub-folder of refactoring results per project.
PATH_RFT = os.path.join(
    "dataset",
    "result_refactoring",
)


In [2]:
# Names of the analysed projects; each one is also the name of its result folder
# and the prefix of its JSON files under PATH_RFT.
projects = [
    "cayenne",
    "commons-collections",
    "cucumber-jvm",
    "cxf",
    "httpcomponents-client",
    "iotdb",
    "jclouds",
    "kylin",
    "maven",
    "opennlp",
    "ranger",
    "ratis",
    "wicket",
]

# Building Evolution Graph

### Verify the left/right mapping prior to building the evolution graph

In [16]:
def _add_evolution_edge(G, l_node, r_node, commit_sha, description):
    """Register both files as graph nodes and connect them with one refactoring edge.

    Nodes are keyed by file path; their ``label`` is overwritten with the current
    commit on every call, so a node always carries the label of the last commit
    that touched it.
    """
    edge_text = commit_sha+"::"+description
    G.add_node(l_node["filePath"],label=commit_sha+"::"+l_node["filePath"])
    G.add_node(r_node["filePath"],label=commit_sha+"::"+r_node["filePath"])
    G.add_edge(l_node["filePath"],r_node["filePath"],weight=edge_text,title=edge_text,commit=commit_sha,arrowStrikethrough=True)


def get_relevant_rft(project,current_rfts,commit_sha,G):
    """Add the class-level refactorings of one commit to the evolution graph ``G``.

    Only locations whose ``codeElementType`` is ``"TYPE_DECLARATION"`` are considered.
    Depending on the (lower-cased) refactoring type:

    * rename/move/extract class, extract subclass, rename package, move source folder
      and any "class extension" / "class implementation" refactoring connect the
      left and right type declarations pairwise (``zip``, so surplus entries on the
      longer side are ignored);
    * "extract superclass" connects every left declaration to every right declaration;
    * every other type adds nothing to the graph.

    Parameters
    ----------
    project : str
        Project name, only copied into the returned rows.
    current_rfts : list of dict
        Refactorings of the commit; each needs ``type``, ``description``,
        ``leftSideLocations`` and ``rightSideLocations``.
    commit_sha : str
        Commit identifier, used as prefix of node labels and edge weights.
    G : graph
        Object offering ``add_node`` / ``add_edge`` (a networkx graph); mutated in place.

    Returns
    -------
    (found, G)
        ``found`` has one row ``[project, commit_sha, "<sha>::<lower-cased description>",
        n_left, n_right]`` for EVERY refactoring passed in (not only the ones that
        produced edges); ``G`` is the same graph object that was passed in.
    """
    # Substrings of the lower-cased type that select the pairwise (1-to-1) mapping.
    # "rename packag" is kept verbatim from the original matching rule.
    pairwise_markers = ("rename class", "move class", "move source folder",
                        "move and rename class", "extract class", "extract subclass",
                        "rename packag", "class extension", "class implementation")

    found = []
    for r in current_rfts:
        description = r["description"].lower()
        left = [i for i in r["leftSideLocations"] if "TYPE_DECLARATION" == i["codeElementType"]]
        right = [i for i in r["rightSideLocations"] if "TYPE_DECLARATION" == i["codeElementType"]]
        rft_type = r["type"].lower()

        if any(marker in rft_type for marker in pairwise_markers):
            pairs = zip(left,right)
        elif "extract superclass" in rft_type:
            # x-to-1 mapping: every original class points at the extracted superclass.
            pairs = ((l_node,r_node) for l_node in left for r_node in right)
        else:
            pairs = ()

        for l_node,r_node in pairs:
            _add_evolution_edge(G,l_node,r_node,commit_sha,description)

        found.append([project,commit_sha,commit_sha+"::"+description,len(left),len(right)])
    return found,G
                
    
def checking_mapping_for_verfication():
    """Collect the left/right location counts of every refactoring of every project.

    For each project in ``projects`` the ``<project>_refactoring_history.json`` file
    under ``PATH_RFT`` is loaded, its commit list is reversed, and each commit is
    passed through ``get_relevant_rft``. The graph built on the way is thrown away;
    only the per-refactoring rows are kept.

    Returns
    -------
    list of list
        Rows ``[project, sha, "<sha>::<description>", n_left, n_right]`` for all
        projects, in processing order.
    """
    result = []
    for project in tqdm(projects):
        print("build project: ",project)
        history_path = os.path.join(PATH_RFT,project,project+"_refactoring_history.json")
        # Context manager so the file handle is closed as soon as the JSON is parsed.
        with open(history_path) as history_file:
            commits = json.load(history_file)["commits"]
        # Visit the commits in the opposite order of the file
        # (presumably oldest first - TODO confirm the file's ordering).
        commits.reverse()

        G = nx.MultiDiGraph()
        for commit_obj in commits:
            rfts,_ = get_relevant_rft(project,commit_obj["refactorings"],commit_obj["sha1"],G)
            # extend() appends in place; "result = result + rfts" re-copied the
            # whole list for every commit.
            result.extend(rfts)

    return result

result = checking_mapping_for_verfication()



  0%|                                                                                                                                                                                      | 0/13 [00:00<?, ?it/s]

build project:  cayenne


  8%|█████████████▍                                                                                                                                                                | 1/13 [00:00<00:10,  1.10it/s]

build project:  commons-collections


 15%|██████████████████████████▊                                                                                                                                                   | 2/13 [00:02<00:16,  1.52s/it]

build project:  cucumber-jvm


 23%|████████████████████████████████████████▏                                                                                                                                     | 3/13 [00:04<00:16,  1.70s/it]

build project:  cxf


 31%|█████████████████████████████████████████████████████▌                                                                                                                        | 4/13 [00:20<01:05,  7.26s/it]

build project:  httpcomponents-client


 38%|██████████████████████████████████████████████████████████████████▉                                                                                                           | 5/13 [00:28<00:59,  7.39s/it]

build project:  iotdb


 46%|████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 6/13 [00:44<01:13, 10.52s/it]

build project:  jclouds


 54%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 7/13 [01:36<02:24, 24.11s/it]

build project:  kylin


 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 8/13 [02:33<02:51, 34.33s/it]

build project:  maven


 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 9/13 [04:01<03:24, 51.18s/it]

build project:  opennlp


 77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 10/13 [04:16<01:59, 39.97s/it]

build project:  ranger


 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 11/13 [04:44<01:12, 36.35s/it]

build project:  ratis


 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 12/13 [04:54<00:28, 28.24s/it]

build project:  wicket


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [08:10<00:00, 37.73s/it]


In [20]:
# Show the collected mapping rows as a table (last expression = cell output).
pd.DataFrame(
    result,
    columns=["p", "c", "rft", "len_left", "len_right"],
)

Unnamed: 0,p,c,rft,len_left,len_right
0,cayenne,1d6568674ba2205448500ab134778e990aa69c96,1d6568674ba2205448500ab134778e990aa69c96::move...,0,0
1,cayenne,c4b528b30a0e284ff69f93b43527aecf5cd3d6d6,c4b528b30a0e284ff69f93b43527aecf5cd3d6d6::move...,0,0
2,cayenne,e463112fce75abc74ea500709c109f7a21abb1c0,e463112fce75abc74ea500709c109f7a21abb1c0::move...,0,0
3,cayenne,6f8365b78de6c5dca83a9b2f54a119072e9c8e76,6f8365b78de6c5dca83a9b2f54a119072e9c8e76::move...,0,0
4,cayenne,56d93dcdec85240157a95d418722ce974f4f7e26,56d93dcdec85240157a95d418722ce974f4f7e26::move...,0,0
...,...,...,...,...,...
465658,wicket,41689ce8ec1b41806d8cb61e08abf6227f421531,41689ce8ec1b41806d8cb61e08abf6227f421531::chan...,0,0
465659,wicket,41689ce8ec1b41806d8cb61e08abf6227f421531,41689ce8ec1b41806d8cb61e08abf6227f421531::chan...,0,0
465660,wicket,41689ce8ec1b41806d8cb61e08abf6227f421531,41689ce8ec1b41806d8cb61e08abf6227f421531::chan...,0,0
465661,wicket,41689ce8ec1b41806d8cb61e08abf6227f421531,41689ce8ec1b41806d8cb61e08abf6227f421531::chan...,1,1


In [12]:
stats = pd.DataFrame(result,columns=["p","c","rft","len_left","len_right"])

# Every "rft" value has the form "<sha1>::<lower-cased description>" (that is how
# get_relevant_rft builds it), so comparing the column with a bare type name such
# as "move class" never matches and the checks below would pass vacuously on empty
# frames. Match on the description part instead.
# NOTE(review): the prefix match assumes each description starts with the name of
# its refactoring type ("Move Source Folder ..." descriptions appear to) - confirm
# this for the other types against the RefactoringMiner output.
rft_description = stats["rft"].str.split("::", n=1).str[-1]


def _rows_starting_with(*prefixes):
    """Return the rows of ``stats`` whose description starts with one of ``prefixes``."""
    mask = rft_description.map(lambda text: text.startswith(prefixes)).astype(bool)
    return stats[mask]


def _assert_one_to_one(rows, name):
    """Fail if any row has a different number of left and right type declarations."""
    mismatched = rows[rows["len_left"] != rows["len_right"]]
    assert mismatched.shape[0] == 0, "asserting that " + name + " maps 1 to 1"


# 1 to 1 mapping
move_src_fold = _rows_starting_with("move source folder")
_assert_one_to_one(move_src_fold, "move source folder")

move_class = _rows_starting_with("move class")
_assert_one_to_one(move_class, "move class")

rename_class = _rows_starting_with("rename class")
_assert_one_to_one(rename_class, "rename class")

move_and_rename_class = _rows_starting_with("move and rename class")
_assert_one_to_one(move_and_rename_class, "move and rename class")

extract_class = _rows_starting_with("extract class")
_assert_one_to_one(extract_class, "extract class")

rename_package = _rows_starting_with("rename package")
_assert_one_to_one(rename_package, "rename package")

extract_subclass = _rows_starting_with("extract subclass")
_assert_one_to_one(extract_subclass, "extract subclass")

# x to 1 mapping is only extract superclass
extract_superclass = _rows_starting_with("extract superclass")
assert extract_superclass[extract_superclass["len_right"] == 1].shape[0] == extract_superclass.shape[0], "asserting that extract_superclass always maps to 1"


# 1 to 1 mapping for extension
extension = stats[rft_description.str.contains("class extension", regex=False, na=False)]
_assert_one_to_one(extension, "class extension")

# 1 to 1 mapping for implementation
implementation = stats[rft_description.str.contains("class implementation", regex=False, na=False)]
_assert_one_to_one(implementation, "class implementation")


### Cleaning the data produced by RefactoringMiner

In [None]:
def fix_move_source_folder(project, data):
    """Fill in the missing locations of "Move Source Folder" refactorings.

    For every refactoring of type "move source folder" (case-insensitive) whose
    ``leftSideLocations`` list is empty, the commit is diffed against its first
    parent and each ``.java`` path of that diff becomes one left/right location
    pair: the left side is the diff's ``b_path``, the right side is the same path
    with the old folder (text before " to " in the description) replaced by the
    new folder (text after " to ").

    Parameters
    ----------
    project : str
        Name of the repository folder below ``repo/``.
    data : list of dict
        Commit objects with ``sha1`` and ``refactorings``; modified in place.

    Returns
    -------
    list of dict
        The same ``data`` object, after the in-place update.

    Raises
    ------
    ValueError
        If the description does not contain exactly one " to " separator after
        "Move Source Folder".
    """
    repo = git.Repo(os.path.join("repo",project))
    for commit_obj in data:
        for r in commit_obj["refactorings"]:
            if r["type"].lower() != "move source folder" or len(r["leftSideLocations"]) != 0:
                continue

            # The folders depend only on the refactoring, so parse them once
            # instead of once per diff entry.
            replacement_left,replacement_right = r["description"].split("Move Source Folder")[-1].split(" to ")
            old_folder = replacement_left.strip()
            new_folder = replacement_right.strip()

            git_obj = repo.commit(commit_obj["sha1"])
            left = []
            right = []
            for diff in git_obj.diff(git_obj.parents[0]):
                if not diff.b_path.lower().endswith(".java"):
                    continue
                # NOTE(review): every .java path of the diff is kept, including files
                # outside old_folder; for those replace() is a no-op and left == right
                # (a self-loop once turned into a graph edge) - confirm this is intended.
                left.append({"filePath": diff.b_path,"codeElementType": "TYPE_DECLARATION"})
                right.append({"filePath": diff.b_path.replace(old_folder,new_folder),"codeElementType": "TYPE_DECLARATION"})

            r["leftSideLocations"] = left
            r["rightSideLocations"] = right
    return data


def fixing_missing_data():
    """Repair the refactoring history of every project and write it back to disk.

    Each ``<project>_refactoring_history.json`` under ``PATH_RFT`` is loaded, its
    commits are passed through ``fix_move_source_folder``, and the result
    OVERWRITES the same file.

    Returns
    -------
    dict
        Project name -> repaired history (the full JSON document).
    """
    res = {}
    for project in (projects):
        history_path = os.path.join(PATH_RFT,project,project+"_refactoring_history.json")
        print("build project: ",project , " @ " ,history_path)
        # Close the read handle before the same path is reopened for writing below.
        with open(history_path) as infile:
            data = json.load(infile)
        commits_data = data["commits"]
        print("fixing .. move source folder")
        commits_data = fix_move_source_folder(project,commits_data)
        print("          completed...")
        data["commits"] =commits_data
        with open(history_path, "w") as outfile:
            json.dump(data, outfile)
        res[project] = data
    return res

# d = fixing_missing_data() #Used it only once to fix the errors

### Build the evolution graph

In [22]:
   
import networkx as nx
import sys
    
# Build one evolution graph per project.
# Note: `result` is rebound here from the list of mapping rows (verification cells
# above) to a dict {project name: MultiDiGraph}; the cells below expect the dict.
result = {}
for project in tqdm(projects):

    print("build project: ",project)

    history_path = os.path.join(PATH_RFT,project,project+"_refactoring_history.json")
    # Context manager so the file handle does not stay open for the rest of the session.
    with open(history_path) as history_file:
        data = json.load(history_file)["commits"]
    # Feed the commits in the opposite order of the file
    # (presumably oldest first - TODO confirm the file's ordering).
    data.reverse()

    G = nx.MultiDiGraph()
    for commit_obj in data:
        _,G = get_relevant_rft(project,commit_obj["refactorings"],commit_obj["sha1"],G)
    result[project] = G


  0%|                                                                                                                                                                                      | 0/13 [00:00<?, ?it/s]

build project:  cayenne


  8%|█████████████▍                                                                                                                                                                | 1/13 [00:01<00:13,  1.13s/it]

build project:  commons-collections


 15%|██████████████████████████▊                                                                                                                                                   | 2/13 [00:01<00:09,  1.15it/s]

build project:  cucumber-jvm


 23%|████████████████████████████████████████▏                                                                                                                                     | 3/13 [00:02<00:06,  1.63it/s]

build project:  cxf


 31%|█████████████████████████████████████████████████████▌                                                                                                                        | 4/13 [00:03<00:06,  1.30it/s]

build project:  httpcomponents-client


 38%|██████████████████████████████████████████████████████████████████▉                                                                                                           | 5/13 [00:03<00:05,  1.49it/s]

build project:  iotdb


 46%|████████████████████████████████████████████████████████████████████████████████▎                                                                                             | 6/13 [00:04<00:05,  1.36it/s]

build project:  jclouds


 54%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 7/13 [00:06<00:07,  1.19s/it]

build project:  kylin


 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                   | 8/13 [00:07<00:04,  1.03it/s]

build project:  maven


 69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                     | 9/13 [00:07<00:03,  1.16it/s]

build project:  opennlp
build project:  ranger


 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 12/13 [00:08<00:00,  2.27it/s]

build project:  ratis
build project:  wicket


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:09<00:00,  1.35it/s]


### Testing whether evolution graph creates correct graphs

In [23]:
from collections import Counter

# Sanity check on a single project: the number of class extension / implementation
# refactorings in the raw history must equal the number of such edges in the graph,
# both over the whole graph and summed over its weakly connected components.
checked_project = "commons-collections"


def _is_inheritance_text(text):
    """True if the lower-cased text mentions a class extension or implementation."""
    lowered = text.lower()
    return "class extension" in lowered or "class implementation" in lowered


print("build project: ",checked_project)
# Load only the project under test instead of looping over (and skipping) all others.
with open(os.path.join(PATH_RFT,checked_project,checked_project+"_refactoring_history.json")) as history_file:
    data = json.load(history_file)

theoretical = 0
for k in data["commits"]:
    for rft in k["refactorings"]:
        if _is_inheritance_text(rft["type"]):
            theoretical = theoretical + 1

checked_graph = result[checked_project]

# Edge weights are "<sha>::<description>"; the set() in the assertion below
# therefore counts distinct refactorings.
res = [attrs["weight"] for _, _, attrs in checked_graph.edges(data=True)
       if _is_inheritance_text(attrs["weight"])]
df = pd.DataFrame(res,columns=["rft"])

res2 = []
for sg in nx.weakly_connected_components(checked_graph):
    g = checked_graph.subgraph(list(sg))
    res2.extend(attrs["weight"] for _, _, attrs in g.edges(data=True)
                if _is_inheritance_text(attrs["weight"]))

assert (len(set(df["rft"].tolist())) == theoretical == len(res2)), "graph edges do not match the refactoring history"

  0%|                                                                                                                                                                                      | 0/13 [00:00<?, ?it/s]

build project:  commons-collections


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 19.17it/s]


### Saving evolution graph

In [24]:
def get_root(g):
    """Return the label of the root node of component ``g``, or ``False``.

    ``False`` is returned when no edge of ``g`` is a class extension /
    implementation refactoring (see ``has_rft``). Otherwise the root is the first
    node without incoming edges once self-loops are ignored; if every node has an
    incoming edge (or all edges are self-loops), the first node of ``g`` is used.

    Parameters
    ----------
    g : networkx graph
        Component whose nodes carry a ``label`` attribute and whose edges carry
        a ``weight`` attribute.
    """
    # Reuse the shared predicate and skip building the helper graph when the
    # answer is False anyway.
    if not has_rft(g):
        return False

    # Copy of g without self-loops, so a file "moved" onto itself can still be a root.
    H = nx.MultiDiGraph()
    for l,r,w in g.edges(data=True):
        if l != r:
            H.add_node(l,label = g.nodes[l]["label"])
            H.add_node(r,label = g.nodes[r]["label"])
            H.add_edge(l,r,weight=w["weight"])

    root = root_helper(H)
    if not root:
        root = list(g.nodes())[0]
    return g.nodes[root]["label"]
        
def has_rft(g):
    """Tell whether any edge of ``g`` is a class extension / implementation refactoring.

    The check is a case-sensitive substring test on each edge's ``weight`` attribute.
    """
    markers = ("class extension", "class implementation")
    return any(
        marker in attrs["weight"]
        for _, _, attrs in g.edges(data=True)
        for marker in markers
    )
    
def has_self_loop(g):
    """Return True if ``g`` has at least one edge whose source and target are the same node.

    Parameters
    ----------
    g : graph
        Object whose ``edges(data=True)`` yields ``(source, target, attributes)`` triples.
    """
    # The original allocated an unused nx.MultiDiGraph() here; it has been dropped.
    return any(l == r for l, r, _ in g.edges(data=True))

def root_helper(H):
    """Return the first node of ``H`` that has no incoming edge, or ``False`` if none exists."""
    roots = (node for node in H.nodes() if H.in_degree(node) == 0)
    return next(roots, False)



# Export, per project, every connected component of the evolution graph that
# contains a class extension / implementation refactoring. The JSON maps the
# component's root label to its edges:
#   {root label: {edge index: {"commit", "left ==> right", "refactoring"}}}
for project in tqdm(projects):
    tree = {}

    for sg in nx.weakly_connected_components(result[project]):
        g = result[project].subgraph(list(sg))

        # Components without any inheritance refactoring are not exported.
        if not has_rft(g):
            continue

        # The never-read counters and the recount assertion of the original were
        # dropped: the recount applied the same predicate to the same edges, so
        # it could not fail.
        component_edges = {
            index: {"commit": w["commit"],
                    "left ==> right": l + " ==> " + r,
                    "refactoring": w["weight"]}
            for index, (l, r, w) in enumerate(g.edges(data=True))
        }

        root = get_root(g)
        if root in tree:
            # Two components resolved to the same root label: the earlier entry is
            # overwritten below, so say which one it is.
            print("not supposed to be in here", project, root)
        tree[root] = component_edges
    with open(os.path.join(PATH_RFT,project,project+"_refactoring_tree.json"), "w") as outfile:
        json.dump(tree, outfile)

        
        


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  7.38it/s]


## Methodology: Table 2

In [3]:
def check_if_file_test(f):
    """Classify a file path as test code by its file name.

    The last five characters of the base name are cut off (the ".java" suffix for
    Java files); the file counts as a test when the remaining name starts or ends
    with "test", ignoring case.
    """
    stem = f.split("/")[-1][0:-5]
    starts_with_test = stem[0:4].lower() == "test"
    ends_with_test = stem[-4:].lower() == "test"
    return starts_with_test or ends_with_test
    


In [4]:
import re

def get_stats(data):
    """Count class extension / implementation refactorings in test and in source files.

    A refactoring is counted when its lower-cased type matches
    ``".* (extension|implementation)"`` and the file path of its FIRST left-side
    location ends in "java". ``check_if_file_test`` decides whether it goes to the
    test or to the source counter.

    Parameters
    ----------
    data : dict
        Refactoring history with a ``commits`` list; each commit has ``refactorings``.

    Returns
    -------
    (acc_test, acc_source) : tuple of dict
        Two independent counters keyed by the six refactoring type names.

    Raises
    ------
    KeyError
        If a matching refactoring has a type that is not one of the six keys.
    """
    rft_types = ('Add Class Extension',
                 'Remove Class Extension',
                 'Replace Class Extension',
                 'Add Class Implementation',
                 'Remove Class Implementation',
                 'Replace Class Implementation')
    acc_test = dict.fromkeys(rft_types, 0)
    acc_source = dict.fromkeys(rft_types, 0)

    for d in data["commits"]:
        # "or ()" keeps the original tolerance for empty / missing refactoring lists.
        for rft in (d["refactorings"] or ()):
            if not re.search(".* (extension|implementation)", rft["type"].lower()):
                continue
            locations = rft["leftSideLocations"]
            if not locations:
                # No location means no file to classify; skip instead of raising
                # IndexError on locations[0].
                continue
            fname = locations[0]["filePath"]
            if fname.split("/")[-1][-4:].lower() != "java":
                continue
            # only to check test vs prod
            counter = acc_test if check_if_file_test(fname) else acc_source
            counter[rft["type"]] = counter[rft["type"]] + 1

    return acc_test,acc_source



In [5]:
# Per-project counters of extension / implementation refactorings,
# split into test files (result_test) and source files (result_source).
result_test = {}
result_source = {}
for project in tqdm(projects):
    # Context manager so each history file is closed right after parsing.
    with open(os.path.join(PATH_RFT,project,project+"_refactoring_history.json")) as history_file:
        data = json.load(history_file)
    acc_test,acc_source = get_stats(data)
    result_test[project] = acc_test
    result_source[project] = acc_source


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:10<00:00,  1.27it/s]


In [6]:
# This basically outputs the raw table: one row per project and one
# "source count/test count" cell per refactoring type.

table_rows = []
test_tot = []
source_tot = []
for project_name in result_test:
    source_counts = list(result_source[project_name].values())
    test_counts = list(result_test[project_name].values())
    source_tot.append(source_counts)
    test_tot.append(test_counts)
    cells = [str(src)+"/" +str(tst) for src,tst in zip(source_counts,test_counts)]
    table_rows.append([project_name]+cells)

table_columns = ["Project","Add Ext.","Remove Ext.","Replace Ext.","Add Impl.","Remove Impl.","Replace Impl."]
latex_df = (
    pd.DataFrame(table_rows,columns=table_columns)
    .to_latex(index=False)
    .replace("\\textbackslash","")
)
print(latex_df)

\begin{tabular}{lllllll}
\toprule
               Project & Add Ext. & Remove Ext. & Replace Ext. & Add Impl. & Remove Impl. & Replace Impl. \\
\midrule
               cayenne &    71/13 &      49/228 &      658/276 &     193/0 &        122/0 &         164/1 \\
   commons-collections &     37/6 &       16/23 &      332/449 &     133/6 &        114/3 &         245/7 \\
          cucumber-jvm &    39/12 &       26/11 &         83/8 &    104/12 &        76/16 &        114/10 \\
                   cxf &   159/39 &      62/601 &      347/126 &     236/6 &       166/17 &        165/11 \\
 httpcomponents-client &     29/6 &       23/62 &        76/48 &     104/6 &         61/3 &          96/6 \\
                 iotdb &    46/10 &        25/6 &       265/17 &     115/2 &         77/3 &         105/7 \\
               jclouds &  222/118 &      157/33 &    1919/1473 &    345/15 &        274/6 &        737/10 \\
                 kylin &    46/35 &        18/5 &       192/44 &     199/1 &         