In [1]:
import pandas as pd
import numpy as np
from utils import mongo_helper
import requests
from assets.web_config import headers, payload
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pull every document from the "data" collection into a DataFrame.
mongo = mongo_helper.MongoHelper()
data_collection = mongo.db["data"]
all_issues = data_collection.find()
issues_df = pd.DataFrame(all_issues)
issues_df

In [None]:
from tqdm import tqdm

def get_change_info(df, timeout=30):
    """Fetch commit/line/file change counts for one pull request from the GitHub API.

    Parameters
    ----------
    df : mapping-like row
        One row (e.g. the Series handed over by ``DataFrame.apply(axis=1)``)
        exposing a ``'url'`` entry with the pull request's web URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the whole apply.

    Returns
    -------
    tuple
        ``(commits, additions, deletions, changed_files)`` taken from the API
        response, or four ``np.nan`` values when the request fails or the
        response carries no ``"changed_files"`` field.
    """
    not_available = (np.nan, np.nan, np.nan, np.nan)

    # Rewrite only the host and the last "/pull/" path segment. A blanket
    # replace("pull", "pulls") would also mangle owner or repository names
    # that merely contain the substring "pull".
    api_url = str(df['url']).replace("github.com", "api.github.com/repos", 1)
    prefix, pull_segment, suffix = api_url.rpartition("/pull/")
    if pull_segment:
        api_url = prefix + "/pulls/" + suffix

    try:
        response_json = requests.request(
            "GET", api_url, headers=headers, data=payload, timeout=timeout
        ).json()
    except (requests.RequestException, ValueError) as error:
        # One failed lookup should not abort the apply over all rows.
        print(f"request failed for {api_url}: {error}")
        return not_available

    if isinstance(response_json, dict) and "changed_files" in response_json:
        return (
            response_json['commits'],
            response_json['additions'],
            response_json['deletions'],
            response_json['changed_files'],
        )

    print("no changed files found")
    return not_available

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# One API lookup per row; the four returned values are expanded into four new columns.
change_columns = ["commit_num", "addition_lines_num", "deletion_lines_num", "changed_file_num"]
issues_df[change_columns] = issues_df.progress_apply(get_change_info, axis=1, result_type="expand")

# Take an explicit copy: later cells add columns to simplify_issues_df with .loc,
# and assigning into a bare column subset triggers SettingWithCopyWarning.
simplify_issues_df = issues_df[["tag"] + change_columns].copy()
simplify_issues_df

In [None]:
def get_total_changed_line_num(df):
    """Return the row's added plus deleted line counts.

    ``df`` is one row exposing ``"addition_lines_num"`` and
    ``"deletion_lines_num"``.
    """
    added = df["addition_lines_num"]
    deleted = df["deletion_lines_num"]
    return added + deleted

# Row-wise total of added and deleted lines, stored as a new column.
total_changed_lines = simplify_issues_df.progress_apply(get_total_changed_line_num, axis=1)
simplify_issues_df.loc[:, "changed_lines_num"] = total_changed_lines
simplify_issues_df

In [None]:
# Largest changed-file count observed for each tag.
max_changed_files_by_tag = simplify_issues_df.groupby('tag')['changed_file_num'].max()
print(max_changed_files_by_tag)

In [None]:
def get_avg_changed_line_num_per_file(df):
    """Return changed lines divided by changed files for one row.

    ``df`` is one row exposing ``"changed_lines_num"`` and
    ``"changed_file_num"``.

    Returns the quotient, or ``np.nan`` when the division raises
    ``ZeroDivisionError`` (plain Python numbers with a zero file count), so a
    single such row does not abort a row-wise apply.
    """
    changed_lines = df["changed_lines_num"]
    changed_files = df["changed_file_num"]
    try:
        return changed_lines / changed_files
    except ZeroDivisionError:
        # No files changed: the per-file average is undefined.
        return np.nan

# Row-wise changed lines per changed file, stored as a new column.
avg_lines_per_file = simplify_issues_df.progress_apply(get_avg_changed_line_num_per_file, axis=1)
simplify_issues_df.loc[:, "avg_changed_lines_per_file"] = avg_lines_per_file
simplify_issues_df

In [None]:
# Keep only tags that occur on more than one row, then group those rows by tag.
multi_row_tags_df = simplify_issues_df.groupby('tag').filter(lambda tag_rows: len(tag_rows) > 1)
grouped_df = multi_row_tags_df.groupby('tag')
grouped_df

In [None]:
# changed_lines_num: one (tag, column values) pair per group.
changed_lines_data = [
    (name, group['changed_lines_num'])
    for name, group in grouped_df
]
changed_lines_data

In [None]:
def get_median(ele):
    """Sort key: the median of the values stored at index 1 of ``ele``."""
    values = ele[1]
    return values.median()
# Order the (tag, values) pairs by ascending median.
changed_lines_data = sorted(changed_lines_data, key=get_median)
changed_lines_data

In [None]:
# labels = changed_lines_data.apply(lambda x: str(x[0]).replace("_"," "), axis=1)
# labels=[x[0] for x in changed_lines_data]

# Turn each tag into a readable axis label, echoing it as it is produced.
label_list = []
for entry in changed_lines_data:
    readable = str(entry[0]).replace("_", " ")
    label = "leak on exception" if readable == "exception" else readable
    print(label)
    label_list.append(label)

In [None]:

# Horizontal box plot of changed lines per tag, with outliers hidden.
# NaN entries (rows for which get_change_info returned no numbers) are dropped
# first: matplotlib's box statistics propagate NaN, so a single missing value
# would otherwise leave that tag's box empty.
# An explicit figure keeps this plot from drawing onto a figure left over from
# another cell.
fig, ax = plt.subplots()
ax.boxplot([values.dropna() for _, values in changed_lines_data],
           labels=label_list,
           vert=False, showfliers=False)
# ax.set_xlabel("Changed LoC")
# ax.set_ylabel("Type")
fig.savefig("../files/pics/type-changed_lines-boxplot.svg", dpi=300, format="svg", bbox_inches='tight')

In [None]:
def get_labels(datas):
    """Build display labels from the first element of each item in ``datas``.

    Underscores become spaces, and the resulting label ``"exception"`` is
    shown as ``"leak on exception"``. Returns a list in input order.
    """
    def _readable(tag):
        text = str(tag).replace("_", " ")
        return "leak on exception" if text == "exception" else text

    return [_readable(item[0]) for item in datas]
    

# avg_changed_lines_per_file_num: one (tag, column values) pair per group,
# ordered by ascending median.
avg_changed_lines_per_file_data = [
    (name, group['avg_changed_lines_per_file'])
    for name, group in grouped_df
]
avg_changed_lines_per_file_data.sort(key=get_median)

# NaN entries are dropped before plotting: matplotlib's box statistics
# propagate NaN, so a single missing value would leave that tag's box empty.
# An explicit figure keeps this plot from drawing onto a figure left over from
# another cell.
fig, ax = plt.subplots()
ax.boxplot([values.dropna() for _, values in avg_changed_lines_per_file_data],
           labels=get_labels(avg_changed_lines_per_file_data),
           vert=False, showfliers=False)
# ax.set_xlabel("Average Changed Lines per File")
# ax.set_ylabel("Type")
fig.savefig("../files/pics/type-avg_changed_lines_per_file-boxplot.svg", dpi=300, format="svg", bbox_inches='tight')

In [None]:
# change_file_num: one (tag, column values) pair per group,
# ordered by ascending median.
change_file_data = [
    (name, group['changed_file_num'])
    for name, group in grouped_df
]
change_file_data.sort(key=get_median)

# NaN entries are dropped before plotting: matplotlib's box statistics (and the
# mean marker enabled here) propagate NaN, so a single missing value would
# leave that tag's box empty.
# An explicit figure keeps this plot from drawing onto a figure left over from
# another cell.
fig, ax = plt.subplots()
ax.boxplot([values.dropna() for _, values in change_file_data],
           labels=get_labels(change_file_data),
           vert=False, showmeans=True,
           showfliers=False)
# ax.set_xlabel("Changed File(s)")
# ax.set_ylabel("Type")
fig.savefig("../files/pics/type-Changed_file_num.svg", dpi=300, format="svg", bbox_inches='tight')

In [None]:
# Largest changed-file count observed for each tag.
max_changed_files_by_tag = simplify_issues_df.groupby('tag')['changed_file_num'].max()
print(max_changed_files_by_tag)

In [None]:
import utils.time_util as tu

def applied_delta(df):
    """Return ``tu.get_time_delta`` between the row's ``'time'`` and ``'fixtime'``.

    Both values are first converted with ``tu.to_time_stamp``. The unit of the
    result is whatever ``tu.get_time_delta`` yields; the caller stores it as
    ``day_of_fix``, so presumably days (TODO confirm against utils.time_util).
    """
    start_stamp = tu.to_time_stamp(df['time'])
    fix_stamp = tu.to_time_stamp(df['fixtime'])
    return tu.get_time_delta(start_stamp, fix_stamp)

# Compute the per-row time delta, then plot it against the changed-file count.
fix_delta = issues_df.progress_apply(applied_delta, axis=1)
issues_df.loc[:, 'day_of_fix'] = fix_delta
time_changed_df = issues_df[["day_of_fix", "changed_file_num"]]

plt.scatter(x='day_of_fix', y='changed_file_num', s=20, data=time_changed_df)
plt.show()