---

---

# Load versions from files and extract data from their commits

---

---
## File packages

### pip installed packages

## File packages

### pip installed packages

In [3]:
import os
import git
import json

from configparser import ConfigParser
from datetime import datetime

 ### Local packages


In [4]:
import src.versions as my_versions

---

## Constants loading


In [5]:
# Read the notebook settings from config.ini (path relative to the working directory).
config: ConfigParser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so check it to fail with a clear message instead of an
# opaque KeyError on the first section lookup below.
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini could not be read from the current working directory")

data_directory: str = config["GENERAL"]["DataDirectory"]
hive_git_directory: str = config["GIT"]["HiveGitDirectory"]
hive_git_repo_Name: str = config["GIT"]["HiveGitRepoName"]

# Path of the local repository: <data directory>/<git directory>/<repository name>.
hive_git_path: str = os.path.join(data_directory, hive_git_directory, hive_git_repo_Name)

# Handle on the local git repository, used by the cells below.
repo: git.Repo = git.Repo(hive_git_path)

---

## Extracting git tags of minor and major versions

Alpha, beta and patch releases are excluded; only the tags of major and minor versions are kept.

In [6]:
# Ask the project helper for the version tags of the repository.
tags = my_versions.get_versions_tags(repo)
tags  # last expression of the cell: displayed as its output

[<git.TagReference "refs/tags/rel/release-2.1.0">,
 <git.TagReference "refs/tags/rel/release-2.2.0">,
 <git.TagReference "refs/tags/rel/release-2.3.0">,
 <git.TagReference "refs/tags/rel/release-3.0.0">,
 <git.TagReference "refs/tags/rel/release-3.1.0">,
 <git.TagReference "refs/tags/rel/release-4.0.0">,
 <git.TagReference "refs/tags/release-2.0.0">]

In [7]:
# Map each version number (the text after the last "-" of the tag name, e.g. "2.1.0")
# to the full tag name (e.g. "rel/release-2.1.0"), in the order returned by
# my_versions.order_versions. Both keys and values are plain strings.
filtered_versions: dict = {
    tag.name.split("-")[-1]: tag.name for tag in my_versions.order_versions(tags)
}
filtered_versions

{'2.0.0': 'release-2.0.0',
 '2.1.0': 'rel/release-2.1.0',
 '2.2.0': 'rel/release-2.2.0',
 '2.3.0': 'rel/release-2.3.0',
 '3.0.0': 'rel/release-3.0.0',
 '3.1.0': 'rel/release-3.1.0',
 '4.0.0': 'rel/release-4.0.0'}

In [8]:
# Parent of each version: the version whose history is walked next when following
# a version's ancestors. None marks the root of the chain.
version_order = dict(
    [
        ("2.0.0", None),
        ("2.1.0", "2.0.0"),
        ("2.2.0", "2.0.0"),
        ("2.3.0", "2.0.0"),
        ("3.0.0", "2.0.0"),
        ("3.1.0", "3.0.0"),
        ("4.0.0", "3.0.0"),
    ]
)

---

---

# Data processing using Pydriller


---

---

## Constants loading

In [9]:
# Full build is disabled here; the per-version commit data is read from JSON files further down.
# my_versions.build_versions_commits(repo, filtered_versions)

---

## Extracting commits for each version

In [10]:
# Read the per-version commit data from "<version>.json" files instead of rebuilding it.
print("Skipping full versions build")
versions_build_directory = "data/versions_build"
versions_metrics = {}
for version_key in filtered_versions:
    print(f"Building version {version_key}")
    json_path = os.path.join(versions_build_directory, f"{version_key}.json")
    with open(json_path, "r") as json_file:
        versions_metrics[version_key] = json.load(json_file)

Skipping full versions build
Building version 2.0.0
Building version 2.1.0
Building version 2.2.0
Building version 2.3.0
Building version 3.0.0
Building version 3.1.0
Building version 4.0.0


In [11]:
# Substrings searched in lower-cased commit messages to count a commit as bug-related.
bug_keywords = [
    "bug",
    "fix",
    "error",
    "mistake",
    "fault",
    "flaw",
    "defect",
    "patch",
    "repair",
    "resolve",
    "correct",
]

In [13]:
# Aggregate commit data per version:
#   version_file_dict[version][filename] -> per-file counters, developers and dates
#   version_dev_dict[version][email]     -> number of commits of that developer
# Only commits that do not already appear in an ancestor version are counted.
version_file_dict: dict = {}
version_dev_dict: dict = {}
for version in filtered_versions:
    # Commit tables of every ancestor, found by following version_order up to the
    # root. Each step must read the ancestor currently being visited, otherwise
    # only the direct parent would ever be collected.
    previous_versions = []
    previous_version = version_order[version]
    while previous_version is not None:
        previous_versions.append(versions_metrics[previous_version])
        previous_version = version_order[previous_version]

    files_dict = {}
    version_dev_dict[version] = {}
    for commit_hash, commit in versions_metrics[version].items():
        # Skip commits already present in one of the ancestor versions.
        if any(commit_hash in ancestor_commits for ancestor_commits in previous_versions):
            continue

        dev = commit["email"]
        version_dev_dict[version][dev] = version_dev_dict[version].get(dev, 0) + 1

        # The bug flag depends on the commit message only, so compute it once per commit.
        is_bug_commit = any(keyword in commit["msg"].lower() for keyword in bug_keywords)

        for file in commit["modified_files"]:
            filename = file["filename"]
            if filename not in files_dict:
                files_dict[filename] = {
                    "commit_count": 0,
                    "commit_count_changed_comment": 0,
                    "commit_count_unchanged_comment": 0,
                    "commit_count_bug": 0,
                    "added_lines": 0,
                    "deleted_lines": 0,
                    "devs": set(),
                    "modification_dates": [],
                }
            file_metrics = files_dict[filename]

            file_metrics["commit_count"] += 1
            file_metrics["added_lines"] += file["added_lines"]
            file_metrics["deleted_lines"] += file["deleted_lines"]
            # A commit counts as "changed comment" when it added or deleted any comment.
            if file["comments_changed"]["added"] != 0 or file["comments_changed"]["deleted"] != 0:
                file_metrics["commit_count_changed_comment"] += 1
            else:
                file_metrics["commit_count_unchanged_comment"] += 1
            if is_bug_commit:
                file_metrics["commit_count_bug"] += 1
            file_metrics["devs"].add(dev)
            file_metrics["modification_dates"].append(commit["author_date"])
    version_file_dict[version] = files_dict


### Dev XP

In [14]:
# For every version, total the commits each developer made in all ancestor versions
# (the version itself is not included).
dev_count_commit_until: dict = {}
for version in version_dev_dict:
    experience = {}
    ancestor = version_order[version]
    while ancestor:
        for dev, commit_total in version_dev_dict[ancestor].items():
            experience[dev] = experience.get(dev, 0) + commit_total
        ancestor = version_order[ancestor]
    dev_count_commit_until[version] = experience

In [15]:
# Add developer statistics to every file entry:
#   dev_count          -> number of distinct developers of the file in this version
#   modification_dates -> the same dates, sorted
#   mean_dev_xp        -> mean experience of those developers
#   min_dev_xp         -> experience of the least experienced of those developers
# Experience is the commit count from dev_count_commit_until (0 when the developer
# is absent from it).
for version in version_file_dict:
    xp_before_version = dev_count_commit_until[version]
    for file, file_metrics in version_file_dict[version].items():
        devs = file_metrics["devs"]
        file_metrics["dev_count"] = len(devs)
        file_metrics["modification_dates"] = sorted(file_metrics["modification_dates"])

        dev_xps = [xp_before_version.get(dev, 0) for dev in devs]
        file_metrics["mean_dev_xp"] = sum(dev_xps) / len(dev_xps) if dev_xps else 0
        # Take the minimum over the real values: seeding it with 0 would pin the
        # result to 0 whatever the developers' experience is.
        file_metrics["min_dev_xp"] = min(dev_xps) if dev_xps else 0


### Commit and dev recursive

In [16]:
# For every file, add the commit counts and developers recorded for the same file
# in all ancestor versions to those of the current version.
for version, files_of_version in version_file_dict.items():
    for file, file_metrics in files_of_version.items():
        total_commits = file_metrics["commit_count"]
        all_devs = file_metrics["devs"]

        ancestor = version_order[version]
        while ancestor:
            ancestor_metrics = version_file_dict[ancestor].get(file)
            if ancestor_metrics is not None:
                total_commits += ancestor_metrics["commit_count"]
                all_devs = all_devs | ancestor_metrics["devs"]
            ancestor = version_order[ancestor]

        file_metrics["commit_recursive"] = total_commits
        file_metrics["devs_recursive"] = all_devs


### Mean time between commits

In [17]:
# Mean number of seconds between consecutive commits of each file within a version
# (0 when the file has fewer than two commits).
for version in version_file_dict:
    for file in version_file_dict[version]:
        # Modification dates are strings such as "2008-09-02 23:58:59"; parse them
        # and sort so that consecutive entries are consecutive commits.
        commit_times = sorted(
            datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
            for date in version_file_dict[version][file]["modification_dates"]
        )
        # total_seconds() covers the whole interval; the .seconds attribute only
        # holds the part below one day and would drop every full day of the gap.
        gaps = [
            (later - earlier).total_seconds()
            for earlier, later in zip(commit_times, commit_times[1:])
        ]
        version_file_dict[version][file]["mean_time_between_commits"] = sum(gaps) / len(gaps) if gaps else 0

### Mean time between commits recursive

In [18]:
# Mean number of seconds between consecutive commits of each file, computed over the
# commits of the version AND of all its ancestor versions (0 when fewer than two
# commits are found).
for version in version_file_dict:
    for file in version_file_dict[version]:
        # Gather the dates of the file along the whole ancestor chain.
        history_dates = []
        studied_version = version
        while studied_version:
            if file in version_file_dict[studied_version]:
                history_dates.extend(version_file_dict[studied_version][file]["modification_dates"])
            studied_version = version_order[studied_version]

        # Modification dates are strings such as "2008-09-02 23:58:59"; parse them
        # and sort so that consecutive entries are consecutive commits.
        commit_times = sorted(
            datetime.strptime(date, "%Y-%m-%d %H:%M:%S") for date in history_dates
        )
        # total_seconds() covers the whole interval; the .seconds attribute only
        # holds the part below one day and would drop every full day of the gap.
        gaps = [
            (later - earlier).total_seconds()
            for earlier, later in zip(commit_times, commit_times[1:])
        ]
        version_file_dict[version][file]["mean_time_between_commits_recursive"] = (
            sum(gaps) / len(gaps) if gaps else 0
        )


---

## Load understand data into one DataFrame per version

In [19]:
import pandas as pd

# Directory holding the merged metrics, one CSV file per version.
merged_metrics_directory = os.path.join(data_directory, config["OUTPUT"]["MergedMetricsOutputDirectory"])

# Key: text before the first "_" of the file name; value: the CSV loaded as a DataFrame.
understand_df = {}
for csv_name in os.listdir(merged_metrics_directory):
    if not csv_name.endswith(".csv"):
        continue
    version_label = csv_name.split("_")[0]
    understand_df[version_label] = pd.read_csv(os.path.join(merged_metrics_directory, csv_name))

print(understand_df.keys())

dict_keys(['2.0.0', '2.1.0', '2.2.0', '2.3.0', '3.0.0', '3.1.0', '4.0.0'])


In [20]:
# Add the commit-based metrics to each version's DataFrame and write the result to
# data/full_metrics/<version>_full_metrics.csv.
full_metrics_directory = "data/full_metrics"
if not os.path.exists(full_metrics_directory):
    os.makedirs(full_metrics_directory)

# Output column -> how to read its value from a file's entry in version_file_dict.
# The list order is the order in which the columns are added to the DataFrame.
metric_columns = [
    ("commit_count", lambda metrics: metrics["commit_count"]),
    ("commit_count_r", lambda metrics: metrics["commit_recursive"]),
    ("commit_count_changed_comment", lambda metrics: metrics["commit_count_changed_comment"]),
    ("commit_count_unchanged_comment", lambda metrics: metrics["commit_count_unchanged_comment"]),
    ("commit_count_bug", lambda metrics: metrics["commit_count_bug"]),
    ("added_lines", lambda metrics: metrics["added_lines"]),
    ("deleted_lines", lambda metrics: metrics["deleted_lines"]),
    ("count_dev", lambda metrics: metrics["dev_count"]),
    ("count_dev_r", lambda metrics: len(metrics["devs_recursive"])),
    ("mean_dev_xp", lambda metrics: int(metrics["mean_dev_xp"])),
    ("min_dev_xp", lambda metrics: int(metrics["min_dev_xp"])),
    ("mean_time", lambda metrics: int(metrics["mean_time_between_commits"])),
    ("mean_time_r", lambda metrics: int(metrics["mean_time_between_commits_recursive"])),
]

for version, df in understand_df.items():
    # Every metric defaults to 0 for files without commit data in this version.
    for column, _ in metric_columns:
        df[column] = 0

    for index, filename in df["Name"].items():
        file_metrics = version_file_dict[version].get(filename)
        if file_metrics is None:
            continue
        for column, read_value in metric_columns:
            df.at[index, column] = read_value(file_metrics)

    df.to_csv(f"{full_metrics_directory}/{version}_full_metrics.csv", index=False)
