# MGL869 - Lab

*MGL869 ETS Montreal - Production engineering*

## Abstract

## Authors
- **Léo FORNOFF**
- **William PHAN**
- **Yannis OUAKRIM**

## Configuration

In [3]:
import pandas as pd
import os
import git
import re
import shutil
import configparser
import requests
import numpy as np

from datetime import datetime
from time import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from subprocess import Popen, PIPE

from hiveDL import hiveDL

In [4]:
config = configparser.ConfigParser()
config.read("config.ini")

required_sections : [str] = [
    "GENERAL",
    "GIT",
    "JIRA",
    "UNDERSTAND",
    "OUTPUT",
    "JUPYTER"]

for section in required_sections:
    assert section in config, f"Section {section} is missing in the configuration file"

In [6]:
def testConfig(t_section: str, keys : [str]):
    assert t_section in config          , f"Section {t_section} is missing in the configuration file"
    for key in keys:
        assert key in config[t_section] , f"Key {key} is missing in the configuration file"
        assert config[t_section][key]   , f"Key {key} is empty in the configuration file"

## Part 1 : Data collection

### 1.1 - Download Jira data

#### 1.1.1 - Check configuration to run the section

In [4]:
section         : str   = "JIRA"
required_keys   : [str] = [
    "BaseUrl",
    "SearchComplement",
    "Query",
    "JiraCSVDirectory",
    "QueryEachRun",
    "JiraCombinedCSV"]

testConfig(section, required_keys)

base_url            : str   = config[section]["BaseUrl"]
search_complement   : str   = config[section]["SearchComplement"]
query               : str   = config[section]["Query"]
jira_csv_directory  : str   = config[section]["JiraCSVDirectory"]
query_each_run      : str   = config[section]["QueryEachRun"]
jira_combined_csv   : str   = config[section]["JiraCombinedCSV"]

command_file                = os.path.join(jira_csv_directory, "command.txt")
combined_csv_path           = os.path.join(jira_csv_directory, jira_combined_csv)

#### 1.1.2 - Download Jira data if needed

In [5]:
# Check if we need to download the data each time
data_exists : bool  = query_each_run == "No"
# Check if at least one .csv file exists in the directory
csv_files   : [str] = [f for f in os.listdir(jira_csv_directory) if f.endswith(".csv")]

# Check if the directory exists
if not os.path.exists(jira_csv_directory):
    os.makedirs(jira_csv_directory)
    data_exists = False
    
# Check if there is a command.txt file in the directory
if not os.path.exists(command_file):
    data_exists = False
else:
    with open(command_file, "r") as f:
        if f.read() != query:
            data_exists = False
        
data_exists = data_exists and len(csv_files) > 0

if not data_exists:
    temp_max    : int   = 1000
    start       : int   = 0
    try:
        print("Downloading Jira data with pagination")
        hiveDL(
            command_file,
            jira_csv_directory,
            combined_csv_path,
            base_url,
            search_complement,
            query,
            temp_max = temp_max,
            start = start
        )
    except requests.exceptions.RequestException as err:
        print(f"Error during data fetching: {err}")
        raise SystemExit(err)
else:
    print("Data already exists")

Data already exists


### 1.2 - Clean Jira data using pandas
#### 1.2.1 - Load the data

In [6]:
jira_dataframe = pd.read_csv(combined_csv_path, low_memory=False)

#### 1.2.2 - Keep only the relevant columns

In [7]:
keep: [str] = [
    'Issue key',
    'Status', 
    'Resolution', 
    'Created', 
    'Fix Versions Combined', 
    'Affects Versions Combined']

In [8]:
affects_version_columns : [str] = [col for col in jira_dataframe.columns if col.startswith('Affects Version/s')]
fix_version_columns     : [str] = [col for col in jira_dataframe.columns if col.startswith('Fix Version/s')]

# Combine the versions into a single column
jira_dataframe['Fix Versions Combined']     = jira_dataframe[fix_version_columns].apply(
                                                    lambda x: ', '.join(x.dropna().astype(str)), axis=1
                                                )
jira_dataframe['Affects Versions Combined'] = jira_dataframe[affects_version_columns].apply(
                                                    lambda x: ', '.join(x.dropna().astype(str)),  axis=1
                                                )

jira_dataframe                              = jira_dataframe.loc[:, keep]

#### 1.2.3 - Extract ids

In [9]:
# Identify columns whose names contain the string 'Issue key'
issue_key_columns       : pd.Index      = jira_dataframe.columns[jira_dataframe.columns.str.contains('Issue key')]
# Extract the values from these columns as a NumPy array
issue_key_values        : np.ndarray    = jira_dataframe[issue_key_columns].values
# Flatten the array to create a one-dimensional list of all 'Issue key' values
flattened_issue_keys    : np.ndarray    = issue_key_values.flatten()
# Convert the list into a set to remove duplicates
unique_issue_keys       : set           = set(flattened_issue_keys)
# The result is a set of unique 'Issue key' values
ids                     : set           = unique_issue_keys

issue_key_columns

Index(['Issue key'], dtype='object')

## Part 2 : Repository analysis
### 2.1 - Clone repository
#### 2.1.1 - Check configuration to run the section

In [9]:
section         : str   = "GIT"
required_keys   : [str] = [
    "HiveGitDirectory",
    "HiveGitRepoName", 
    "HiveGitUrl", 
    "HiveGitAlwaysClone", 
    "HiveGitAlwaysPull"]

testConfig(section, required_keys)

hive_git_dir            : str   = config[section]["HiveGitDirectory"]
hive_git_repo_name      : str   = config[section]["HiveGitRepoName"]
hive_git_url            : str   = config[section]["HiveGitUrl"]
hive_git_always_clone   : str   = config[section]["HiveGitAlwaysClone"]
hive_git_always_pull    : str   = config[section]["HiveGitAlwaysPull"]

hive_git_repo_dir       : str   = os.path.join(hive_git_dir, hive_git_repo_name)

#### 2.1.2 - Clone repository if needed

In [11]:
b_clone: bool = hive_git_always_clone == "Yes"

# Check if HiveGitDirectory exists
if not os.path.exists(hive_git_dir):
    os.makedirs(hive_git_dir)
    b_clone = True

# Check if HiveGitRepoName exists
if not os.path.exists(hive_git_repo_dir):
    b_clone = True
    
if b_clone:
    print("Cloning the repository")
    git.Repo.clone_from(hive_git_url, hive_git_repo_dir)
    print("Repository cloned")
else:
    print("Repository already cloned")
    if hive_git_always_pull == "Yes":
        try :
            print("Checking for updates : Pulling the repository")
            repo = git.Repo(hive_git_repo_dir)
            repo.remotes.origin.pull()
            print("Repository up to date")
        except GitCommandError as GT: 
            print(GT) 
        

Repository already cloned
Checking for updates : Pulling the repository


GitCommandError: Cmd('git') failed due to: exit code(1)
  cmdline: git pull -v -- origin

### 2.2 - Extract commits
#### 2.2.1 - Check configuration to run the section

In [8]:
section        : str   = "GENERAL"
required_keys  : [str] = ["MaxThreads"]

testConfig(section, required_keys)

section         : str   = "GIT"
required_keys  : [str] = [
    "HiveGitDirectory",
    "HiveGitRepoName",
    "HiveGitUrl",
    "HiveGitAlwaysClone",
    "HiveGitAlwaysPull",
    "CommitPattern"]

testConfig(section, required_keys)

hive_git_directory      : str               = config["GIT"]["HiveGitDirectory"]
hive_git_repo_name      : str               = config["GIT"]["HiveGitRepoName"]
hive_git_url            : str               = config["GIT"]["HiveGitUrl"]
hive_git_always_clone   : str               = config["GIT"]["HiveGitAlwaysClone"]
hive_git_always_pull    : str               = config["GIT"]["HiveGitAlwaysPull"]
commit_pattern          : re.Pattern        = re.compile(config["GIT"]["CommitPattern"])
max_threads             : int               = int(config["GENERAL"]["MaxThreads"])

# Get the number of threads
num_threads             : int               = min(max_threads, os.cpu_count())
# Get the repository directory
hive_git_repo_dir       : str               = os.path.join(hive_git_dir, hive_git_repo_name)
# Load the repository in memory
repo                    : git.Repo          = git.Repo(hive_git_repo_dir)
# List to store the couples (issue, file, commit)
all_couples             : [(str, str, str)] = []
# Split the commits into chunks
chunk_size              : int               = len(list(repo.iter_commits())) // num_threads
# Get all commits and files
all_commits             : [dict]            = [{} for _ in range(num_threads)]

NameError: name 'hive_git_dir' is not defined

#### 2.2.2 - Extract commits

##### Function to extract commits

In [13]:
# Function to process a batch of commits
def process_commits(commits):
    # Load the repository in memory of the current thread
    local_repo = git.Repo(hive_git_repo_dir) 
    
    tuple_key_file_commit = []
    for commit_id in commits:
        for match in commits[commit_id]:
            hive_key = f'HIVE-{match}'
            if hive_key in ids:
                for file in local_repo.commit(commit_id).stats.files:
                    tuple_key_file_commit.append((hive_key, file, commit_id))
    return tuple_key_file_commit

##### Prepare multithreading to extract commits

In [14]:
for i, commit in enumerate(repo.iter_commits()):
    matches = commit_pattern.findall(commit.message)
    if matches:
        all_commits[i // chunk_size][commit.hexsha] = matches

##### Extract commits using multithreading

In [15]:
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_commits, chunk) for chunk in all_commits]
    for future in as_completed(futures):
        couples = future.result()
        all_couples.extend(couples)

print(f"{len(all_couples)} couples found.")

20493 couples found.


### 2.3 - Filter data
#### 2.3.1 - Create a DataFrame

In [16]:
commit_dataframe : pd.DataFrame = pd.DataFrame(all_couples, columns=["Issue key", "File", "Commit"])

#### 2.3.2 - Keep specific languages only

In [17]:
section         : str   = "GENERAL"
required_keys   : [str] = ["Languages"]

testConfig(section, required_keys)

# Languages without whitespaces
languages                   : [str]         = config[section]["Languages"].split(",")
languages                   : [str]         = [lang.strip() for lang in languages]
commit_dataframe_filtered   : pd.DataFrame  = commit_dataframe[
                                                commit_dataframe['File'].str.endswith(tuple(languages))
                                            ]

### 2.4 - Extract filter versions from git
#### 2.4.1 - Extract versions 

In [10]:
section         : str   = "GIT"
required_keys   : [str] = [
    "HiveGitDirectory",
    "HiveGitRepoName",
    "HiveGitUrl",
    "ReleasesRegex"]

testConfig(section, required_keys)

hive_git_directory  : str           = config["GIT"]["HiveGitDirectory"]
hive_git_repo_name  : str           = config["GIT"]["HiveGitRepoName"]
hive_git_url        : str           = config["GIT"]["HiveGitUrl"]
releases_regex      : [str]         = config["GIT"]["ReleasesRegex"].split(",")

releases_regex      : [str]         = [regex.strip() for regex in releases_regex]
release_regex       : [re.Pattern]  = [re.compile(regex) for regex in releases_regex]

repo                : git.Repo      = git.Repo(hive_git_repo_dir)
tags                                = repo.tags
versions            : dict          = {}

for tag in tags:
    # Get the commit of the tag
    commit = tag.commit
    versions[tag.name] = commit

#### 2.4.2 - Filter versions

In [11]:
filtered_versions : dict = {}
for version in versions:
    for regex in release_regex:
        if regex.match(version):
            version_numbers = version.split("-")[1]
            filtered_versions[version_numbers] = versions[version]

#### 2.4.3 - Dict : Sort version by date in descending order 

In [13]:
sorted_filtered_versions_date = dict(sorted(filtered_versions.items(), 
                                        key=lambda item: item[1].committed_datetime, 
                                        reverse=True))

In [14]:
len(sorted_filtered_versions_date)

46

## Part 3. - Understand analysis

### 3.1 - Set up the configuration and understand project

#### 3.1.1 - Check configuration

In [32]:
section         : str   = "UNDERSTAND"    
required_keys   : [str] = [
    "UnderstandCommand",
    "UnderstandProjectName",
    "UnderstandMetricsFileName"]

testConfig(section, required_keys)

section         : str   = "GIT"
required_keys   : [str] = [
    "HiveGitDirectory",
    "HiveGitRepoName"]

testConfig(section, required_keys)

hive_git_directory              : str   = config['GIT']["HiveGitDirectory"]
hive_repo_name                  : str   = config['GIT']["HiveGitRepoName"]
understand_project_name         : str   = config["UNDERSTAND"]["UnderstandProjectName"]
und                             : str   = config["UNDERSTAND"]["UnderstandCommand"]
understand_metrics_file_name    : str   = config["UNDERSTAND"]["UnderstandMetricsFileName"]

und_project_path                : str   = os.path.join(hive_git_directory, understand_project_name)
und_metrics_path                : str   = os.path.join(hive_git_directory, understand_project_name[:-4:] + ".csv")
hive_git_repo_dir               : str   = os.path.join(hive_git_directory, hive_repo_name)

repo                            : git.Repo = git.Repo(hive_git_repo_dir)
und_metrics_path

'hive\\hive.csv'


#### 3.1.2 - Understand commands

In [33]:
und_create              : str = f"{und} create -db {und_project_path} -languages Java c++"
und_purge               : str = f"{und} purge -db {und_project_path}"
und_add                 : str = f"{und} add {hive_git_repo_dir} -db {und_project_path}"
und_settings_metrics    : str = f"{und} settings -metrics all -db {und_project_path}"
und_settings_output     : str = f"{und} settings -metricsOutputFile  -db {und_metrics_path} {und_project_path}"
und_analyze             : str = f"{und} analyze -db {und_project_path} -quiet"
und_analyze_changes     : str = f"{und} analyze -db {und_project_path} -quiet -rescan -changed"
und_metrics             : str = f"{und} metrics {und_project_path}"

def run_command(command : str):
    command_args : [str] = command.split(" ")
    print(f"Running command : \n     {command}")
    process = Popen(command_args, stdout=PIPE, stderr=PIPE).communicate()[0]
    print(process.decode("utf-8"))
    

#### 3.1.3 Create the Understand project

In [34]:
# Check if hive directory exists
if not os.path.exists(hive_git_directory):
    raise ValueError(f"The directory {hive_git_directory} does not exist")

# Check if the Understand project exists
if not os.path.exists(und_project_path):
    run_command(und_create)

#### 3.1.4 - Purge the Understand project
**WARNING** : This will delete all the data in the Understand project

In [35]:
run_command(und_purge)

Running command : 
     und purge -db hive\hive.und
Database purged.



In [None]:
for version in sorted_filtered_versions_date :
    t = time()
    commit = sorted_filtered_versions_date[version]
    print("Checking out commit : ", commit)
    repo.git.checkout(commit)
    print(f"Adding commit {commit} to the Understand project")
    run_command(und_add)
    print(f"Analyzing commit {commit}")
    run_command(und_analyze_changes)
    print(f"Exporting metrics for commit {commit}")
    run_command(und_metrics)
    # Copy UnderstandProjectName.csv to the output directory and set file name as UnderstandMetricsFileName
    shutil.copy(und_metrics_path, os.path.join(config["OUTPUT"]["OutputDir"], understand_metrics_file_name + str(version)))
    print(t-time())

Checking out commit :  3af4517eb8cfd9407ad34ed78a0b48b57dfaa264
Adding commit 3af4517eb8cfd9407ad34ed78a0b48b57dfaa264 to the Understand project
Running command : 
     und add hive\hiveRepo -db hive\hive.und
Directory: C:/Users/moshi/Documents/projects/Informatique/ETS/MGL869/MGL869-Lab-Hive/hive/hiveRepo already exists in project. Setting new properties. 
Files added: 8076

Analyzing commit 3af4517eb8cfd9407ad34ed78a0b48b57dfaa264
Running command : 
     und analyze -db hive\hive.und -quiet -rescan -changed

Exporting metrics for commit 3af4517eb8cfd9407ad34ed78a0b48b57dfaa264
Running command : 
     und metrics hive\hive.und
