# MGL869 - Lab

*MGL869 ETS Montreal - Production engineering*

## Abstract

## Authors
- **Léo FORNOFF**
- **William PHAN**
- **Yannis OUAKRIM**

## Configuration

In [1]:
import pandas as pd
import os
import git
import re
import shutil
from time import time
import configparser
import requests
import subprocess

from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from numpy.ma.testutils import assert_equal

from hiveDL import hiveDL

In [2]:
# Load the notebook configuration. ConfigParser.read() returns the list of
# files it managed to parse and silently skips unreadable ones, so an empty
# result means config.ini is missing and must be reported as such.
config = configparser.ConfigParser()
loaded_files = config.read("config.ini")
if not loaded_files:
    raise FileNotFoundError("Configuration file config.ini was not found or could not be read")

# Every section listed here must be present before going further
required_sections = ["GENERAL", "GIT", "JIRA", "UNDERSTAND", "OUTPUT", "JUPYTER"]
for section in required_sections:
    assert section in config, f"Section {section} is missing in the configuration file"

In [3]:
def testConfig(section, keys):
    """Assert that a configuration section exists and holds non-empty keys.

    Args:
        section: Name of the section to look up in the global ``config``.
        keys: Iterable of key names that must exist in that section with a
            non-empty value.

    Raises:
        AssertionError: If the section is missing, or if one of the keys is
            missing or empty.
    """
    assert section in config, f"Section {section} is missing in the configuration file"
    # Validate the keys received as argument (not a notebook-level global),
    # so the function checks exactly what the caller asked for.
    for key in keys:
        assert key in config[section], f"Key {key} is missing in the configuration file"
        assert config[section][key], f"Key {key} is empty in the configuration file"

## Part 1 : Data collection

### 1.1 - Download Jira data

#### 1.1.1 - Check configuration to run the section

In [4]:
# Validate the JIRA section before running the download step.
# `section` stays set to "JIRA" and is read again by the download cell.
section = "JIRA"
# Keys of the JIRA section that must exist and be non-empty
requiered_keys = ["BaseUrl", "SearchComplement", "Query", "JiraCSVDirectory", "QueryEachRun", "JiraCombinedCSV"]

testConfig(section, requiered_keys)

#### 1.1.2 - Download Jira data if needed

In [5]:
# Reuse previously downloaded data only when the configuration allows it
data_exists = config[section]["QueryEachRun"] == "No"

# The download directory must exist; a freshly created one cannot hold data
jira_csv_dir = config[section]["JiraCSVDirectory"]
if not os.path.exists(jira_csv_dir):
    data_exists = False
    os.makedirs(jira_csv_dir)

# command.txt is compared with the configured query: a missing file or a
# different query invalidates the cached data
# NOTE(review): presumably hiveDL writes this file after a download - confirm
command_file = os.path.join(jira_csv_dir, "command.txt")
if not os.path.exists(command_file):
    data_exists = False
else:
    with open(command_file, "r") as f:
        query = f.read()

    if query != config[section]["Query"]:
        data_exists = False

# At least one .csv file must be present in the directory
csv_files = [f for f in os.listdir(jira_csv_dir) if f.endswith(".csv")]
if not csv_files:
    data_exists = False

# The combined CSV is the file read by the cleaning step, so the cache is
# only usable when that exact file is present
combined_csv_path = os.path.join(jira_csv_dir, config[section]["JiraCombinedCSV"])
if not os.path.exists(combined_csv_path):
    data_exists = False

if not data_exists:
    print("Downloading Jira data with pagination")
    base_url = config[section]["BaseUrl"]
    search_complement = config[section]["SearchComplement"]
    query = config[section]["Query"]
    temp_max = 1000
    start = 0

    try:
        hiveDL(
            command_file,
            jira_csv_dir,
            combined_csv_path,
            base_url,
            search_complement,
            query,
            temp_max=temp_max,
            start=start,
        )
    except requests.exceptions.RequestException as err:
        print(f"Error during data fetching: {err}")
        # Stop the run, keeping the original network error as the cause
        raise SystemExit(err) from err
else:
    print("Data already exists")

Data already exists


### 1.2 - Clean Jira data using pandas
#### 1.2.1 - Load the data

In [6]:
# Load the combined Jira export into a DataFrame.
# low_memory=False turns off pandas' chunked parsing, which avoids
# mixed-type inference within a column.
jira_dataframe = pd.read_csv(combined_csv_path, low_memory=False)

#### 1.2.2 - Keep only the relevant columns

In [7]:
# Columns retained after cleaning. The two "... Combined" columns are not in
# the raw export: they are built from the per-version columns before the
# DataFrame is reduced to this list.
keep: list = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']

In [8]:
# Gather the per-version columns of the export by their name prefix
fix_version_columns = [name for name in jira_dataframe.columns if name.startswith('Fix Version/s')]
affects_version_columns = [name for name in jira_dataframe.columns if name.startswith('Affects Version/s')]


def _join_non_null(row):
    """Return the non-null values of a row as one comma-separated string."""
    return ', '.join(row.dropna().astype(str))


# Collapse each family of version columns into a single text column
jira_dataframe['Fix Versions Combined'] = jira_dataframe[fix_version_columns].apply(_join_non_null, axis=1)
jira_dataframe['Affects Versions Combined'] = jira_dataframe[affects_version_columns].apply(_join_non_null, axis=1)

# Reduce the DataFrame to the columns listed in `keep`
jira_dataframe = jira_dataframe[keep]

#### 1.2.3 - Extract ids

In [9]:
# Select every column whose name contains 'Issue key'
column_names = jira_dataframe.columns
issue_key_columns = column_names[column_names.str.contains('Issue key')]
# Pull the matching cells out as a NumPy array and view them as one dimension
issue_key_values = jira_dataframe[issue_key_columns].to_numpy()
flattened_issue_keys = issue_key_values.ravel()
# A set removes duplicates; `ids` is the name read by the commit extraction
unique_issue_keys = set(flattened_issue_keys)
ids = unique_issue_keys

## Part 2 : Repository analysis
### 2.1 - Clone repository
#### 2.1.1 - Check configuration to run the section

In [10]:
# Validate the GIT section before cloning.
# `section` stays set to "GIT" and is read again by the clone cell.
section = "GIT"
# Keys of the GIT section that must exist and be non-empty
requiered_keys = ["HiveGitDirectory", "HiveGitRepoName", "HiveGitUrl", "HiveGitAlwaysClone", "HiveGitAlwaysPull"]

testConfig(section, requiered_keys)

#### 2.1.2 - Clone repository if needed

In [11]:
# Clone when the configuration forces it, or when nothing is on disk yet
b_clone: bool = config[section]["HiveGitAlwaysClone"] == "Yes"

# Create the parent directory if needed; a new directory implies a clone
hive_git_dir = config[section]["HiveGitDirectory"]
if not os.path.exists(hive_git_dir):
    os.makedirs(hive_git_dir)
    b_clone = True

# A missing repository directory also implies a clone
hive_git_url = config[section]["HiveGitUrl"]
hive_git_repo_name = config[section]["HiveGitRepoName"]
hive_git_repo_dir = os.path.join(hive_git_dir, hive_git_repo_name)
if not os.path.exists(hive_git_repo_dir):
    b_clone = True

if b_clone:
    # git refuses to clone into an existing non-empty directory, so a forced
    # re-clone must remove the previous checkout first.
    # NOTE(review): on Windows, read-only files under .git can make rmtree
    # fail; an error handler may be required - confirm on the target platform
    if os.path.exists(hive_git_repo_dir):
        print("Removing the existing repository")
        shutil.rmtree(hive_git_repo_dir)
    print("Cloning the repository")
    git.Repo.clone_from(hive_git_url, hive_git_repo_dir)
    print("Repository cloned")
else:
    print("Repository already cloned")
    if config[section]["HiveGitAlwaysPull"] == "Yes":
        print("Checking for updates : Pulling the repository")
        repo = git.Repo(hive_git_repo_dir)
        repo.remotes.origin.pull()
        print("Repository up to date")
        

Repository already cloned
Checking for updates : Pulling the repository
Repository up to date


### 2.2 - Extract commits
#### 2.2.1 - Check configuration to run the section

In [12]:
section = "GENERAL"
requiered_keys = ["MaxThreads"]

testConfig(section, requiered_keys)

section = "GIT"
requiered_keys = ["HiveGitDirectory", "HiveGitRepoName", "HiveGitUrl", "HiveGitAlwaysClone", "HiveGitAlwaysPull", "CommitPattern"]

testConfig(section, requiered_keys)

# Regular expression applied to commit messages to find issue references
pattern = re.compile(config["GIT"]["CommitPattern"])
# Number of worker threads: capped by the CPU count and never below one.
# os.cpu_count() may return None when the count cannot be determined.
num_threads = max(1, min(int(config["GENERAL"]["MaxThreads"]), os.cpu_count() or 1))
# Get the repository directory
hive_git_repo_dir = os.path.join(hive_git_dir, hive_git_repo_name)
# Load the repository in memory
repo = git.Repo(hive_git_repo_dir)
# List to store the couples (issue, file, commit)
all_couples = []
# Size of each chunk of commits. Ceiling division keeps `index // chunk_size`
# strictly below num_threads for every commit index, and max(1, ...) avoids a
# zero chunk size when the repository has no commits.
total_commits = sum(1 for _ in repo.iter_commits())
chunk_size = max(1, -(-total_commits // num_threads))
# One dictionary {commit sha: matches} per worker thread
all_commits = [{} for _ in range(num_threads)]

#### 2.2.2 - Extract commits

##### Function to extract commits

In [13]:
def process_commits(commits):
    """Build (issue key, file, commit) tuples for a batch of commits.

    Args:
        commits: Mapping of commit sha to the list of matches found in that
            commit's message. Each match is turned into a ``HIVE-<match>``
            key and kept only when that key is in the global ``ids``.

    Returns:
        A list of ``(hive_key, file, commit_id)`` tuples, one per kept key
        and per file reported by the commit's stats.
    """
    # Each call opens its own Repo object instead of sharing the global one
    local_repo = git.Repo(hive_git_repo_dir)
    tuple_key_file_commit = []
    for commit_id, matches in commits.items():
        hive_keys = [key for key in (f'HIVE-{match}' for match in matches) if key in ids]
        if not hive_keys:
            # No known issue referenced: skip the stats computation entirely
            continue
        # Compute the changed files once per commit rather than once per key
        changed_files = list(local_repo.commit(commit_id).stats.files)
        for hive_key in hive_keys:
            for file in changed_files:
                tuple_key_file_commit.append((hive_key, file, commit_id))
    return tuple_key_file_commit

##### Prepare multithreading to extract commits

In [14]:
# Distribute the commits that reference an issue over the per-thread chunks.
# The chunk index is clamped to the last chunk, and the divisor is kept at
# one or more, so that a commit count that is not a multiple of the number
# of chunks (or smaller than it) cannot raise IndexError / ZeroDivisionError.
last_chunk_index = len(all_commits) - 1
for i, commit in enumerate(repo.iter_commits()):
    matches = pattern.findall(commit.message)
    if matches:
        chunk_index = min(i // max(chunk_size, 1), last_chunk_index)
        all_commits[chunk_index][commit.hexsha] = matches

##### Extract commits using multithreading

In [15]:
# Submit one task per chunk and collect the tuples as tasks finish
with ThreadPoolExecutor(max_workers=num_threads) as pool:
    pending = [pool.submit(process_commits, batch) for batch in all_commits]
    for finished in as_completed(pending):
        all_couples += finished.result()

print(f"{len(all_couples)} couples found.")

20524 couples found.


### 2.3 - Filter data
#### 2.3.1 - Create a DataFrame

In [16]:
# One row per (issue key, file, commit) tuple collected from the repository
commit_dataframe = pd.DataFrame(all_couples, columns=["Issue key", "File", "Commit"])

#### 2.3.2 - Keep specific languages only

In [17]:
section = "GENERAL"
requiered_keys = ["Languages"]

testConfig(section, requiered_keys)

# Comma-separated list of file suffixes to keep. Surrounding whitespace is
# removed and empty entries (for example after a trailing comma) are dropped:
# an empty suffix would make str.endswith match every file.
languages = [lang.strip() for lang in config[section]["Languages"].split(",")]
languages = [lang for lang in languages if lang]

# Keep only the rows whose file path ends with one of the configured suffixes
commit_dataframe_filtered = commit_dataframe[commit_dataframe['File'].str.endswith(tuple(languages))]

### 2.4 - Extract and filter versions from git
#### 2.4.1 - Extract versions 

In [18]:
section = "GIT"
requiered_keys = ["HiveGitDirectory", "HiveGitRepoName", "HiveGitUrl",  "ReleasesRegex"]

testConfig(section, requiered_keys)

# Map every tag name of the repository to the commit it points to
repo = git.Repo(hive_git_repo_dir)
tags = repo.tags
versions = {tag.name: tag.commit for tag in tags}

#### 2.4.2 - Filter versions

In [19]:
# Compile the comma-separated release patterns from the configuration
filtered_versions = {}
releases_regex = [regex.strip() for regex in config["GIT"]["ReleasesRegex"].split(",")]
release_regex = [re.compile(regex) for regex in releases_regex]

# Keep the tags matched by at least one pattern, keyed by the text between
# the first and second "-" of the tag name
for version, tagged_commit in versions.items():
    if any(regex.match(version) for regex in release_regex):
        version_numbers = version.split("-")[1]
        filtered_versions[version_numbers] = tagged_commit

#### 2.4.3 - Sort the versions dict by date in descending order

In [20]:
# Order the (version, commit) pairs from newest to oldest commit date, then
# rebuild a dict, which keeps that insertion order
newest_first = sorted(
    filtered_versions.items(),
    key=lambda entry: entry[1].committed_datetime,
    reverse=True,
)
sorted_filtered_versions_date = dict(newest_first)

## Part 3 : Understand analysis

### 3.1 - Set up the configuration and understand project

#### 3.1.1 - Check configuration

In [21]:
# Validate the UNDERSTAND section
section = "UNDERSTAND"
requiered_keys = ["UnderstandCommand", "UnderstandProjectName", "UnderstandMetricsFileName"]

testConfig(section, requiered_keys)

# Validate the GIT keys read by the Understand cells. The key is
# "HiveGitRepoName", and the list is stored in `requiered_keys` like in the
# other checks, so the list of required sections is left untouched.
section = "GIT"
requiered_keys = ["HiveGitDirectory", "HiveGitRepoName"]

testConfig(section, requiered_keys)

AssertionError: Key UnderstandCommand is missing in the configuration file


#### 3.1.2 - Understand commands

In [None]:
# Settings read from the configuration
hive_git_directory = config['GIT']["HiveGitDirectory"]
hive_repo_name = config['GIT']["HiveGitRepoName"]
understand_project_name = config["UNDERSTAND"]["UnderstandProjectName"]
und = config["UNDERSTAND"]["UnderstandCommand"]
understand_metrics_file_name = config["UNDERSTAND"]["UnderstandMetricsFileName"]

# The Understand project and the metrics file both live next to the
# repository, inside the git directory
und_project_path = os.path.join(hive_git_directory, understand_project_name)
und_metrics_path = os.path.join(hive_git_directory, understand_metrics_file_name)
hive_git_repo_dir = os.path.join(hive_git_directory, hive_repo_name)

# Command lines run by the following cells
und_create = f"{und} create -db {und_project_path} -languages Java c++"
und_purge = f"{und} purge -db {und_project_path}"
und_add = f"{und} add {hive_git_repo_dir} -db {und_project_path}"
und_settings_metrics = f"{und} settings -metrics all -db {und_project_path}"
# -metricsOutputFile takes the metrics file and -db takes the project, in the
# same "-option value -db project" layout as the other settings command
und_settings_output = f"{und} settings -metricsOutputFile {und_metrics_path} -db {und_project_path}"
und_analyze = f"{und} analyze -db {und_project_path} -quiet"

#### 3.1.3 - Create the Understand project

In [None]:
# The git directory hosts the Understand project: it must already exist
hive_directory = config["GIT"]["HiveGitDirectory"]
hive_directory_found = os.path.exists(hive_directory)
if not hive_directory_found:
    raise ValueError(f"The directory {hive_directory} does not exist")

# Create the Understand project only when it is not on disk yet
understand_project_name = config["UNDERSTAND"]["UnderstandProjectName"]
understand_project_path = os.path.join(hive_directory, understand_project_name)
project_missing = not os.path.exists(understand_project_path)
if project_missing:
    print("Creating the Understand project")
    subprocess.run(und_create, check=True)
    print("Understand project created")

#### 3.1.4 - Purge the Understand project
**WARNING** : This will delete all the data in the Understand project

In [None]:
# Run the purge command; check=True raises CalledProcessError when the
# command exits with a non-zero status
print("Purging the Understand project")
subprocess.run(und_purge, check=True)
print("Understand project purged")

In [None]:
# Apply the metrics selection first, then the metrics output file setting;
# check=True stops at the first command that fails
print("Adding settings to the Understand project")
for settings_command in (und_settings_metrics, und_settings_output):
    subprocess.run(settings_command, check=True)
print("Settings added")

In [None]:
# Register the repository directory in the Understand project; check=True
# raises CalledProcessError when the command exits with a non-zero status
print("Adding the repository to the Understand project")
subprocess.run(und_add, check=True)
print("Repository added")

In [None]:
# Run the analysis command (built with -quiet); check=True raises
# CalledProcessError when the command exits with a non-zero status
print("Analyzing the repository")
subprocess.run(und_analyze, check=True)
print("Repository analyzed")