# MGL869 - Lab

*MGL869 ETS Montreal - Production engineering*

## Abstract

## Authors
- **Léo FORNOFF**
- **William PHAN**
- **Yannis OUAKRIM**

---

## Part 1 : Data collection

In [None]:
from numpy.random import logistic

from Jira import jira_download
from pandas import Index
from numpy import ndarray


### 1.1 - Download Jira data
The Jira data is downloaded only if it is not already present in the data folder.

`jira_download()` returns the data as a dataframe.

The query filter can be defined in `config.ini`.

In [None]:
# Load the Jira issues through the project's jira_download helper; the result
# is used as a pandas dataframe by the cleaning cells below.
jira_dataframe = jira_download()

### 1.2 - Clean Jira data using pandas
Previously, we downloaded all the data from Jira. Now, we clean it using pandas.
We keep only a subset of the columns and merge the multi-valued version columns (`Affects Version/s` and `Fix Version/s`) into single combined columns.

In [None]:
# Columns retained once the Jira export has been cleaned. The two
# '... Combined' columns do not exist in the raw export; they are created
# by the following cells before this list is applied.
keep: list[str] = [
    'Issue key',
    'Status',
    'Resolution',
    'Created',
    'Fix Versions Combined',
    'Affects Versions Combined',
]

In [None]:
# Names of every column of the export that starts with 'Affects Version/s'.
affects_version_columns: list[str] = [
    column_name
    for column_name in jira_dataframe.columns
    if column_name.startswith('Affects Version/s')
]

# Row by row: discard the empty cells, cast the remaining versions to str and
# join them with ', ' into one combined column.
jira_dataframe['Affects Versions Combined'] = jira_dataframe[affects_version_columns].apply(
    lambda row: ', '.join(row.dropna().astype(str)),
    axis=1,
)

In [None]:
# Names of every column of the export that starts with 'Fix Version/s'.
fix_version_columns: list[str] = [
    column_name
    for column_name in jira_dataframe.columns
    if column_name.startswith('Fix Version/s')
]

# Merge the fix versions of each issue into a single ', '-separated string,
# ignoring the empty cells.
jira_dataframe['Fix Versions Combined'] = jira_dataframe[fix_version_columns].apply(
    lambda row: ', '.join(row.dropna().astype(str)),
    axis=1,
)

# Both combined columns now exist, so the dataframe can be reduced to the
# columns listed in `keep`.
jira_dataframe = jira_dataframe[keep]

In [None]:
# Boolean mask over the column names: True where the name contains 'Issue key'.
issue_key_mask = jira_dataframe.columns.str.contains('Issue key')
issue_key_columns: Index = jira_dataframe.columns[issue_key_mask]
# Values of the selected columns as a 2-D NumPy array.
issue_key_values: ndarray = jira_dataframe.loc[:, issue_key_columns].to_numpy()
# Collapse the array to one dimension so every key can be iterated directly.
flattened_issue_keys: ndarray = issue_key_values.reshape(-1)
# A set keeps each issue key only once.
ids: set = {*flattened_issue_keys}

In [None]:
# Display the set of unique issue keys built in the previous cell.
ids

---


## Part 2 : Repository analysis


In [None]:
from Hive import git_download, commit_analysis, update_commit_dataframe
from git import Repo, Tag
from pandas import DataFrame
from configparser import ConfigParser
from re import compile
from packaging import version

### 2.1 - Clone repository

In [None]:
# Obtain the GitPython Repo handle through the project's git_download helper.
# NOTE(review): presumably clones the repository when it is not available
# locally - confirm in Hive.git_download.
repo: Repo = git_download()

In [None]:
# Run the project's commit analysis for the Jira issue keys. The result is
# loaded below into a dataframe with the columns "Issue key", "File", "Commit".
all_couples = commit_analysis(ids)

### 2.2 - Filter data

In [None]:
# Tabulate the analysis result: one row per (issue key, file, commit) entry.
commit_dataframe: DataFrame = DataFrame(all_couples, columns=["Issue key", "File", "Commit"])

In [None]:
# Read the comma-separated "Languages" entry of the [GENERAL] section and
# strip the whitespace around each item.
config: ConfigParser = ConfigParser()
config.read("config.ini")
languages: list[str] = [
    language.strip() for language in config["GENERAL"]["Languages"].split(",")
]

# Keep only the rows whose file name ends with one of the configured suffixes
# (str.endswith accepts a tuple of alternatives).
has_known_language = commit_dataframe['File'].str.endswith(tuple(languages))
commit_dataframe: DataFrame = commit_dataframe[has_known_language]

In [None]:
# Combine the filtered commit dataframe with the cleaned Jira dataframe using
# the project's update_commit_dataframe helper. Later cells read the
# "Version Affected" and "File" columns of the result.
couples = update_commit_dataframe(commit_dataframe, jira_dataframe)
# Display the result.
couples


In [None]:
# Rows whose "Version Affected" value mentions release 3.0.0.
# regex=False forces a literal substring match: with the default regex=True
# every "." is a wildcard, so unrelated values such as "3x0y0" would match too.
# na=False treats missing values as "no match".
version_3_files = couples[
    couples["Version Affected"].str.contains("3.0.0", regex=False, na=False)
]

# Number of distinct files associated with that release.
num_files = version_3_files["File"].nunique()
num_files


### 2.3 - Extract and filter versions from git

In [None]:
# Raw comma-separated release patterns from the [GIT] section of config.ini.
raw_release_patterns: str = config["GIT"]["ReleasesRegex"]

# Map every tag name of the repository to the commit it points to.
tags = repo.tags
versions: dict = {tag.name: tag.commit for tag in tags}

# Strip the whitespace around each pattern and compile it once, so the
# patterns can be reused for every tag.
releases_regex = [compile(pattern.strip()) for pattern in raw_release_patterns.split(",")]

In [None]:
# Oldest release that is kept.
minimum_version = version.parse("2.0")

# Version number -> tagged commit, for the release tags at or above the minimum.
filtered_versions: dict = {}
for version_str, tagged_commit in versions.items():
    # Skip the tags that match none of the configured release patterns.
    if not any(regex.match(version_str) for regex in releases_regex):
        continue
    # The version number is the second "-"-separated field of the tag name.
    version_numbers = version_str.split("-")[1]
    if version.parse(version_numbers) < minimum_version:
        continue
    filtered_versions[version_numbers] = tagged_commit

# Order the releases from the most recent tagged commit to the oldest.
newest_first = sorted(
    filtered_versions.items(),
    key=lambda entry: entry[1].committed_datetime,
    reverse=True,
)
filtered_versions = dict(newest_first)

# Display the mapping together with the number of releases kept.
filtered_versions, len(filtered_versions)

## Part 3 : Understand analysis

In [None]:
from Understand.commands import und_create_command, und_purge_command
from Understand.metrics import metrics
from Understand.label import label_all_metrics
from os import path
from Understand.enrich import enrich_metrics
from Understand.update import merge_all_metrics

### 3.1 - Create the Understand project


In [None]:
# Locations read from config.ini.
hive_git_directory: str = config["GIT"]["HiveGitDirectory"]
data_directory: str = config["GENERAL"]["DataDirectory"]
understand_project_name: str = config["UNDERSTAND"]["UnderstandProjectName"]

# The Understand project lives under <data dir>/<git dir>/<project name>.
project_path_parts = (data_directory, hive_git_directory, understand_project_name)
understand_project_path: str = path.join(*project_path_parts)

# Create the Understand project only when that path does not exist yet.
project_already_created = path.exists(understand_project_path)
if not project_already_created:
    und_create_command()

In [None]:
# Run the project's Understand "purge" command.
# NOTE(review): presumably resets the Understand project before metrics are
# extracted - confirm in Understand.commands.
und_purge_command()

### 3.2 - Metrics extraction


In [None]:
# Extract the Understand metrics for the filtered releases
# (version number -> tagged commit mapping built in Part 2.3).
metrics(filtered_versions)

### 3.3 - Labeling


In [None]:
# Label the extracted metrics using the `couples` dataframe.
# NOTE(review): presumably this is where the 'BugStatus' column used in Part 4
# is produced - confirm in Understand.label.
label_all_metrics(couples)

In [None]:
# Enrich the metrics using the `couples` dataframe (see Understand.enrich for
# the details of what is added).
enrich_metrics(couples)

In [None]:
# Hard-coded list of the release numbers whose metrics are merged.
# NOTE(review): this duplicates information that `filtered_versions` already
# holds; confirm the two stay in sync when the release regex or the minimum
# version changes.
v = [
    "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.2.0", "2.3.0", "2.3.1", "2.3.2",
    "2.3.3", "2.3.4", "2.3.5", "2.3.6", "2.3.7", "2.3.8", "2.3.9", "2.3.10",
    "3.0.0", "3.1.0", "3.1.1", "3.1.2", "3.1.3", "4.0.0", "4.0.1"
]
# Merge the metrics of all listed releases.
merge_all_metrics(v)

## Part 4 : Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from configparser import ConfigParser
from IA_models import load_data, KFold_XY, plot_SHAP, logistic_regression_treatment, random_forest_treatment

### 4.1 - Load data

In [None]:
# Re-read the configuration so this part of the notebook can run on its own.
config: ConfigParser = ConfigParser()
config.read("config.ini")

# Directory that holds the labeled metrics files.
LABELED_METRICS_OUTPUT_DIRECTORY: str = config["OUTPUT"]["LabeledMetricsOutputDirectory"]

# Model settings, all taken from the [IA] section.
ia_settings = config["IA"]
N_SPLITS: int = int(ia_settings["NSplits"])
SHUFFLE: bool = ia_settings.getboolean("Shuffle")
RANDOM_STATE: int = int(ia_settings["RandomState"])
N_ESTIMATORS: int = int(ia_settings["nEstimators"])

# Load the labeled metrics; the result is a dict whose values are used as
# dataframes in the next cell.
data_dict: dict = load_data(LABELED_METRICS_OUTPUT_DIRECTORY)

### 4.2 - Prepare data
The bug label (`BugStatus`) is the prediction target, and the identifying columns (`Name` and `Kind`) are not considered in the model training. All columns containing a NaN are unusable and are removed.

In [None]:
# Columns that must not be used as model features: the label itself and the
# two descriptive columns.
non_feature_columns = ['BugStatus', 'Name', 'Kind']

# For every loaded dataset, store the pair (features, label).
XY_dict: dict = {}
for key, data in data_dict.items():
    # Independent variables: drop the non-feature columns, then every column
    # that still contains a NaN.
    X = data.drop(columns=non_feature_columns).dropna(axis=1)
    # Dependent variable: presence of a bug.
    y = data['BugStatus']
    XY_dict[key] = (X, y)

### 4.3 Training and test data
The entire dataset is divided into 10 equal parts on which the model is trained. Validation is performed [using cross-validation](https://medium.com/@tubelwj/five-methods-for-data-splitting-in-machine-learning-27baa50908ed) to more accurately determine the effectiveness of our model.

In [None]:
# Per dataset: (X_train, y_train) and (X_test, y_test), as returned by the
# project's KFold_XY helper.
XY_training_dict: dict = {}
XY_testing_dict: dict = {}
for key, (X, y) in XY_dict.items():
    X_train, X_test, y_train, y_test = KFold_XY(N_SPLITS, SHUFFLE, RANDOM_STATE, X, y)
    XY_training_dict[key], XY_testing_dict[key] = (X_train, y_train), (X_test, y_test)

### 4.4 Model Training

Comparison between logistic regression and random forest.

In [None]:
# Display the dataset keys available for training.
XY_training_dict.keys()

In [None]:
# Display the (X_train, y_train) pair of one dataset as a sanity check.
XY_training_dict['2.0.0_labeled_metrics.csv']

In [None]:
# One fitted model of each kind per dataset key.
logistic_regression_models: dict = {}
random_forest_models: dict = {}

for key, (X_train, y_train) in XY_training_dict.items():
    # fit() returns the estimator itself, so creation and training can be chained.
    log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    random_model = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
    ).fit(X_train, y_train)

    logistic_regression_models[key] = log_model
    random_forest_models[key] = random_model

### 4.5 Prediction

In [None]:
# Predictions of each fitted model on the held-out data, per dataset key.
logistic_regression_predictions: dict = {}
random_forest_prediction: dict = {}

for key, (X_test, y_test) in XY_testing_dict.items():
    forest_model = random_forest_models[key]
    logistic_model = logistic_regression_models[key]
    random_forest_prediction[key] = forest_model.predict(X_test)
    logistic_regression_predictions[key] = logistic_model.predict(X_test)

### 4.6 Evaluate the model performance

The 2 models are compared by their AUC, precision, and recall. The **random forest** is a better model for determining the presence of bugs in a commit.

#### 4.6.1 Random Forest

In [None]:
# SHAP values per dataset key, computed (and plotted) by the project's
# plot_SHAP helper from the model, its training features and its test features.
# NOTE(review): the surrounding heading says "Random Forest", but the models
# explained here are the logistic-regression ones - confirm which is intended.
shap_values_versions = {}
for key, model in logistic_regression_models.items():
    training_features = XY_training_dict[key][0]
    testing_features = XY_testing_dict[key][0]
    shap_values = plot_SHAP(model, training_features, testing_features, key)
    shap_values_versions[key] = shap_values

In [None]:
# Number of top-ranked features kept for each version.
TOP_FEATURES_PER_VERSION = 5

# Union of the top features over all versions, and the ranked
# (feature name, mean |SHAP|) list of each version.
best_metrics = set()
metrics_values = {}
for key, shap_values in shap_values_versions.items():
    # Keys look like e.g. '2.0.0_labeled_metrics.csv'. Keep the whole version
    # prefix: truncating to key[:3] would map '2.0.0' and '2.0.1' (or '2.3.1'
    # and '2.3.10') to the same label and silently overwrite their entries.
    version_title = key.split('_', 1)[0]
    X_test = XY_testing_dict[key][0]
    # Feature names come from the test frame; fall back to generic names when
    # the test data has no column labels.
    feature_names = X_test.columns if hasattr(X_test, 'columns') else [f'Feature {i}' for i in range(X_test.shape[1])]
    # Mean absolute SHAP value of each feature over all samples.
    shap_mean_importance = np.abs(shap_values.values).mean(axis=0)
    # Indices of the most important features, most important first.
    top_indices = np.argsort(shap_mean_importance)[-TOP_FEATURES_PER_VERSION:][::-1]
    top_features = [(feature_names[i], shap_mean_importance[i]) for i in top_indices]
    best_metrics.update(feature_name for feature_name, _ in top_features)
    metrics_values[version_title] = top_features
print(best_metrics)

#### 4.6.2 Evolution of the most important metrics per version

In [None]:
# Organise the data in a dataframe: one row per version, one column per metric.
# The metric columns are sorted so that the column and legend order is
# reproducible (iterating a set directly gives an arbitrary order).
metric_columns = sorted(best_metrics)
# The loop variable is deliberately not called `version`: that name is bound
# to the `packaging.version` module used in Part 2 and must not be overwritten.
metric_rows = [
    {'version': version_label, **dict(features)}
    for version_label, features in metrics_values.items()
]
# Build the frame in one go instead of concatenating row by row; a metric that
# is not among the top features of a version gets an importance of 0.
df_metrics = pd.DataFrame(metric_rows, columns=['version'] + metric_columns).fillna(0)

# Draw one line per metric.
plt.figure(figsize=(10, 6))
for metric in metric_columns:
    plt.plot(
        df_metrics['version'],
        df_metrics[metric],
        label=metric,
        marker='o'
    )

# Add the details of the chart (the legend is placed outside the axes).
plt.xlabel('Version')
plt.ylabel('Importance moyenne (SHAP)')
plt.title('Évolution des métriques les plus importantes par version')
plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()

# Save, then display the chart.
plt.savefig('metrics_evolution_plot.png')
plt.show()