# Code to filter and extract data from CSV files and then search for HIVE-X keys in the git repository.

---
## Data cleaning

In [17]:
import pandas as pd
import os
import git
import re
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import subprocess
import csv
from datetime import datetime

In [2]:
# Folder that holds the exported CSV files to clean.
folder_path = 'CSV exported'
# Collect every CSV file of the folder. os.listdir() returns entries in
# arbitrary order, so the paths are sorted to keep the row order of the
# merged data reproducible across runs and machines.
csv_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.csv')
)

In [3]:
# Load every export; the files use '^' as the field separator.
dataframes = [pd.read_csv(file, sep="^") for file in csv_files]
dataframes_filtered = []

# Columns retained in the cleaned output, in this order.
keep = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']


def _join_non_null(row):
    """Join the non-null values of a row into a single ', '-separated string."""
    return ', '.join(row.dropna().astype(str))


for df in dataframes:
    # Identify the version columns by prefix before any new column is added.
    original_columns = df.columns
    fix_version_columns = [col for col in original_columns if col.startswith('Fix Version/s')]
    affects_version_columns = [col for col in original_columns if col.startswith('Affects Version/s')]

    # Collapse each group of version columns into one combined text column.
    df['Fix Versions Combined'] = df[fix_version_columns].apply(_join_non_null, axis=1)
    df['Affects Versions Combined'] = df[affects_version_columns].apply(_join_non_null, axis=1)

    # Remove the original version columns, then keep only the columns of interest.
    df = df.drop(columns=fix_version_columns + affects_version_columns)
    df = df.loc[:, keep]
    print(len(df.columns))

    dataframes_filtered.append(df)

6


In [4]:
# Stack the cleaned per-file frames into a single frame with a fresh 0..n-1 index.
df_merged = pd.concat(objs=dataframes_filtered, sort=False, ignore_index=True)

In [5]:
# Persist the merged data. The row index is written too (pandas default),
# so the file starts with an unnamed index column.
df_merged.to_csv(path_or_buf='filtered_data.csv', index=True)

---

## Extract

In [6]:
# Gather every value found in the column(s) whose name contains 'Issue key'
# into a set, for fast membership tests later on.
issue_key_columns = df_merged.columns[df_merged.columns.str.contains('Issue key')]
ids = set(df_merged[issue_key_columns].values.flatten())

---
## Git search
---



In [7]:

# Directory the notebook is executed from.
current_project_path = os.getcwd()
# Its parent directory, next to which the Hive clone lives.
parent_directory = os.path.dirname(current_project_path)
# Local path of the Hive repository clone.
repo_path = os.path.join(parent_directory, 'hive')
# Repository URL. The public HTTPS URL is used instead of the SSH form so
# that cloning works without a GitHub SSH key being configured.
repo_url = 'https://github.com/apache/hive.git'

In [8]:
# Reuse the local clone when repo_path already contains a '.git' entry;
# otherwise clone the repository from repo_url into repo_path.
git_dir = os.path.join(repo_path, '.git')
if os.path.exists(repo_path) and os.path.exists(git_dir):
    repo = git.Repo(repo_path)
else:
    repo = git.Repo.clone_from(repo_url, repo_path)

In [9]:
# Matches issue references such as 'HIVE-1234' and captures the numeric part.
# - `\d+` accepts any number of digits: a fixed 3-5 digit window skips
#   HIVE-1..HIVE-99 and truncates longer numbers to a different (wrong) key.
# - The lookbehind rejects matches embedded in a longer word (e.g. 'ARCHIVE-12').
pattern = re.compile(r'(?<![A-Za-z])HIVE-(\d+)')

In [10]:
# Function to process a batch of commits
def process_commits(commits):
    """Build (issue key, file) couples for a batch of commits.

    Args:
        commits: dict mapping a commit SHA to the list of issue numbers
            (strings, without the 'HIVE-' prefix) found in its message.

    Returns:
        A list of ``(hive_key, file)`` tuples, one per file reported by the
        commit's stats, for every referenced key that belongs to the
        module-level ``ids`` set. A key mentioned several times in the same
        commit message is only reported once for that commit.

    Relies on the module-level names ``repo_path`` and ``ids``.
    """
    print(f'Processing {len(commits)} commits')
    # Each call opens its own Repo object on repo_path.
    local_repo = git.Repo(repo_path)
    couples = []
    for commit_id, matches in commits.items():
        # De-duplicate keys (keeping first-seen order) and keep the known ones.
        candidate_keys = dict.fromkeys(f'HIVE-{match}' for match in matches)
        hive_keys = [hive_key for hive_key in candidate_keys if hive_key in ids]
        if not hive_keys:
            continue
        # The stats lookup is the expensive part: do it once per commit
        # instead of once per matched key.
        files = list(local_repo.commit(commit_id).stats.files)
        for hive_key in hive_keys:
            couples.extend((hive_key, file) for file in files)

    print(f'Finished processing {len(commits)} commits')
    return couples

In [11]:
# Number of worker threads / chunks. os.cpu_count() may return None, in which
# case a single chunk is used.
num_threads = os.cpu_count() or 1

# Walk the history once and reuse the list for both sizing and iteration.
commit_list = list(repo.iter_commits())

# Ceiling division keeps `i // chunk_size` strictly below num_threads even when
# the commit count is not a multiple of num_threads (floor division would send
# the trailing commits to a non-existent chunk and raise IndexError).
# max(1, ...) avoids a zero chunk size when there are no commits.
chunk_size = max(1, -(-len(commit_list) // num_threads))

# One dict per chunk, mapping commit SHA -> issue numbers found in its message.
all_commits = [{} for _ in range(num_threads)]

for i, commit in enumerate(commit_list):
    matches = pattern.findall(commit.message)
    if matches:
        all_commits[i // chunk_size][commit.hexsha] = matches



In [12]:
# Accumulates the (issue key, file) couples produced by every chunk.
all_couples = []
with ThreadPoolExecutor(max_workers=num_threads) as pool:
    # Submit one task per chunk, then collect results as the tasks finish.
    pending = [pool.submit(process_commits, batch) for batch in all_commits]
    for finished in as_completed(pending):
        all_couples += finished.result()

Processing 2134 commitsProcessing 2135 commits
Processing 2169 commits

Processing 2163 commits
Processing 2137 commits
Finished processing 2134 commits
Processing 2061 commits
Processing 2065 commits
Processing 2047 commits
Finished processing 2047 commits
Finished processing 2065 commits
Finished processing 2061 commits
Finished processing 2169 commits
Finished processing 2135 commits
Finished processing 2163 commits
Finished processing 2137 commits


In [13]:
# Turn the list of (issue key, file) tuples into a two-column DataFrame.
couples_columns = ['Issue Key', 'File']
df_couples = pd.DataFrame(data=all_couples, columns=couples_columns)
# Write it to disk without the row index.
df_couples.to_csv(path_or_buf='couples.csv', index=False)

In [14]:
# Load the couples CSV file.
df_couples = pd.read_csv("couples.csv")
# Keep only the rows whose file has one of these source-file extensions.
source_suffixes = ('.java', '.cpp', '.c', '.h')
is_source_file = df_couples['File'].str.endswith(source_suffixes)
df_filtered = df_couples[is_source_file]
# Save the filtered couples.
df_filtered.to_csv("filtered_couples.csv", index=False)

In [15]:
# Map every tag name to the commit date (YYYY-MM-DD) of the commit it points to.
tags = repo.tags
versions = {
    tag.name: tag.commit.committed_datetime.strftime('%Y-%m-%d')
    for tag in tags
}

# Show the versions and their dates.
print(versions)

{'branch-3.1.2-rc0': '2020-01-13', 'branch-3.1.3-rc0': '2020-01-13', 'master_2015_11_30': '2015-11-29', 'rel/release-1.2.2': '2017-04-01', 'rel/release-2.1.0': '2016-06-17', 'rel/release-2.1.1': '2016-11-29', 'rel/release-2.2.0': '2017-07-21', 'rel/release-2.3.0': '2017-07-13', 'rel/release-2.3.1': '2017-10-19', 'rel/release-2.3.10': '2024-05-04', 'rel/release-2.3.2': '2017-11-09', 'rel/release-2.3.3': '2018-03-28', 'rel/release-2.3.4': '2018-10-31', 'rel/release-2.3.5': '2019-05-07', 'rel/release-2.3.6': '2019-08-13', 'rel/release-2.3.7': '2020-04-07', 'rel/release-2.3.8': '2021-01-06', 'rel/release-2.3.9': '2021-06-01', 'rel/release-3.0.0': '2018-05-17', 'rel/release-3.1.0': '2018-07-23', 'rel/release-3.1.1': '2018-10-23', 'rel/release-3.1.2': '2019-08-22', 'rel/release-3.1.3': '2022-03-24', 'rel/release-4.0.0': '2024-03-21', 'rel/release-4.0.0-alpha-1': '2022-03-22', 'rel/release-4.0.0-alpha-2': '2022-11-07', 'rel/release-4.0.0-beta-1': '2023-08-07', 'rel/release-4.0.1': '2024-09-26

In [18]:
# Keep only the release tags whose version is 2.0.0 or later.
# Tag names are parsed into numeric (major, minor, patch) tuples: a plain string
# comparison against 'release-2.0.0' is lexicographic, so it rejects every
# 'rel/release-*' tag and would mis-order multi-digit components.
# Suffixed tags (e.g. '-rc0', '-alpha-1') are kept here as well.
release_tag_pattern = re.compile(r'^(?:rel/)?release-(\d+)\.(\d+)\.(\d+)')


def _is_release_from_2_0_0(tag_name):
    """Return True when tag_name is a release tag with version >= 2.0.0."""
    match = release_tag_pattern.match(tag_name)
    return match is not None and tuple(int(part) for part in match.groups()) >= (2, 0, 0)


filtered_versions = {tag: date for tag, date in versions.items() if _is_release_from_2_0_0(tag)}

# Sort the versions by date.
sorted_versions = sorted(filtered_versions.items(), key=lambda x: datetime.strptime(x[1], '%Y-%m-%d'))

# Commits retained for each version.
commits_for_versions = {}

# Look up the last commit before each version's date.
for version, release_date in sorted_versions:
    # NOTE(review): this walks the history of the currently checked-out HEAD, and
    # git may resolve a date-only '--before' value relative to the current time
    # of day -- confirm that this is the intended cut-off.
    commit = repo.git.log('--before', release_date, '-n', '1', '--pretty=format:%H')
    commits_for_versions[version] = {
        "release_date": release_date,
        "last_commit": commit
    }

# Write the data to a CSV file.
csv_file_path = "commits_for_hive_versions.csv"
with open(csv_file_path, mode='w', newline='') as csv_file:
    fieldnames = ["version", "release_date", "last_commit"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for version, info in commits_for_versions.items():
        writer.writerow({
            "version": version,
            "release_date": info["release_date"],
            "last_commit": info["last_commit"]
        })

print(f"Fichier CSV créé : {csv_file_path}")

Fichier CSV créé : commits_for_hive_versions.csv


In [19]:
# Load the CSV file that lists every retained version.
df = pd.read_csv("commits_for_hive_versions.csv")

# Keep only the main versions, i.e. tags of the exact form 'release-X.Y.Z'
# (no '-rc', '-alpha', ... suffix). The optional 'rel/' prefix is accepted
# because release tags of the repository also exist under that form.
df_main_versions = df[df['version'].str.match(r'^(?:rel/)?release-\d+\.\d+\.\d+$')]

# Save the main versions to their own CSV file.
df_main_versions.to_csv("commits_for_main_hive_versions.csv", index=False)

In [28]:
# Load the metrics of the release and the (issue key, file) couples.
metrics_df = pd.read_csv("full_metrics_release-2.0.0.csv")
couples_df = pd.read_csv("filtered_couples.csv")

# Reduce every path of `filtered_couples.csv` to its base file name.
# The vectorised string accessor leaves missing values as NaN instead of raising.
couples_df['File'] = couples_df['File'].str.rsplit('/', n=1).str[-1]
# Set of base names for fast lookup (missing values excluded).
couples_files = set(couples_df['File'].dropna())

# Add the `BugStatus` column: 1 when the metric's file name (surrounding double
# quotes stripped) appears in `couples_files`, 0 otherwise. Missing or non-string
# names yield 0 instead of raising AttributeError.
# Caveat: only base names are compared, so files sharing a name in different
# directories are not distinguished.
metric_names = metrics_df['Name'].astype('string').str.strip('"')
metrics_df['BugStatus'] = metric_names.isin(couples_files).astype(int)

# Save the labelled metrics.
metrics_df.to_csv("labeled_full_metrics_release-2.0.0.csv", index=False)
print("Labelling completed and saved to labeled_full_metrics_release-2.0.0.csv")

Labelling completed and saved to labeled_full_metrics_release-2.0.0.csv
