# Filter and merge issue data exported as CSV files, then search the Apache Hive git history for commits that mention the HIVE-<number> keys of those issues.

---
## Data cleaning

In [12]:
import pandas as pd
import os
import git
import re
from multiprocessing import Pool, cpu_count

In [2]:
# Folder that holds the exported CSV files (relative to the working directory).
folder_path = 'CSV exported'
# Collect the path of every CSV file found directly inside that folder.
csv_files = [
    os.path.join(folder_path, entry)
    for entry in os.listdir(folder_path)
    if entry.endswith('.csv')
]

In [3]:
def _join_versions(row):
    """Return the non-null values of ``row`` as one ', '-separated string."""
    return ', '.join(row.dropna().astype(str))


# Columns retained in each cleaned frame, in output order.
keep: list = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']

# Load every export; the files use '^' as the field separator.
dataframes = [pd.read_csv(file, sep="^") for file in csv_files]
dataframes_filtered = []
for df in dataframes:
    # Gather every column whose name starts with the version prefixes.
    fix_version_columns = [col for col in df.columns if col.startswith('Fix Version/s')]
    affects_version_columns = [col for col in df.columns if col.startswith('Affects Version/s')]

    # Collapse each group of columns into a single comma-separated column.
    df['Fix Versions Combined'] = df[fix_version_columns].apply(_join_versions, axis=1)
    df['Affects Versions Combined'] = df[affects_version_columns].apply(_join_versions, axis=1)

    # Drop the original columns
    df = df.drop(columns=fix_version_columns + affects_version_columns)

    # Keep only the columns of interest and report how many remain.
    df = df[keep]
    print(len(df.columns))

    dataframes_filtered.append(df)

6
6


In [4]:
# Stack all cleaned frames into one table; the original row indexes are
# discarded and replaced by a fresh consecutive index.
df_merged = pd.concat(
    objs=dataframes_filtered,
    sort=False,
    ignore_index=True,
)

In [5]:
# Persist the merged table. The row index is written as the first column
# (pandas default, spelled out here so the file layout is explicit).
df_merged.to_csv('filtered_data.csv', index=True)

---

## Extract

In [6]:
# Select every column whose name contains 'Issue key' ...
issue_key_columns = df_merged.columns[df_merged.columns.str.contains('Issue key')]
# ... and gather all of their values into one set of issue ids.
ids = set(df_merged[issue_key_columns].to_numpy().ravel())

---
GIT SEARCH
---



In [7]:
# Location of the local working copy (machine-specific absolute path).
repo_path = r"C:\Users\moshi\Documents\projects\Informatique\ETS\MGL869\hive"
# Remote the working copy is cloned from when it is not present locally.
repo_url = "git@github.com:apache/hive.git"

In [8]:
# Matches issue keys such as "HIVE-12345"; group 1 captures the number.
# The digit count is left open so that short keys (e.g. HIVE-42) are found and
# a longer number is never cut down to its first five digits, which would
# yield a different, wrong key.
pattern = re.compile(r'HIVE-(\d+)')

In [9]:
# Reuse the local clone when both the folder and its .git directory exist;
# otherwise clone the repository into repo_path first.
git_dir = os.path.join(repo_path, '.git')
if os.path.exists(repo_path) and os.path.exists(git_dir):
    repo = git.Repo(repo_path)
else:
    repo = git.Repo.clone_from(repo_url, repo_path)

In [10]:
# Build the list of (issue key, file) pairs: for every commit whose message
# mentions a known issue key, pair that key with each file the commit touched.
couples = []
for commit in repo.iter_commits():
    # dict.fromkeys drops repeated mentions of the same key within one message
    # (keeping first-seen order) so a commit never contributes duplicate rows.
    mentioned_keys = dict.fromkeys(
        f'HIVE-{number}' for number in pattern.findall(commit.message)
    )
    hive_keys = [key for key in mentioned_keys if key in ids]
    if not hive_keys:
        continue

    # commit.stats runs a diff against the parent commit, so compute it once
    # per commit and only for commits that reference a known issue.
    changed_files = list(commit.stats.files)
    for hive_key in hive_keys:
        couples.extend((hive_key, file) for file in changed_files)

In [11]:
# One row per (issue key, file) pair.
df_couples = pd.DataFrame(data=couples, columns=['Issue Key', 'File'])
# Write the pairs to disk without the index column.
df_couples.to_csv(path_or_buf='couples.csv', index=False)