# Filter and extract issue data from the exported CSV files, then search the git repository's commit messages for the matching `HIVE-N` issue keys.

---
## Data cleaning

In [1]:
import pandas as pd
import os
import git
import re
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor

In [2]:
folder_path = 'CSV exported'
# Collect every CSV file found in the export folder. os.listdir returns its
# entries in arbitrary order, so the paths are sorted to keep the row order of
# the merged output reproducible across runs and machines.
csv_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith('.csv')
)

In [3]:
# Load each exported CSV; the files use "^" as the field separator.
dataframes = [pd.read_csv(file, sep="^") for file in csv_files]


def _join_non_null(row):
    """Return the non-null values of one row as a single ', '-separated string."""
    return ', '.join(row.dropna().astype(str))


dataframes_filtered = []
for df in dataframes:
    # Version columns are selected by name prefix, so every column whose name
    # starts with the prefix is picked up.
    fix_version_columns = [col for col in df.columns if col.startswith('Fix Version/s')]
    affects_version_columns = [col for col in df.columns if col.startswith('Affects Version/s')]

    # Collapse each group of version columns into one text column per row.
    df['Fix Versions Combined'] = df[fix_version_columns].apply(_join_non_null, axis=1)
    df['Affects Versions Combined'] = df[affects_version_columns].apply(_join_non_null, axis=1)

    # Remove the original version columns now that they have been combined.
    df = df.drop(columns=fix_version_columns + affects_version_columns)

    # Keep only the columns needed downstream, in this order.
    keep: list = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']
    df = df[keep]
    print(len(df.columns))

    dataframes_filtered.append(df)

6
6


In [4]:
# Stack the per-file frames into one frame; ignore_index discards the
# per-file row labels and renumbers the rows.
df_merged = pd.concat(objs=dataframes_filtered, sort=False, ignore_index=True)

In [5]:
# Persist the cleaned data. The row index is written as the first column
# (index=True is the pandas default, spelled out here to make that explicit).
df_merged.to_csv('filtered_data.csv', index=True)

---

## Extract

In [6]:
# Build the set of known issue keys: select every column whose name contains
# 'Issue key' and gather all of their values into a single set.
issue_key_columns = [name for name in df_merged.columns if 'Issue key' in name]
ids = set(df_merged[issue_key_columns].to_numpy().ravel())

---
## Git research



In [7]:

# The repository checkout is looked up in a 'hive' directory located in the
# parent of the current working directory.
current_project_path = os.getcwd()
parent_directory = os.path.split(current_project_path)[0]
repo_path = os.path.join(parent_directory, 'hive')

# Remote used when the checkout has to be cloned.
# NOTE(review): this is an SSH-style URL, so cloning presumably requires SSH
# credentials for GitHub — confirm this is intended.
repo_url = 'git@github.com:apache/hive.git'

In [8]:
# Reuse an existing checkout when both the directory and its '.git' entry
# are present; otherwise clone the repository into repo_path.
git_metadata_path = os.path.join(repo_path, '.git')
if os.path.exists(repo_path) and os.path.exists(git_metadata_path):
    repo = git.Repo(repo_path)
else:
    repo = git.Repo.clone_from(repo_url, repo_path)

In [9]:
# Capture the numeric part of every HIVE issue key mentioned in a text.
# \d+ accepts keys of any length: a fixed 3-5 digit window would skip short
# keys such as HIVE-42 and would truncate a longer number to its first five
# digits, i.e. to a different issue key.
pattern = re.compile(r'HIVE-(\d+)')

In [10]:
# Function to process a batch of commits
def process_commits(commits):
    """Pair the known issue keys referenced by a batch of commits with the files those commits touched.

    Args:
        commits: dict mapping a commit id (hexsha) to the list of issue
            numbers (digit strings) found in that commit's message.

    Returns:
        A list of (issue_key, file) tuples. For each commit, every distinct
        'HIVE-<number>' key that is present in the module-level `ids` set is
        paired with every entry of that commit's `stats.files`.
    """
    print(f'Processing {len(commits)} commits')
    # Each call opens its own Repo handle on the shared checkout path.
    local_repo = git.Repo(repo_path)
    couples = []
    for commit_id, matches in commits.items():
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # a key mentioned several times in one message yields its rows once.
        hive_keys = [
            hive_key
            for hive_key in dict.fromkeys(f'HIVE-{match}' for match in matches)
            if hive_key in ids
        ]
        if not hive_keys:
            # No known issue key: skip the commit without computing its stats.
            continue

        # Compute the commit's stats once and reuse them for every key,
        # instead of recomputing them per matching key.
        files = list(local_repo.commit(commit_id).stats.files)
        couples.extend((hive_key, file) for hive_key in hive_keys for file in files)

    print(f'Finished processing {len(commits)} commits')
    return couples

In [11]:
# Number of workers: one per CPU. os.cpu_count() may return None when the
# count cannot be determined, so fall back to a single worker.
num_threads = os.cpu_count() or 1

# Walk the history once and keep the result, rather than iterating the
# repository a second time just to count the commits.
commit_list = list(repo.iter_commits())

# Ceiling division (kept >= 1): guarantees i // chunk_size stays below
# num_threads for every commit index. Floor division would overflow into a
# non-existent bucket for the trailing commits whenever the total is not an
# exact multiple of num_threads, and would be 0 for very small histories.
chunk_size = max(1, (len(commit_list) + num_threads - 1) // num_threads)

# One {commit hexsha: [issue numbers]} dict per worker.
all_commits = [{} for _ in range(num_threads)]

for i, commit in enumerate(commit_list):
    matches = pattern.findall(commit.message)
    if matches:
        all_commits[i // chunk_size][commit.hexsha] = matches



In [12]:
# Initialize the list to store all couples
all_couples = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Executor.map yields the results in submission order, so the order of
    # the collected couples does not depend on which worker finishes first
    # and the exported CSV is the same from one run to the next.
    for couples in executor.map(process_commits, all_commits):
        all_couples.extend(couples)

Processing 1078 commits
Processing 1054 commits
Processing 1050 commits
Processing 1083 commits
Processing 1084 commits
Processing 1083 commits
Processing 1078 commits
Processing 1083 commits
Processing 1070 commits
Processing 1065 commits
Processing 1005 commits
Processing 1054 commits
Processing 1046 commits
Processing 1017 commits
Processing 1058 commits
Processing 995 commits
Finished processing 1005 commits
Finished processing 1046 commits
Finished processing 1054 commits
Finished processing 1017 commits
Finished processing 1058 commits
Finished processing 995 commits
Finished processing 1050 commits
Finished processing 1054 commits
Finished processing 1078 commits
Finished processing 1065 commits
Finished processing 1083 commits
Finished processing 1078 commits
Finished processing 1083 commits
Finished processing 1084 commits
Finished processing 1083 commits
Finished processing 1070 commits


In [13]:
# Turn the (issue key, file) tuples into a two-column frame and export it
# without the row index.
couple_columns = ['Issue Key', 'File']
df_couples = pd.DataFrame(data=all_couples, columns=couple_columns)
df_couples.to_csv('couples.csv', index=False)