# Code to filter and extract data from CSV files and then search for HIVE-X keys in the git repository.

---
## Data cleaning

In [1]:
import pandas as pd
import os
import git
import re
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import subprocess

In [2]:
folder_path = 'CSV exported'
# Lire tous les fichiers CSV du dossier
csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

In [3]:
dataframes = [pd.read_csv(file, sep="^") for file in csv_files]
dataframes_filtered = []
for df in dataframes:
    fix_version_columns = [col for col in df.columns if col.startswith('Fix Version/s')]
    affects_version_columns = [col for col in df.columns if col.startswith('Affects Version/s')]

    df['Fix Versions Combined'] = df[fix_version_columns].apply(lambda x: ', '.join(x.dropna().astype(str)), axis=1)
    df['Affects Versions Combined'] = df[affects_version_columns].apply(lambda x: ', '.join(x.dropna().astype(str)),
                                                                        axis=1)

    # Supprimer les colonnes originales
    df = df.drop(fix_version_columns, axis=1)
    df = df.drop(affects_version_columns, axis=1)

    keep: list = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']
    df = df.loc[:, keep]
    print(len(df.columns))

    dataframes_filtered.append(df)

6
6


In [4]:
df_merged = pd.concat(dataframes_filtered, ignore_index=True, sort=False)

In [5]:
df_merged.to_csv('filtered_data.csv')

---

## Extract

In [6]:
# Create a set of all the ids
ids = set(df_merged[df_merged.columns[df_merged.columns.str.contains('Issue key')]].values.flatten())

---
GIT RESEARCH
---



In [7]:

# Current project path
current_project_path = os.getcwd()
# Parent directory
parent_directory = os.path.dirname(current_project_path)
# Path to the repository
repo_path = os.path.join(parent_directory, 'hive')
# Repository URL
repo_url = 'git@github.com:apache/hive.git'

In [8]:
# Check if repo exists
if not os.path.exists(repo_path) or not os.path.exists(os.path.join(repo_path, '.git')):
    repo = git.Repo.clone_from(repo_url, repo_path)
else:
    repo = git.Repo(repo_path)

In [9]:
pattern = re.compile(r'HIVE-(\d{3,5})')

In [10]:
# Function to process a batch of commits
def process_commits(commits):
    print(f'Processing {len(commits)} commits')
    local_repo = git.Repo(repo_path)
    couples = []
    for key in commits:
        commit_id = key
        for match in commits[key]:
            hive_key = f'HIVE-{match}'
            if hive_key in ids:
                for file in local_repo.commit(commit_id).stats.files:
                    couples.append((hive_key, file))
                
    print(f'Finished processing {len(commits)} commits')
    return couples

In [11]:
num_threads = os.cpu_count() # Get the number of threads of the CPU
chunk_size = len(list(repo.iter_commits())) // num_threads

# Get all commits and files
all_commits = [{} for _ in range(num_threads)]

for i, commit in enumerate(repo.iter_commits()):
    matches = pattern.findall(commit.message)
    if matches:
        all_commits[i // chunk_size][commit.hexsha] = matches



In [12]:
# Initialize the list to store all couples
all_couples = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_commits, chunk) for chunk in all_commits]
    for future in as_completed(futures):
        couples = future.result()
        all_couples.extend(couples)

Processing 2134 commits
Processing 2135 commits
Processing 2169 commits
Processing 2163 commits
Processing 2136 commits
Processing 2062 commits
Processing 2063 commits
Processing 2045 commits
Finished processing 2062 commits
Finished processing 2063 commits
Finished processing 2045 commits
Finished processing 2134 commits
Finished processing 2135 commits
Finished processing 2136 commits
Finished processing 2169 commits
Finished processing 2163 commits


In [13]:
# Convert the list of tuples into a DataFrame
df_couples = pd.DataFrame(all_couples, columns=['Issue Key', 'File'])
# Save the DataFrame to a CSV file
df_couples.to_csv('couples.csv', index=False)

In [14]:
# Charger le fichier CSV
df_couples = pd.read_csv("couples.csv")
# Garder seulement les fichiers Java ou C++
df_filtered = df_couples[df_couples['File'].str.endswith(('.java', '.cpp', '.c', '.h'))]
# Sauvegarder le fichier filtré si besoin
df_filtered.to_csv("filtered_couples.csv", index=False)

In [15]:
# Obtenir tous les tags et leurs dates de création
tags = repo.tags
versions = {}

for tag in tags:
    # Récupérer la date de création du commit associé au tag
    commit = tag.commit
    commit_date = commit.committed_datetime.strftime('%Y-%m-%d')
    versions[tag.name] = commit_date

# Afficher les versions et leurs dates
print(versions)

{'branch-3.1.2-rc0': '2020-01-13', 'branch-3.1.3-rc0': '2020-01-13', 'master_2015_11_30': '2015-11-29', 'rel/release-1.2.2': '2017-04-01', 'rel/release-2.1.0': '2016-06-17', 'rel/release-2.1.1': '2016-11-29', 'rel/release-2.2.0': '2017-07-21', 'rel/release-2.3.0': '2017-07-13', 'rel/release-2.3.1': '2017-10-19', 'rel/release-2.3.10': '2024-05-04', 'rel/release-2.3.2': '2017-11-09', 'rel/release-2.3.3': '2018-03-28', 'rel/release-2.3.4': '2018-10-31', 'rel/release-2.3.5': '2019-05-07', 'rel/release-2.3.6': '2019-08-13', 'rel/release-2.3.7': '2020-04-07', 'rel/release-2.3.8': '2021-01-06', 'rel/release-2.3.9': '2021-06-01', 'rel/release-3.0.0': '2018-05-17', 'rel/release-3.1.0': '2018-07-23', 'rel/release-3.1.1': '2018-10-23', 'rel/release-3.1.2': '2019-08-22', 'rel/release-3.1.3': '2022-03-24', 'rel/release-4.0.0': '2024-03-21', 'rel/release-4.0.0-alpha-1': '2022-03-22', 'rel/release-4.0.0-alpha-2': '2022-11-07', 'rel/release-4.0.0-beta-1': '2023-08-07', 'rel/release-4.0.1': '2024-09-26

In [16]:
# Filtrer pour garder uniquement les versions principales
filtered_versions = {tag: date for tag, date in versions.items() if tag.startswith('release-') and '-rc' not in tag}
cleaned_versions = {tag.replace('release-', ''): date for tag, date in filtered_versions.items()}
# Afficher les versions principales et leurs dates de sortie
print(cleaned_versions)

{'0.10.0': '2013-01-10', '0.11.0': '2013-05-16', '0.12.0': '2013-10-14', '0.13.0': '2014-04-19', '0.13.1': '2014-06-05', '0.14.0': '2014-11-12', '0.3.0': '2010-10-26', '0.4.0': '2010-10-26', '0.4.1': '2010-10-26', '0.5.0': '2010-10-26', '0.6.0': '2010-10-26', '0.7.0': '2011-03-25', '0.7.1': '2011-06-17', '0.8.0': '2011-12-16', '0.8.1': '2012-01-30', '0.9.0': '2012-04-27', '1.0.0': '2015-02-03', '1.0.1': '2015-05-14', '1.1.0': '2015-03-09', '1.1.1': '2015-05-14', '1.2.0': '2015-05-14', '1.2.1': '2015-06-19', '2.0.0': '2016-02-09', '2.0.1': '2016-05-03'}


In [17]:
# Convertir le dictionnaire des versions en DataFrame
df_versions = pd.DataFrame(list(cleaned_versions.items()), columns=['Version', 'Release Date'])

# Sauvegarder dans un fichier CSV
df_versions.to_csv("versions.csv", index=False)

In [18]:
commits_for_versions = {}

for index, row in df_versions.iterrows():
    version = row['Version']
    date = row['Release Date']
    
    # Obtenir le dernier commit avant la date de version
    commit = repo.git.log('--before', date, '-n', '1', '--pretty=format:%H')
    commits_for_versions[version] = commit

print("Derniers commits pour chaque version :", commits_for_versions)

Derniers commits pour chaque version : {'0.10.0': 'a786579fa0bb3245adea9c19d0da5fbbe7930f64', '0.11.0': '0d12b180ad5e29f486f89f6e2005d9b0d6e7069e', '0.12.0': '1f0cffbb1dcac66fdb85465fbf043d41ce1ad391', '0.13.0': '12b8bc55c4b2ff8bc9662e31fe88a209655f6f84', '0.13.1': 'e1e559d85feba642e75b70bee5a97a070a291d2a', '0.14.0': '5dc2367785e01dbe703790931a339d88e777c7a4', '0.3.0': 'd8701ea32ab24e4c19d583ba9b40f4789703245b', '0.4.0': 'd8701ea32ab24e4c19d583ba9b40f4789703245b', '0.4.1': 'd8701ea32ab24e4c19d583ba9b40f4789703245b', '0.5.0': 'd8701ea32ab24e4c19d583ba9b40f4789703245b', '0.6.0': 'd8701ea32ab24e4c19d583ba9b40f4789703245b', '0.7.0': 'eb72421c75c56c2fa62527c326f7835e128caca8', '0.7.1': '4f5b4e22cc19e85de8f3bfda29551954bd69a8ee', '0.8.0': '660c5e0ec1fb04ac20639cb492580538274966e4', '0.8.1': '3555cf32326280f2b1c4391ae7fb911763941df5', '0.9.0': '09e9a233748c6b25b5485e6a8b3886bc7bede89d', '1.0.0': 'b94e151602cf89e41ddf19da2466a75aa5799d8b', '1.0.1': 'bc0138c436add2335d2045b6c7bf86bc6a15cc27', 