# MGL 869 - Hive - Groupe 8 - Lab 
---


## Packages

In [1]:
import pandas as pd
import os
import git
import re
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

## Global variables

In [2]:
HIVE_CSV_path = 'ApacheHiveCSV'
HIVE_CSV_separator = '^'
GIT_HIVE_path = 'repositories'
GIT_url = 'git@github.com:apache/hive.git'

## OS environment

In [3]:
os.makedirs('output', exist_ok=True)
current_project_path = os.getcwd()

## Git environment

In [4]:
# Parent directory
parent_directory = os.path.dirname(current_project_path)
# Repository path
repo_path = current_project_path + '/' + GIT_HIVE_path + '/' + 'main'

## Extract data from Hive

### Function to create dataframe from multiple sources

In [5]:
def createHiveDataFrame():
    """
    Create a dataframe from multiple CSV files
    :return: A dataframe that contains all the data from the directory HIVE_CSV_path
    """
    # Get all the CSV files in the directory
    csv_files = [os.path.join(HIVE_CSV_path, file) for file in os.listdir(HIVE_CSV_path) if file.endswith('.csv')]
    
    # Create a dataframe for each CSV file
    dataframes = [pd.read_csv(file, sep="^") for file in csv_files]
    
    dataframes_filtered = []
    for df in dataframes:
        
        # Get the columns that contain the patch versions
        fix_version_columns = [col for col in df.columns if col.startswith('Fix Version/s')]
        
        # Get the columns that contain the affects versions
        affects_version_columns = [col for col in df.columns if col.startswith('Affects Version/s')]
        
        # Combine the versions into a single column
        df['Fix Versions Combined'] = df[fix_version_columns].apply(lambda x: ', '.join(x.dropna().astype(str)), axis=1)
        df['Affects Versions Combined'] = df[affects_version_columns].apply(lambda x: ', '.join(x.dropna().astype(str)),
                                                                            axis=1)
        # Delete the columns that are not needed
        df = df.drop(fix_version_columns, axis=1)
        df = df.drop(affects_version_columns, axis=1)
    
        # Keep only the columns that are needed
        keep: list = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']
        df = df.loc[:, keep]
    
        dataframes_filtered.append(df)
    return dataframes_filtered

### Create dataframe

In [6]:
df_merged = pd.concat(createHiveDataFrame(), ignore_index=True, sort=False)

### Collect bug ids

In [7]:
# Create a set of all the ids
ids = set(df_merged[df_merged.columns[df_merged.columns.str.contains('Issue key')]].values.flatten())

## Git research
### Clone if repo isn't already clone

In [8]:
# Check if repo exists
if not os.path.exists(repo_path) or not os.path.exists(os.path.join(repo_path, '.git')):
    repo = git.Repo.clone_from(GIT_url, repo_path)
else:
    repo = git.Repo(repo_path)

### Regex to find bug names in commits

In [9]:
pattern = re.compile(r'HIVE-(\d{3,5})')

### Function to find files modified for a list of commit

In [10]:
# Function to process a batch of commits
def process_commits(commits):
    local_repo = git.Repo(repo_path) # Load the repository in memory of the current thread
    tuple_key_file_commit = []
    for commit_id in commits:
        for match in commits[commit_id]:
            hive_key = f'HIVE-{match}'
            if hive_key in ids:
                for file in local_repo.commit(commit_id).stats.files:
                    tuple_key_file_commit.append((hive_key, file, commit_id))
    return tuple_key_file_commit

### Prepare research in batch bug research in commits

In [11]:
# Get the number of threads of the CPU
num_threads = os.cpu_count() 
# Size of the chunk
chunk_size = len(list(repo.iter_commits())) // num_threads

# Get all commits and files
all_commits = [{} for _ in range(num_threads)]

for i, commit in enumerate(repo.iter_commits()):
    matches = pattern.findall(commit.message)
    if matches:
        all_commits[i // chunk_size][commit.hexsha] = matches


In [12]:
# Initialize the list to store all couples
all_couples = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_commits, chunk) for chunk in all_commits]
    for future in as_completed(futures):
        couples = future.result()
        all_couples.extend(couples)

print(f"{len(all_couples)} couples found.")

4526 couples found.


### Create the dataframe

In [13]:
# Convert the list of tuples into a DataFrame
df_files = pd.DataFrame(all_couples, columns=['Issue Key', 'File', 'Commit'])

### Keep only Java and C++ files

In [14]:
df_filtered = df_files[df_files['File'].str.endswith(('.java', '.cpp', '.c', '.h'))]

### Create tag dictionary to get only release versions

In [15]:
# Get all tags
tags = repo.tags
versions = {}

for tag in tags:
    # Get the commit of the tag
    commit = tag.commit
    commit_date = commit.committed_datetime.strftime('%Y-%m-%d')
    versions[tag.name] = commit_date

# Afficher les versions et leurs dates
# print(versions)

### Filter tags to keep only release versions

In [16]:
filtered_versions = {tag: date for tag, date in versions.items() if tag.startswith('release-') and '-rc' not in tag}
cleaned_versions = {tag.replace('release-', ''): date for tag, date in filtered_versions.items()}
# Display the cleaned versions
# print(cleaned_versions)

In [19]:
commits_for_versions = {}
i = 0
for version in cleaned_versions:
    date = cleaned_versions[version]
    
    # Obtenir le dernier commit avant la date de version
    commit = repo.git.log('--before', date, '-n', '1', '--pretty=format:%H')
    commits_for_versions[version] = commit

    print("checkout")
    repo.git.checkout(commit)
    print("end")
    if i % 1 == 0:
        print(i)
    i += 1

print("Derniers commits pour chaque version :", commits_for_versions)

checkout


GitCommandError: Cmd('git') failed due to: exit code(1)
  cmdline: git checkout a786579fa0bb3245adea9c19d0da5fbbe7930f64
  stderr: 'error: Your local changes to the following files would be overwritten by checkout:
	.asf.yaml
	.github/workflows/spelling.yml
	bin/replstats.sh
	data/files/datasets/cbo_t1/load.hive.sql
	data/files/datasets/cbo_t2/load.hive.sql
	data/files/datasets/cbo_t3/load.hive.sql
	data/files/datasets/lineitem/load.hive.sql
	data/files/datasets/part/load.hive.sql
	data/files/datasets/tpch_0_001.customer/load.hive.sql
	data/files/datasets/tpch_0_001.lineitem/load.hive.sql
	data/files/datasets/tpch_0_001.nation/load.hive.sql
	data/files/datasets/tpch_0_001.orders/load.hive.sql
	data/files/datasets/tpch_0_001.part/load.hive.sql
	data/files/datasets/tpch_0_001.partsupp/load.hive.sql
	data/files/datasets/tpch_0_001.region/load.hive.sql
	data/files/datasets/tpch_0_001.supplier/load.hive.sql
	hcatalog/src/test/e2e/hcatalog/build.xml
	hcatalog/src/test/e2e/templeton/deployers/config/hive/hive-site.xml
	hcatalog/src/test/e2e/templeton/deployers/deploy_e2e_artifacts.sh
	packaging/src/docker/Dockerfile
	packaging/src/docker/README.md
	packaging/src/docker/docker-compose.yml
	service/src/resources/hive-webapps/static/js/bootstrap.js
	service/src/resources/hive-webapps/static/js/llap.js
	standalone-metastore/metastore-common/src/gen/thrift/gen-py/hive_metastore/ThriftHiveMetastore-remote
	standalone-metastore/metastore-common/src/gen/thrift/gen-py/hive_metastore/ThriftHiveMetastore.py
	standalone-metastore/metastore-common/src/gen/thrift/gen-py/hive_metastore/ttypes.py
	testutils/gen-report.py
	testutils/metastore/metastore-validation-test.sh
Please commit your changes or stash them before you switch branches.
Aborting'

In [None]:
# Define the command
command = "/Hive/understand_app/scitools/bin/linux64/und create /Hive/data/project.und"

# Execute the command using subprocess
try:
    subprocess.run(command, shell=True, check=True)
    print("Command executed successfully.")
except subprocess.CalledProcessError as e:
    print(f"Error occurred while running the command: {e}")
