# File paths and custom names

- PATH_TO_THE_REPOSITORY_BEING_ANALYSED — Path to the cloned repository from which commits will be taken.
- HASH_LIST_FILE — Path to a text file with commit hashes fetched using [this script (msg_analyzer)](../commit_msg_analyzer/msg_analyzer.ipynb).
- OUTPUT_PATH — The name of the directory in which the result of this script will be saved.
- FILE_EXTENSION_FILTER — File extensions that will be taken into account when parsing by this script.

In [None]:
# Local clone that the commits are read from.
PATH_TO_THE_REPOSITORY_BEING_ANALYSED = r"../commit_msg_analyzer/repo"
# Text file that is read line by line as the list of commit hashes.
HASH_LIST_FILE = r"../commit_msg_analyzer/result/20240524_elasticsearch_commit_only.txt"
# Directory that receives one sub-directory per processed commit hash.
OUTPUT_PATH = "result_hash_pairs"
# Only files whose path ends with one of these suffixes are accepted.
FILE_EXTENSION_FILTER = [".java"]

# Installing the required libraries

In [None]:
!pip install --upgrade pip
!pip install --upgrade tqdm
# NOTE(review): the `git` module imported by this notebook is normally provided by the
# GitPython distribution (`pip install GitPython`) — confirm that python-git pulls it in.
!pip install --upgrade python-git
# NOTE(review): `shutil` is part of the Python standard library; confirm that
# pytest-shutil is actually required by this notebook.
!pip install --upgrade pytest-shutil

# Importing libraries

In [None]:
import os
import git
import shutil
from tqdm import tqdm

# Presetting

In [None]:
# Create the output directory if it doesn't exist.
# Only OUTPUT_PATH is created here: the repository directory must already
# contain a clone, so creating an empty one in its place would not help.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [None]:
# Read the list of hashes from the file, one hash per line.
# Surrounding whitespace is stripped and blank lines are dropped so that
# empty strings are never handed to repo.commit() later on.
with open(HASH_LIST_FILE, 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    hash_list_array = [commit_hash for commit_hash in stripped_lines if commit_hash]

In [None]:
# Open the repository located at PATH_TO_THE_REPOSITORY_BEING_ANALYSED;
# every commit lookup in this notebook goes through this object.
repo = git.Repo(PATH_TO_THE_REPOSITORY_BEING_ANALYSED)

In [None]:
# Column names intended for a table of commit information.
# NOTE(review): no DataFrame is built from this list in the visible code —
# confirm whether it is still needed.
columns = ['hash', 'file', 'change_type', 'new_file']

In [None]:
# Function to save file contents
def save_file_content(commit, filepath, output_dir):
    """Write the content of *filepath* as stored in *commit* below *output_dir*.

    Args:
        commit: Object whose ``tree`` can be indexed with ``/`` to obtain a
            blob exposing ``data_stream``, and which carries a ``hexsha``.
        filepath: Repository-relative path of the file to export.
        output_dir: Directory under which the file is written; the relative
            path of the file is preserved inside it.

    Returns:
        True when the file was written, False when any error occurred
        (the error is printed rather than raised).
    """
    try:
        blob = commit.tree / filepath
        file_output_path = os.path.join(output_dir, filepath)
        parent_dir = os.path.dirname(file_output_path)
        # dirname is '' when the file lands directly in the current directory,
        # and os.makedirs('') raises, so only create real directories.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_output_path, 'wb') as f:
            f.write(blob.data_stream.read())
        return True
    except Exception as e:
        # The handler must not raise itself, e.g. when commit is None and
        # therefore has no hexsha attribute.
        commit_id = getattr(commit, 'hexsha', commit)
        print(f"Error saving file {filepath} from commit {commit_id}: {e}")
        return False

In [None]:
# Function to check file extension
def has_valid_extension(filepath):
    """Return True if *filepath* ends with any suffix in FILE_EXTENSION_FILTER."""
    for allowed_suffix in FILE_EXTENSION_FILTER:
        if filepath.endswith(allowed_suffix):
            return True
    return False

# Main part

In [None]:
# For every hash, export the previous and current version of each changed file
# into OUTPUT_PATH/<hash>/prev and OUTPUT_PATH/<hash>/curr.
# A commit is exported only when every entry of its diff against the first
# parent is a modification ('M') of a file with an accepted extension.
for currhash in tqdm(hash_list_array, total=len(hash_list_array), desc="Processing:", ascii=True):
    print(f"Processing hash: {currhash}")
    try:
        commit = repo.commit(currhash)
        if not commit.parents:
            # A root commit has no previous version to pair with; passing None
            # to commit.diff() would compare against the working tree instead.
            print(f"Skipping hash {currhash}: commit has no parent")
            continue
        parent_commit = commit.parents[0]

        # Validate the whole diff first so nothing is written (and then
        # deleted again) for commits that do not qualify.
        modified_paths = []
        only_valid_modifications = True
        for diff in commit.diff(parent_commit):
            file_path = diff.a_path if diff.a_path else diff.b_path
            if diff.change_type == 'M' and has_valid_extension(file_path):
                modified_paths.append(file_path)
            else:
                only_valid_modifications = False
                break

        # Skip disqualified commits and commits without any changed file,
        # which would otherwise leave empty prev/curr directories behind.
        if not only_valid_modifications or not modified_paths:
            continue

        commit_output_path = os.path.join(OUTPUT_PATH, currhash)
        prev_output_path = os.path.join(commit_output_path, 'prev')
        curr_output_path = os.path.join(commit_output_path, 'curr')
        os.makedirs(prev_output_path, exist_ok=True)
        os.makedirs(curr_output_path, exist_ok=True)

        for file_path in modified_paths:
            saved_prev = save_file_content(parent_commit, file_path, prev_output_path)
            saved_curr = save_file_content(commit, file_path, curr_output_path)
            if not (saved_prev and saved_curr):
                # An incomplete prev/curr pair is not usable for comparison,
                # so the whole commit directory is discarded.
                shutil.rmtree(commit_output_path)
                break

    except Exception as e:
        print(f"Error processing hash {currhash}: {e}")

# Run the script to analyze the collected data using PVS-Studio

- pvs — Path to the PVS-Studio executable jar file.
- dir — Path to the directory containing the file versions collected for each commit (the OUTPUT_PATH directory).
- max-jobs — Number of simultaneously running processes in the background.

In [None]:
# NOTE(review): the --pvs value is an absolute, machine-specific path — point it
# at the local PVS-Studio jar before running.
# --dir is given the same directory name as OUTPUT_PATH ("result_hash_pairs").
!./start_analysis.sh --pvs /home/qqushka/pvs-studio-java/7.30.80678/pvs-studio.jar --dir ./result_hash_pairs --max-jobs 2