# File paths and custom names

- PATH_TO_HASH_PAIRS_FILES — Path to the directory that holds one sub-directory per hash pair, each containing the files parsed from the two commit versions. It is produced by [this script (getting_all_changes)](getting_all_changes.ipynb).
- JSON_PWS_STUDIO — File name of the PVS-Studio JSON report.
- CSV_OUTPUT_CWE_FIX — Name of the output CSV file with the fixes found.
- CSV_OUTPUT_FOUND_CWE — Name of the output CSV file with the vulnerabilities found.

In [None]:
# Directory that is listed to obtain the per-commit hash-pair sub-directories
PATH_TO_HASH_PAIRS_FILES = "result_hash_pairs"
# File name of the PVS-Studio report looked up inside each 'curr' / 'prev' directory
JSON_PWS_STUDIO = "PVS-Studio.json"
# CSV file the collected bug-fix functions are written to
CSV_OUTPUT_CWE_FIX = "bug_fix_functions.csv"
# CSV file name reserved for the vulnerable functions
CSV_OUTPUT_FOUND_CWE = "vulnerable_functions.csv"

# Installing the required libraries

In [None]:
!pip install --upgrade pip
!pip install --upgrade tqdm
!pip install --upgrade pandas
!pip install --upgrade tree-sitter
!pip install --upgrade tree-sitter-java

# Importing libraries

In [None]:
import os
import re
import json
from tqdm import tqdm
import pandas as pd
import tree_sitter_java as tsjava
from tree_sitter import Language, Parser

# Selecting an installed language for parsing

In [None]:
# Wrap the grammar shipped by the tree-sitter-java package
JAVA_LANGUAGE = Language(tsjava.language())

# Creating a parser
# The language is handed to the constructor: `Parser.set_language` is not
# available in recent py-tree-sitter releases, while the constructor argument
# is accepted by every release that supports `Language(tsjava.language())`.
parser = Parser(JAVA_LANGUAGE)

# Parsing Java functions

In [None]:
# Function to parse Java file and extract function
import time


def extract_functions_from_file(filepath, parser, line):
    """Return the source lines of the Java method body that covers ``line``.

    The file is parsed with ``parser`` and every ``method_declaration`` that
    has a body is inspected in the order the query reports it; the first body
    whose line span contains ``line`` wins.

    Args:
        filepath: Path of the Java source file to read.
        parser: tree-sitter parser used to build the syntax tree.
        line: 1-based line number to look up.

    Returns:
        The text from the first to the last line of the matching method body
        (whole lines, joined with ``"\\n"``), or ``None`` when no method body
        spans ``line``.

    Raises:
        OSError: If ``filepath`` cannot be opened.
    """
    # Explicit encoding keeps the result independent of the machine locale;
    # undecodable bytes are replaced instead of aborting the whole run.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        code = f.read()

    code_lines = code.split("\n")

    # Query to retrieve method declarations
    query = JAVA_LANGUAGE.query("""
    (method_declaration
        name: (identifier) @method_name
        body: (block) @method_body)
    """)

    root_node = parser.parse(bytes(code, "utf-8")).root_node

    # Getting syntax tree node captures. Releases that no longer expose
    # `Query.captures` provide the same operation through `QueryCursor`.
    if hasattr(query, "captures"):
        captures = query.captures(root_node)
    else:
        from tree_sitter import QueryCursor
        captures = QueryCursor(query).captures(root_node)

    # Depending on the py-tree-sitter release the captures are either a
    # mapping {capture name: [nodes]} or a list of (node, capture name) pairs.
    # Only the body capture is used: the name capture spans a single line, so
    # matching it would return just the signature line instead of the method.
    if isinstance(captures, dict):
        body_nodes = captures.get("method_body", [])
    else:
        body_nodes = [capture[0] for capture in captures if capture[1] == "method_body"]

    for node in body_nodes:
        start_line = node.start_point[0]  # 0-based row of the opening brace
        end_line = node.end_point[0]      # 0-based row of the closing brace
        if start_line + 1 <= line <= end_line + 1:
            return "\n".join(code_lines[start_line:end_line + 1])

    return None

# List of all vulnerable functions
vulnerable_functions = []

# List of all fixed functions
bug_fix_functions = []


def _load_json(path):
    """Read and decode one JSON report file."""
    with open(path, 'r', encoding='utf-8') as report_file:
        return json.load(report_file)


def _collect_cwe_positions(report):
    """Return (cwe, file, line) for every position of every warning whose 'cwe' is not 0."""
    positions = []
    for warning in report.get('warnings', []):
        cwe = warning.get('cwe')
        if cwe == 0:
            continue
        for position in warning.get('positions', []):
            positions.append((cwe, position['file'], position['line']))
    return positions


def _relative_to_version_dir(filepath, hash_dir, version):
    """Strip everything up to and including the version directory from a report path.

    A plain greedy ``.*curr`` / ``.*prev`` also swallows later path parts that
    merely contain those letters (``concurrent``, ``preview``, ``previous``),
    which yields a path that does not exist. The prefix is therefore anchored,
    in order of preference, on ``<hash_dir>/<version>`` and on the first path
    component that is exactly ``<version>``. Only if neither is present does
    the function fall back to the greedy substitution.
    """
    sep = r'[\\/]'
    hash_pat = re.escape(hash_dir)
    version_pat = re.escape(version)
    anchored_patterns = (
        rf'^.*?{hash_pat}{sep}{version_pat}(?={sep})',
        rf'^(?:.*?{sep})?{version_pat}(?={sep})',
    )
    for pattern in anchored_patterns:
        stripped, replaced = re.subn(pattern, '', filepath, count=1)
        if replaced:
            return stripped
    return re.sub(rf'.*{version_pat}', '', filepath)


# Get a list of all commit hashes
hash_pairs = os.listdir(PATH_TO_HASH_PAIRS_FILES)

for hash_dir in tqdm(hash_pairs, total=len(hash_pairs), desc="Processing:", ascii=True):
    hash_dir_path = os.path.join(PATH_TO_HASH_PAIRS_FILES, hash_dir)

    curr_version_dir = os.path.join(hash_dir_path, 'curr')
    prev_version_dir = os.path.join(hash_dir_path, 'prev')

    curr_json_path = os.path.join(curr_version_dir, JSON_PWS_STUDIO)
    prev_json_path = os.path.join(prev_version_dir, JSON_PWS_STUDIO)

    # Both reports are needed to compare the two versions
    if not (os.path.exists(curr_json_path) and os.path.exists(prev_json_path)):
        continue

    curr_data = _load_json(curr_json_path)
    prev_data = _load_json(prev_json_path)

    curr_positions = _collect_cwe_positions(curr_data)
    prev_positions = _collect_cwe_positions(prev_data)

    # Determining the positions of corrections: only commits that reduce the
    # number of reported CWE positions are treated as fixes
    if len(curr_positions) >= len(prev_positions):
        continue

    # Transform positions into sets with file paths made relative to the version directory
    curr_positions_set = {
        (cwe, _relative_to_version_dir(filepath, hash_dir, 'curr'), line)
        for cwe, filepath, line in curr_positions
    }
    prev_positions_set = {
        (cwe, _relative_to_version_dir(filepath, hash_dir, 'prev'), line)
        for cwe, filepath, line in prev_positions
    }

    # Sorted so that the de-duplication below keeps the same record on every run
    bug_fix_positions_set = prev_positions_set - curr_positions_set
    bug_fix_positions = sorted(bug_fix_positions_set, key=lambda pos: (pos[1], pos[2]))

    # Set for storing unique functions with fixes
    bug_fix_functions_set = set()

    for fix_cwe, filepath, line in bug_fix_positions:
        # The relative path starts with a separator, hence plain concatenation
        prev_file = f'{prev_version_dir}{filepath}'
        curr_file = f'{curr_version_dir}{filepath}'

        # A file removed or renamed by the commit has no counterpart to compare with
        if not (os.path.isfile(prev_file) and os.path.isfile(curr_file)):
            continue

        # NOTE: the same line number is looked up in both versions
        function_from_bad_code = extract_functions_from_file(prev_file, parser, line)
        function_from_good_code = extract_functions_from_file(curr_file, parser, line)

        # Checking and recording unique corrections
        if not function_from_bad_code or not function_from_good_code:
            continue
        if function_from_bad_code == function_from_good_code:
            continue
        # Both versions must contain more than one line break
        if function_from_good_code.count('\n') <= 1 or function_from_bad_code.count('\n') <= 1:
            continue
        if function_from_bad_code in bug_fix_functions_set:
            continue

        bug_fix_functions_set.add(function_from_bad_code)
        bug_fix_functions.append({
            'Hash': hash_dir,
            'Fixed_CWE': fix_cwe,
            'Line_in_vulnerable_code': line,
            'Vulnerable_code': function_from_bad_code.strip(),
            'Fixed_code': function_from_good_code.strip(),
            # hash_dir is escaped so it is matched literally inside the pattern
            'File_path': re.sub(fr'.*{re.escape(hash_dir)}', '', filepath)
        })

In [None]:
# Saving results
# The columns are declared explicitly so that an empty result still produces
# a CSV header and a 'Hash' column for the cells that follow.
df_bug_fix = pd.DataFrame(
    bug_fix_functions,
    columns=[
        'Hash',
        'Fixed_CWE',
        'Line_in_vulnerable_code',
        'Vulnerable_code',
        'Fixed_code',
        'File_path',
    ],
)
df_bug_fix.to_csv(CSV_OUTPUT_CWE_FIX, index=False)

In [None]:
# Number of commits with fixes: count of distinct values in the 'Hash' column,
# i.e. hash-pair directories that contributed at least one recorded fix
df_bug_fix['Hash'].nunique()

In [None]:
# Print a summary of the collected fixes table (columns, non-null counts, dtypes)
df_bug_fix.info()