In [None]:
import pandas as pd

In [None]:
# Disable truncation so displayed DataFrames show every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read dfVulCWE (processed BigVul commits)

# All paths need to be modified to match your environment

In [None]:
# Load the BigVul commit table from a CSV in the current working directory
# (relative path). Later cells read the columns 'project', 'codeLink',
# 'commit_id', 'CWE ID' and 'CVE ID' from this frame.
dfVulCWE = pd.read_csv("dfVulCWE.csv")

In [None]:
import pandas as pd

# Count the distinct CWE IDs attached to each commit. The 'CWE ID' column is
# used (not 'CVE ID') so that the computed values match what the variable
# names and the printed message claim to report. NaN CWE values are ignored
# by nunique().
cwe_per_commit = dfVulCWE.groupby('commit_id')['CWE ID'].nunique()

# Keep only the commits associated with more than one weakness type.
commits_with_multiple_cwes = cwe_per_commit[cwe_per_commit > 1]

print("Commits with multiple distinct CWE IDs:")
print(commits_with_multiple_cwes)

# New code to get commit URLs

In [None]:
import pandas as pd

# Build the de-duplicated table of GitHub-hosted commits, ordered by project.
# The steps run in the same order as before (dedupe -> GitHub filter -> sort)
# so the resulting row order, and therefore the positions in `commit_urls`,
# are unchanged.
df_filtered = (
    dfVulCWE[['project', 'codeLink', 'commit_id', 'CWE ID', 'CVE ID']]
    .drop_duplicates()
    .loc[lambda frame: frame['codeLink'].str.startswith('https://github.com/', na=False)]
    .sort_values(by='project')
)

# Persist the filtered table for later reference.
df_filtered.to_csv("commitUrlFinalBigVul.csv", index=False)

# One entry per distinct commit link; the batch loop addresses this list by position.
dfResult = df_filtered[['codeLink']].drop_duplicates()
commit_urls = dfResult['codeLink'].tolist()


# Get file extensions to check which ones to discard

# GitHub tokens need to be added before running

In [None]:
import requests
import os
import shutil
from urllib.parse import urlparse
import re
from base64 import b64decode
from datetime import datetime
import difflib
import time
import logging
import csv

# GitHub personal access tokens used in rotation by get_current_token() /
# switch_to_next_token(). They are read from the GITHUB_TOKENS environment
# variable (comma-separated) so that secrets are never stored in the notebook.
# The list is empty when the variable is unset.
GITHUB_TOKENS = [
    token.strip()
    for token in os.environ.get("GITHUB_TOKENS", "").split(",")
    if token.strip()
]

# Error log destination. The parent directory is created up front because
# logging.basicConfig opens the file immediately and fails if it is missing.
LOG_FILE_PATH = r"M:\FULL_DATA_COLLECTED\commit_errors_log.txt"
_log_directory = os.path.dirname(LOG_FILE_PATH)
if _log_directory:
    os.makedirs(_log_directory, exist_ok=True)
logging.basicConfig(
    filename=LOG_FILE_PATH,
    level=logging.ERROR,
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Position in GITHUB_TOKENS of the token currently in use.
current_token_index = 0
# Passed as `timeout=` to every requests.get call.
REQUEST_TIMEOUT = 10
# Maximum number of attempts made by get_with_retries for a single URL.
MAX_RETRIES = 3
# Pause (time.sleep argument) between failed attempts.
RETRY_DELAY = 5
# Pause after every successful request.
FETCH_DELAY = 0.5
# Pause after each file version has been written to disk.
FILE_PROCESS_DELAY = 0.5
# Pause between two commits in the batch loop.
COMMIT_PROCESS_DELAY = 2

# CSV file that accumulates the run time of each processed index range.
TIME_LOG_PATH = r"M:\FULL_DATA_COLLECTED\execution_time_log.csv"

# (url, message) pairs already written to the log; used by log_error to
# suppress duplicate entries.
logged_errors = set()

def get_current_token():
    """Return the entry of GITHUB_TOKENS selected by current_token_index."""
    # Only reads the module-level index, so no `global` declaration is required.
    active_token = GITHUB_TOKENS[current_token_index]
    return active_token

def switch_to_next_token():
    """Move current_token_index to the following token, wrapping to 0 after the last one."""
    global current_token_index
    token_count = len(GITHUB_TOKENS)
    current_token_index = (current_token_index + 1) % token_count

def check_rate_limit(min_remaining=1000):
    """Rotate to the next GitHub token when the current one is running low on quota.

    Queries the ``/rate_limit`` endpoint with the current token and calls
    ``switch_to_next_token()`` if fewer than ``min_remaining`` requests are left.

    Args:
        min_remaining: Threshold below which the token is rotated. Defaults to
            1000, the value previously hard-coded here.

    Raises:
        requests.exceptions.RequestException: If the rate-limit request fails
            or returns an error status.

    Note:
        The quota of the token rotated *to* is not checked, and with a single
        configured token the rotation selects that same token again.
    """
    headers = {'Authorization': f'token {get_current_token()}'}
    rate_limit_url = "https://api.github.com/rate_limit"
    response = requests.get(rate_limit_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    rate_limit_data = response.json()

    # Prefer `resources.core`; the top-level `rate` object is kept by GitHub
    # only for backward compatibility, so it is used purely as a fallback.
    core_limits = rate_limit_data.get('resources', {}).get('core') or rate_limit_data['rate']
    remaining = core_limits['remaining']

    if remaining < min_remaining:
        switch_to_next_token()

def log_error(commit_url, error_message, index=None):
    """Write an error entry to the log once per (commit_url, error_message) pair.

    Args:
        commit_url: URL the error relates to.
        error_message: Human-readable description of the failure.
        index: Optional batch index, prepended to the entry when given.
    """
    dedupe_key = (commit_url, error_message)
    if dedupe_key in logged_errors:
        # Already reported; keep the log free of repeats.
        return

    entry = f"Commit URL: {commit_url}\n  - Error: {error_message}"
    if index is not None:
        entry = f"[Index: {index}] {entry}"

    logging.error(entry)
    logged_errors.add(dedupe_key)

def get_with_retries(url, headers, params=None, index=None):
    """GET ``url`` with up to ``MAX_RETRIES`` attempts.

    Args:
        url: Address to request.
        headers: HTTP headers sent with every attempt.
        params: Optional query parameters.
        index: Optional batch index, forwarded to ``log_error``.

    Returns:
        The successful ``requests.Response``, or ``None`` when the server
        answers 404 (treated as a permanent miss, not retried).

    Raises:
        Exception: When no attempt succeeds. HTTP errors other than 404 and
            connect/read timeouts are retried; any other request failure
            aborts immediately.
    """
    attempts_made = 0
    for attempt in range(MAX_RETRIES):
        attempts_made = attempt + 1
        try:
            response = requests.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            time.sleep(FETCH_DELAY)
            return response
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception itself rather than from the
            # local `response`, which is not guaranteed to belong to this error.
            # `is not None` is required: a Response with an error status is falsy.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 404:
                log_error(url, "404 Not Found - The requested resource could not be found.", index)
                return None
            log_error(url, f"HTTPError - {str(e)}", index)
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout) as e:
            log_error(url, f"ConnectionTimeout - {str(e)}", index)
        except requests.exceptions.RequestException as e:
            # Non-retryable failure: stop immediately.
            log_error(url, f"Request failed: {str(e)}", index)
            break

        # Back off only when another attempt will actually follow.
        if attempts_made < MAX_RETRIES:
            time.sleep(RETRY_DELAY)

    raise Exception(f"Failed to get a response from {url} after {attempts_made} attempt(s).")

def fetch_all_commits_for_file(repo_owner, repo_name, file_path, index):
    """Collect every commit that touched ``file_path`` by paging through the GitHub commits API.

    Args:
        repo_owner: Repository owner (user or organisation).
        repo_name: Repository name.
        file_path: Path of the file inside the repository.
        index: Batch index, forwarded for error logging.

    Returns:
        List of commit objects in the order the API returned them. The list
        may be partial if a page request answers 404.
    """
    endpoint = f'https://api.github.com/repos/{repo_owner}/{repo_name}/commits'
    query = {'path': file_path, 'per_page': 100}
    collected = []
    page_number = 1

    while True:
        # Re-read the token on every page so a rotation takes effect immediately.
        auth_headers = {'Authorization': f'token {get_current_token()}'}
        query['page'] = page_number
        page_response = get_with_retries(endpoint, auth_headers, params=query, index=index)

        if page_response is None:
            log_error(endpoint, "404 error when fetching commits", index)
            return collected

        page_commits = page_response.json()
        if not page_commits:
            # An empty page marks the end of the history.
            return collected

        collected += page_commits
        page_number += 1

def remove_empty_directory(directory):
    """Delete ``directory`` if it exists and contains no entries, and print a notice."""
    if not os.path.exists(directory):
        return
    if os.listdir(directory):
        # Something is inside; leave it alone.
        return

    shutil.rmtree(directory)
    print(f"Removed empty directory: {directory}")

# Extensions (lower-case) of files that are skipped when collecting history.
# The empty string covers files that have no extension at all.
NON_CODE_EXTENSIONS = ['', '.txt', '.md', '.jpg', '.png', '.jpeg', '.pdf', '.xml', '.conf', '.man', '.texi']

def is_non_code(file_name):
    """Return True when ``file_name`` has an extension listed in NON_CODE_EXTENSIONS.

    The comparison is case-insensitive, so ``README.MD`` and ``logo.PNG`` are
    recognised as non-code just like their lower-case spellings.

    Args:
        file_name: File name or path; only the final extension is inspected.
    """
    file_extension = os.path.splitext(file_name)[1].lower()
    return file_extension in NON_CODE_EXTENSIONS

def log_execution_time(range_str, execution_time):
    """Append a timing row to the CSV at TIME_LOG_PATH, preceded by a header if the file is new.

    Args:
        range_str: Label of the processed index range.
        execution_time: Value written to the time column.
    """
    rows = [[range_str, execution_time]]
    if not os.path.isfile(TIME_LOG_PATH):
        # First write to this file: emit the column names before the data row.
        rows.insert(0, ["Range", "Total Execution Time (seconds)"])

    with open(TIME_LOG_PATH, mode='a', newline='') as log_file:
        csv.writer(log_file).writerows(rows)

def _parse_github_commit_url(commit_url):
    """Return ``(owner, repo, commit_hash)`` taken from a GitHub commit URL, or ``None`` if its path does not match."""
    match = re.match(r'/([^/]+)/([^/]+)/commit/([a-f0-9]+)', urlparse(commit_url).path)
    return match.groups() if match else None


def _save_file_versions(commit_url, index, repo_owner, repo_name, file_path,
                        commits_to_process, fixed_sha, output_dir, messages_dir, changes_dir):
    """Download ``file_path`` at each given commit and write its content, commit message and diff to disk."""
    from urllib.parse import quote

    # NOTE(review): output names are built from the basename only, so two files
    # with the same name in different directories of one commit write to the
    # same output files.
    file_name = os.path.basename(file_path)
    # Percent-encode the path ('/' is kept) so that characters such as '#', '?'
    # or '%' in a file name are not interpreted as URL syntax.
    encoded_path = quote(file_path)

    previous_content = None
    version_number = 1

    for commit in commits_to_process:
        sha = commit['sha']
        commit_date = commit['commit']['committer']['date']
        formatted_date = datetime.strptime(commit_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y%m%d_%H%M%S')
        commit_message = commit['commit']['message'].strip().replace('\n', ' ')
        file_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{encoded_path}?ref={sha}'

        version_suffix = "_fixed_version" if sha == fixed_sha else ""

        try:
            headers = {'Authorization': f'token {get_current_token()}'}
            file_response = get_with_retries(file_url, headers, index=index)
            if file_response is None:
                # The file does not exist at this commit (404); skip this version.
                continue
            file_content_base64 = file_response.json()['content']
            # Content that is not valid UTF-8 raises here and the version is
            # skipped (the error is logged by the handler below).
            file_content = b64decode(file_content_base64).decode('utf-8')

            file_version_path = os.path.join(
                output_dir, f"{file_name}_v{version_number}_{formatted_date}_{sha}{version_suffix}.txt")
            with open(file_version_path, 'w', encoding='utf-8') as f:
                f.write(file_content)

            commit_message_path = os.path.join(
                messages_dir,
                f"{file_name}_v{version_number}_{formatted_date}_{sha}_commit_message{version_suffix}.txt")
            with open(commit_message_path, 'w', encoding='utf-8') as f:
                f.write(commit_message)

            if previous_content is not None:
                diff = difflib.unified_diff(previous_content.splitlines(), file_content.splitlines(),
                                            fromfile=f'v{version_number-1}', tofile=f'v{version_number}')
                changes_file_path = os.path.join(
                    changes_dir, f"{file_name}_changes_v{version_number-1}_v{version_number}{version_suffix}.txt")
                with open(changes_file_path, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(diff))

            # Version numbers advance only for versions that were actually saved.
            previous_content = file_content
            version_number += 1

            time.sleep(FILE_PROCESS_DELAY)

        except Exception as e:
            error_message = f"Unexpected error while processing the file: '{file_path}'. Error: {str(e)}"
            log_error(commit_url, error_message, index)


def process_commit_link(commit_url, index):
    """Download the history of every code file touched by one GitHub fix commit.

    For each file changed by the commit (files matched by ``is_non_code`` are
    skipped) the file's commit history is fetched, and every version from the
    commit immediately before the fix through the newest commit is saved under
    ``M:\\FULL_DATA_COLLECTED\\<repo>\\<repo>_<hash>`` in three sub-folders:
    ``all_versions`` (file contents), ``commit_messages`` and
    ``file_changes_in_versions`` (unified diffs between consecutive versions).

    Args:
        commit_url: GitHub commit URL of the form
            ``https://github.com/<owner>/<repo>/commit/<hash>``.
        index: Position of the URL in the batch; used only for logging.

    Returns:
        None. All failures are logged through ``log_error`` instead of raised.
    """
    try:
        parsed = _parse_github_commit_url(commit_url)
        if parsed is None:
            error_message = f"Invalid commit URL: {commit_url}"
            log_error(commit_url, error_message, index)
            return
        repo_owner, repo_name, commit_hash = parsed

        full_data_dir = r"M:\FULL_DATA_COLLECTED"
        project_dir = os.path.join(full_data_dir, repo_name)
        commit_dir = os.path.join(project_dir, f"{repo_name}_{commit_hash}")
        output_dir = os.path.join(commit_dir, 'all_versions')
        messages_dir = os.path.join(commit_dir, 'commit_messages')
        changes_dir = os.path.join(commit_dir, 'file_changes_in_versions')

        check_rate_limit()

        commit_api_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/commits/{commit_hash}'

        headers = {'Authorization': f'token {get_current_token()}'}
        response = get_with_retries(commit_api_url, headers, index=index)
        if response is None:
            return
        commit_data = response.json()

        # Compare against the full SHA reported by the API: the hash in the URL
        # may be abbreviated, in which case it never equals a history entry's SHA.
        fixed_sha = commit_data.get('sha') or commit_hash

        for file_info in commit_data['files'] or []:
            file_path = file_info['filename']

            if is_non_code(file_path):
                continue

            commits = fetch_all_commits_for_file(repo_owner, repo_name, file_path, index)

            # Reverse the API order so the slice below runs from the pre-fix
            # commit towards later ones.
            commits.reverse()
            fixed_version_index = next(
                (idx for idx, commit in enumerate(commits) if commit['sha'] == fixed_sha),
                None,
            )

            if fixed_version_index is None:
                error_message = f"Fixed version {commit_hash} not found in commit history for {file_path}."
                log_error(commit_url, error_message, index)
                continue

            # Start one commit before the fix (when there is one) so the
            # pre-fix version of the file is captured as well.
            commits_to_process = commits[max(fixed_version_index - 1, 0):]

            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(messages_dir, exist_ok=True)
            os.makedirs(changes_dir, exist_ok=True)

            _save_file_versions(commit_url, index, repo_owner, repo_name, file_path,
                                commits_to_process, fixed_sha, output_dir, messages_dir, changes_dir)

        # If no version was saved, drop the empty sub-folders first; otherwise
        # the commit folder still contains them and is never considered empty.
        if os.path.isdir(output_dir) and not os.listdir(output_dir):
            for leftover_dir in (output_dir, messages_dir, changes_dir):
                remove_empty_directory(leftover_dir)
        remove_empty_directory(commit_dir)

        check_rate_limit()

    except Exception as e:
        error_message = f"Exception - {str(e)}"
        log_error(commit_url, error_message, index)

start_time = time.time()

# Inclusive range of positions in `commit_urls` handled by this run.
start_index = 401
end_index = 450

# Never index past the end of the list: a lookup failure inside the error
# handler would abort the run before the timing is logged.
last_index = min(end_index, len(commit_urls) - 1)

for index in range(start_index, last_index + 1):
    commit_url = commit_urls[index]
    try:
        process_commit_link(commit_url, index)
        print(f"{index}")
    except Exception as e:
        log_error(commit_url, f"Error processing commit at index {index}. Error: {str(e)}", index)
    time.sleep(COMMIT_PROCESS_DELAY)

end_time = time.time()
total_execution_time = end_time - start_time

# Record the range that was actually processed (identical to the requested
# range unless it had to be clamped).
log_execution_time(f"{start_index}-{last_index}", total_execution_time)

print("done")