In [1]:
import difflib
from fileinput import filename
from unittest import case
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pydriller import Repository
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
selected_repos = {"JAX":"https://github.com/jax-ml/jax","SKIA":"https://github.com/google/skia","LLAMA":"https://github.com/meta-llama/llama3"}


In [12]:
def clean_diff(diff_str):
    lines = diff_str.split('\n')
    cleaned_lines = []
    for line in lines:
        if not line.startswith(('---', '+++', '@@', 'diff --git')) and line.strip():
            cleaned_lines.append(line[1:].strip())
    return "".join(cleaned_lines)


for repo_name, repo_url in selected_repos.items():
    print(f"--- Analyzing repository: {repo_name} ---")

    repo_data = []
    modified_file_count = 0
    file_limit = 100
    
    try:
        for commit in Repository(repo_url).traverse_commits():
            if modified_file_count >= file_limit:
                break
                
            parent_sha = commit.parents[0] if commit.parents else "None"
            
            for mod in commit.modified_files:
                if modified_file_count >= file_limit:
                    break
                
                # Skip files that are newly added (old_path is None) or deleted (new_path is None)
                if mod.old_path is None or mod.new_path is None:
                    continue

                diff_myers1 = ""
                diff_hist2 = ""
                discrepancy = "N/A"

                if parent_sha != "None":
                    try:
                        # Use the new robust git diff syntax to compare specific files at specific commits
                        diff_myers1 = subprocess.check_output(
                            ["git", "diff", "--diff-algorithm=myers", "--full-index", f"{parent_sha}:{mod.old_path}", f"{commit.hash}:{mod.new_path}"],
                            text=True, stderr=subprocess.DEVNULL
                        )
                        diff_hist2 = subprocess.check_output(
                            ["git", "diff", "--diff-algorithm=histogram", "--full-index", f"{parent_sha}:{mod.old_path}", f"{commit.hash}:{mod.new_path}"],
                            text=True, stderr=subprocess.DEVNULL
                        )

                        diff_myers1_cleaned = clean_diff(diff_myers1)
                        diff_hist2_cleaned = clean_diff(diff_hist2)
                        
                        discrepancy = "Yes" if diff_myers1_cleaned != diff_hist2_cleaned else "No"
                        
                    except subprocess.CalledProcessError as e:
                        print(f"Error running git diff for commit {commit.hash} on file {mod.new_path}: {e}")

                file_data = {
                    'old_filepath': mod.old_path,
                    'new_filepath': mod.new_path,
                    'commitSHA': commit.hash,
                    'parentcommitSHA': parent_sha,
                    'commit_message': commit.msg,
                    'diff_myers1': diff_myers1,
                    'diff_hist2': diff_hist2,
                    'Discrepancy': discrepancy
                }
                print(f"{modified_file_count} : Data added for {mod.new_path} in repo {repo_name}")
                repo_data.append(file_data)
                modified_file_count += 1
                
    except Exception as e:
        print(f"An error occurred while analyzing {repo_name}: {e}")
        continue

    if repo_data:
        df = pd.DataFrame(repo_data)
        output_filename = f"{repo_name.lower()}_data.csv"
        df.to_csv(output_filename, index=False)
        print(f"Successfully generated {output_filename}")
    else:
        print(f"No data was collected for {repo_name}.")


--- Analyzing repository: JAX ---
Error running git diff for commit 1d902436ce39b8caf1e8e5e3dc2f27990b9fda0b on file jax/lib/xla_bridge.py: Command '['git', 'diff', '--diff-algorithm=myers', '--full-index', 'a30e858e59d7184b9e54dc3f3955238221d70439:jax/lib/xla_bridge.py', '1d902436ce39b8caf1e8e5e3dc2f27990b9fda0b:jax/lib/xla_bridge.py']' returned non-zero exit status 128.
0 : Data added for jax/lib/xla_bridge.py in repo JAX
Error running git diff for commit fd8d83dca27fd8e8b642743e669bda6285f66e9e on file build/WORKSPACE: Command '['git', 'diff', '--diff-algorithm=myers', '--full-index', 'f78c05195d4edab4baf9664acc3f4aab9f09281f:WORKSPACE', 'fd8d83dca27fd8e8b642743e669bda6285f66e9e:build/WORKSPACE']' returned non-zero exit status 128.
1 : Data added for build/WORKSPACE in repo JAX
Error running git diff for commit fd8d83dca27fd8e8b642743e669bda6285f66e9e on file build/build_jax.sh: Command '['git', 'diff', '--diff-algorithm=myers', '--full-index', 'f78c05195d4edab4baf9664acc3f4aab9f092