In [1]:
"""
This code extracts commit data from a Git repository using PyDriller 
and saves it in a CSV file. It sets the path to the repository and the CSV file name and header. 
If the file exists, it reads the last commit ID from it and continues processing from there. 
For each commit, it gathers information about the commit and writes it to the CSV file, 
skipping commits that have already been processed. 
Finally, it prints progress updates every 25 commits until the first 1000 have been processed, and then every 1000 commits.

Script Author: Jesus Cantu 
"""

from pydriller import Repository
import csv
import requests
import subprocess
import os.path

# Repository to mine. Replace this with your own repository of interest; it can be
# a local path or a URL, e.g. https://github.com/Anuken/Mindustry
# If using a local path, clone the repository using the mirror option first, e.g.:
#   git clone --mirror https://github.com/scikit-learn/scikit-learn.git
repo_url = '/Users/jesuscantu/Desktop/Temp/Cloned_Repos/Mindustry.git' # local path

# Set up CSV file and header row.
# header_row names the CSV columns; the first column ('commit_id') is what the
# resume logic further down reads back to find the last processed commit.
file_name = '/Users/jesuscantu/Desktop/Temp/Mindustry_commit_data.csv' # ~/filepath/csv.name
header_row = [
    'commit_id', 'message', 'author_name', 'author_email', 'author_date', 'author_tz',
    'committer_name', 'committer_email', 'committer_date', 'committer_tz', 'in_main',
    'is_merge', 'num_deletes', 'num_inserts', 'net_lines', 'num_files', 'branches',
    'files', 'parents', 'dmm_unit_size', 'dmm_unit_complexity', 'dmm_unit_interfacing',
]

# Decide whether to resume an earlier run (append) or start a fresh CSV (write).
# Produces two names used by the rest of the cell:
#   append_mode    -- 'a' to append to an existing file, 'w' to create/overwrite it
#   last_commit_id -- first column of the last data row already in the file,
#                     or '' when there is no commit row to resume from
last_commit_id = ''
if os.path.isfile(file_name):
    # The file already exists: scan it to find the last commit ID that was written.
    # newline='' is what the csv module documentation asks for when reading, so that
    # line breaks embedded in quoted fields (e.g. commit messages) are handled by csv.
    has_rows = False
    with open(file_name, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # Blank line: there is no first column to read, so skip it
                continue
            has_rows = True
            last_commit_id = str(row[0]) # Convert the commit hash to a string

    if last_commit_id == header_row[0]:
        # Only the header row is present; that is not a commit to resume from
        last_commit_id = ''

    if has_rows:
        if last_commit_id:
            print(f'Resuming processing from commit {last_commit_id}')
        append_mode = 'a'
    else:
        # A zero-length file has no header yet, so treat it like a new file
        print('Creating new CSV file')
        append_mode = 'w'
else:
    # If the file doesn't exist, start a new one
    print('Creating new CSV file')
    append_mode = 'w'

# Walk the repository with PyDriller and write one CSV row per commit.
# Reads repo_url, file_name, header_row, append_mode and (when resuming)
# last_commit_id, all of which are set earlier in this cell.
with open(file_name, mode=append_mode, newline='') as csv_file:
    writer = csv.writer(csv_file)
    if append_mode == 'w':
        writer.writerow(header_row)

    # When resuming, every commit up to AND including the last one already in the
    # CSV must be skipped; skipping only that single commit would append every
    # earlier commit a second time. The header value is ignored so a header-only
    # file is processed from the first commit.
    # NOTE(review): this assumes traverse_commits() yields commits in the same
    # order as the run that produced the existing file -- confirm for repositories
    # whose history was rewritten between runs.
    resume_after = None
    if append_mode == 'a' and last_commit_id and last_commit_id != header_row[0]:
        resume_after = last_commit_id
    skipping = resume_after is not None

    # Loop over each PyDriller commit to transform it to a commit usable for analysis later
    print("Processing repository...")
    commit_count = 0
    for commit in Repository(repo_url).traverse_commits():
        commit_hash = commit.hash

        # Skip commits that have already been processed
        if skipping:
            if commit_hash == resume_after:
                # This is the last commit already on disk; new rows start after it
                skipping = False
            continue

        # Gather a list of files modified in the commit
        files = []
        try:
            for f in commit.modified_files:
                if f.new_path is not None:
                    files.append(f.new_path)
        except Exception:
            # Best effort: report the commit and move on rather than abort the whole run
            print('Could not read files for commit ' + commit_hash)
            continue

        # Capture information about the commit in object format so I can reference it later
        record = {
            'commit_id': commit_hash,
            'message': commit.msg,
            'author_name': commit.author.name,
            'author_email': commit.author.email,
            'author_date': commit.author_date,
            'author_tz': commit.author_timezone,
            'committer_name': commit.committer.name,
            'committer_email': commit.committer.email,
            'committer_date': commit.committer_date,
            'committer_tz': commit.committer_timezone,
            'in_main': commit.in_main_branch,
            'is_merge': commit.merge,
            'num_deletes': commit.deletions,
            'num_inserts': commit.insertions,
            'net_lines': commit.insertions - commit.deletions,
            'num_files': commit.files,
            'branches': ', '.join(commit.branches), # Comma separated list of branches the commit is found in
            'files': ', '.join(files), # Comma separated list of files the commit modifies
            'parents': ', '.join(commit.parents), # Comma separated list of parents
            # PyDriller Open Source Delta Maintainability Model (OS-DMM) stat.
            # See https://pydriller.readthedocs.io/en/latest/deltamaintainability.html for metric definitions
            'dmm_unit_size': commit.dmm_unit_size,
            'dmm_unit_complexity': commit.dmm_unit_complexity,
            'dmm_unit_interfacing': commit.dmm_unit_interfacing,
        }
        # Emit the values in header order so the columns always line up with header_row
        writer.writerow([record[column] for column in header_row])

        commit_count += 1
        # Print progress every 25 commits and only every 1000 commits after 1000
        if (commit_count < 1000 and commit_count % 25 == 0) or commit_count % 1000 == 0:
            print(f'{commit_count} commits processed')

    if skipping:
        # The resume point never showed up, so nothing was appended; say so
        # instead of finishing silently with zero new rows.
        print(f'Warning: commit {resume_after} was not found in the repository; no new commits were written')

    print("Finished processing repository.")


Resuming processing from commit 285ee665ee23b86574d3f4810c0c0654bdc9fdfc
Processing repository...
25 commits processed
50 commits processed
75 commits processed
100 commits processed
125 commits processed
150 commits processed
175 commits processed
200 commits processed
225 commits processed
250 commits processed
275 commits processed
300 commits processed
325 commits processed
350 commits processed
375 commits processed
400 commits processed
425 commits processed
450 commits processed
475 commits processed
500 commits processed
525 commits processed
550 commits processed
575 commits processed
600 commits processed
625 commits processed
650 commits processed
675 commits processed
700 commits processed
725 commits processed
750 commits processed
775 commits processed
800 commits processed
825 commits processed
850 commits processed
875 commits processed
900 commits processed
925 commits processed
950 commits processed
975 commits processed
1000 commits processed
2000 commits processed
3