In [1]:
from git import Repo, GitCommandError

import pandas as pd
import os

In [2]:
# Machine-specific project root; the data and the cloned repositories live in sub-folders of it.
project_root = "C:\\Users\\tobias.lindenbauer\\PycharmProjects\\vcs-actions-agent\\"

# Folder holding the repository metadata CSV files
path_to_data = os.path.join(project_root, 'data')
# Folder into which repositories are cloned
path_to_repositories = os.path.join(project_root, 'repos')

# Qualitative analysis of metadata

## Python

In [4]:
# 1. Read in repositories from CSV
python_repositories_metadata = pd.read_csv(os.path.join(path_to_data, 'python_repos.csv'))

In [4]:
len(python_repositories_metadata)

Let's take a look at the distribution of relevant numeric columns to get an overview of the dataset.

In [5]:
python_repositories_metadata

In [13]:
python_repositories_metadata[['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']].describe()

All repositories include more than one branch, and most have been forked. Curiously, `min(codeLines) = 1`, so at least one repository contains almost no code. The mean and the 25th percentile, however, indicate that the repositories are in good shape overall.

In [6]:
python_repositories_metadata[python_repositories_metadata.codeLines < 2]

In [5]:
pd.to_datetime(python_repositories_metadata.updatedAt).describe()

In [14]:
pd.to_datetime(python_repositories_metadata.createdAt).describe()

## Java

In [16]:
# 1. Read in repositories from CSV
java_repositories_metadata = pd.read_csv(os.path.join(path_to_data, 'java_repos.csv'))

In [17]:
len(java_repositories_metadata)

In [18]:
java_repositories_metadata[['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']].describe()

Most repositories have been forked. Again, the minimum is very low (`min(codeLines) = 2`), but the mean and the 25th percentile indicate that the repositories are in good shape overall.

In [19]:
pd.to_datetime(java_repositories_metadata.updatedAt).describe()

In [20]:
pd.to_datetime(java_repositories_metadata.createdAt).describe()

## Kotlin

In [21]:
# 1. Read in repositories from CSV
kotlin_repositories_metadata = pd.read_csv(os.path.join(path_to_data, 'kotlin_repos.csv'))

In [22]:
len(kotlin_repositories_metadata)

In [23]:
kotlin_repositories_metadata[['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']].describe()

Most repositories have been forked. Again, the minimum is very low (`min(codeLines) = 1`), but the mean and the 25th percentile indicate that the repositories are in good shape overall. For Kotlin, the number of contributors and branches seems to be lower in general.

In [24]:
pd.to_datetime(kotlin_repositories_metadata.updatedAt).describe()

In [25]:
pd.to_datetime(kotlin_repositories_metadata.createdAt).describe()

# Development of git history traversal and quantitative analysis of repositories

## Determine ratio of branches to files

In [4]:
import re 

In [5]:
# Clone the repository at position 1 of the Python metadata table, or reuse the
# local clone if one is already present in `path_to_repositories`.
repo_instance = None
repository_metadata = python_repositories_metadata.iloc[1]
# "owner/name" becomes the folder name "owner__name"
repository_path = os.path.join(path_to_repositories, repository_metadata["name"].replace("/", "__"))
try:
    repo_instance = Repo.clone_from(f'https://github.com/{repository_metadata["name"]}.git',
                                    repository_path)
except GitCommandError as e:
    # Only the "target directory already exists" failure is recoverable. Any other
    # failure is re-raised so that the following cells never run with
    # `repo_instance = None`.
    if 'already exists' not in e.stderr:
        raise
    print('Repository already exists, using local directory instead of cloning.')
    repo_instance = Repo(repository_path)

Repository already exists, using local directory instead of cloning.


In [9]:
from src.repository_data_scraper import RepositoryDataScraper
from src.programming_language import ProgrammingLanguage
repo_scraper = RepositoryDataScraper(repository=repo_instance, sliding_window_size=2,
                                         language_to_scrape_for=ProgrammingLanguage.TEXT,
                                         repository_path=repository_path)

In [12]:
%%timeit -n 3
repo_scraper.scrape()

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git blame --incremental detected-amazon-mws-auth-token.txt
  stderr: 'fatal: no such path 'detected-amazon-mws-auth-token.txt' in HEAD'

In [21]:
def _count_repository_files(repository_path):
    """Count the visible files below `repository_path`.

    Hidden folders (name starting with '.') are pruned from the walk, and hidden
    files as well as files without a file ending are ignored.

    Returns:
        A tuple `(num_python_files, num_total_files)`. Python files are the counted
        files whose name ends in '.py', so they are always a subset of the total.
    """
    num_python_files = 0
    num_total_files = 0
    for _directory, subdirs, files in os.walk(repository_path):
        # Prune hidden folders in place so that os.walk never descends into them
        # (e.g. `.git`), instead of visiting and then discarding every sub-folder.
        subdirs[:] = [subdir for subdir in subdirs if not subdir.startswith('.')]

        # Skip hidden files and files without file ending
        total_files = [f for f in files if re.match(r'^[^.].*\..*$', f)]
        # `endswith` instead of a substring test, so that '.pyc', '.pyi', ... are not counted
        python_files = [f for f in total_files if f.endswith('.py')]

        num_python_files += len(python_files)
        num_total_files += len(total_files)

    return num_python_files, num_total_files


# One record per successfully opened repository; the DataFrame is built once at the end
# instead of being grown cell by cell.
statistics_records = []
statistics_index = []

for i, repository_metadata in python_repositories_metadata.iloc[:15].iterrows():
    repo_instance = None
    repository_path = os.path.join(path_to_repositories, repository_metadata["name"].replace("/", "__"))
    try:
        repo_instance = Repo.clone_from(f'https://github.com/{repository_metadata["name"]}.git',
                                        repository_path)
    except GitCommandError as e:
        if 'already exists' not in e.stderr:
            # Best effort: a repository that cannot be cloned is left out of the statistics,
            # but the reason is reported instead of being dropped silently.
            print(f'Skipping {repository_metadata["name"]}, cloning failed: {e}')
            continue
        # If already exists, create Repo instance of it
        print('Repository already exists, using local directory instead of cloning.')
        repo_instance = Repo(repository_path)

    # `repository_path` is already a complete path, so there is nothing to join it onto.
    os.chdir(repository_path)

    num_python_files, num_total_files = _count_repository_files(repository_path)

    statistics_index.append(i)
    statistics_records.append({
        # NOTE(review): `refs` presumably also contains tags and remote refs, not only
        # branches - confirm that this is the intended count.
        'branches': len(repo_instance.refs),
        'python_files': num_python_files,
        'total_files': num_total_files,
    })

# Indexed by the row label of the metadata table; columns now hold integers.
repositories_statistics = pd.DataFrame(statistics_records, index=statistics_index,
                                       columns=['branches', 'python_files', 'total_files'])

In [25]:
# Average number of files per branch over all repositories that were processed.
# `.mean()` divides by the actual number of rows instead of a hard-coded 10.
(repositories_statistics.total_files / repositories_statistics.branches).mean()

# Code experimentation
Note that this implementation is somewhat naive: it disregards the fact that a sliding window could capture results beyond the branching point of a branch (i.e., the commit where it was created), for example if its child commits in another branch also modify the same file. If we need more data, implementing this general case could be one way to move forward.

In [4]:
# 2. Generate a Repo instance for it
repository_metadata = python_repositories_metadata.iloc[2]
# "owner/name" becomes the folder name "owner__name"
repository_path = os.path.join(path_to_repositories, repository_metadata["name"].replace("/", "__"))
try:
    repo_instance = Repo.clone_from(f'https://github.com/{repository_metadata["name"]}.git', repository_path)
except GitCommandError as e:
    # Re-running this cell must not fail just because the clone is already on disk;
    # every other failure is still raised.
    if 'already exists' not in e.stderr:
        raise
    print('Repository already exists, using local directory instead of cloning.')
    repo_instance = Repo(repository_path)

# `repository_path` is already a complete path, so there is nothing to join it onto.
os.chdir(repository_path)

#   a. Run analytics operations to extract relevant metrics for overall data

# 3. Print overview

In [5]:
# Open the local clone of feast and make it the working directory.
feast_repository_path = os.path.join(path_to_repositories, 'feast-dev__feast')
repo_instance = Repo(feast_repository_path)
os.chdir(feast_repository_path)
os.getcwd()

## Find merge commits

In [8]:
# Message of the first commit yielded by `iter_commits()`
first_yielded_commit = next(repo_instance.iter_commits())
first_yielded_commit.message

In [9]:
# Number of merge commits yielded by `iter_commits(merges=True)`
sum(1 for _merge_commit in repo_instance.iter_commits(merges=True))

## Find sequential commits modifying the same file

In [50]:
# Open the local demo repository and make it the working directory.
demo_repository_path = os.path.join(path_to_repositories, 'demo-repo')
demo_repo = Repo(demo_repository_path)
os.chdir(demo_repository_path)
os.getcwd()

In [56]:
demo_repo.git.blame(r'-p', 'document.txt')

### Get all available branches and set up iteration

We will iterate over all branches to generate the metrics with which we want to evaluate the dataset and to find the entry points for dataset creation.

Get all branches, including remotes, so that we can then check them out and iterate over their commits. Since `branches` contains the raw output of git to stdout, we need to clean it before proceeding.

In [34]:
# Raw stdout of `git branch -a`, one entry per line
branches = pd.Series(repo_instance.git.branch('-a').split('\n'), name='branch_names')
print(len(branches))
# Branches will initially just be remote, to check them out and pull them we clean them by removing:
#   the remote prefix
#   any whitespace (this includes literal line breaks)
#   the asterisk denoting the current branch
branches = branches.str.replace(r'remotes/origin/|\s+|\*', '', regex=True)
# Then we remove the entry for the branch pointed at by the head (this is just METADATA)
# E.g. HEAD->master and master, we want to keep master
branches = branches[~branches.str.contains('HEAD->', regex=False)]
# A local branch and its remote counterpart (master and remotes/origin/master) are reduced to
# the same name by the cleaning above, so keep every name only once and drop empty lines.
branches = branches[branches != ''].drop_duplicates()

In [47]:
for branch in branches:
    repo_instance.git.checkout(branch)

In [45]:
fetches = repo_instance.remotes.origin.fetch()
fetches[2].remote_ref_path

In [51]:
# Number of commits yielded by `iter_commits(all=True)`
sum(1 for _commit in repo_instance.iter_commits(all=True))

In [49]:
repo_instance.commit

Next, we need to make sure that we move on to the next branch once we have reached the current branch's branching point or root node. We do not want to iterate over the entire graph (meaning all children, including those of the branch from which this branch originated) for every branch.

In [52]:
# This even solves the issue of the generalization
def find_branches_containing_commit(commit_sha, repo):
    """Return the branches of `repo` from whose tip `commit_sha` can be reached.

    For every entry of `repo.branches` the tip commit is resolved and
    `repo.is_ancestor(commit_sha, tip)` is asked whether `commit_sha` is an
    ancestor of that tip, i.e. whether the commit is part of the branch history.

    Args:
        commit_sha: Hash of the commit to look for.
        repo: Repository object providing `branches`, `commit()` and `is_ancestor()`.

    Returns:
        List of the branch objects containing the commit, in the order of `repo.branches`.
        Branches for which the ancestry check raises a GitCommandError are reported
        on stdout and left out.
    """
    containing_branches = []
    for head in repo.branches:
        tip_commit = repo.commit(head)
        try:
            is_reachable = repo.is_ancestor(commit_sha, tip_commit)
        except GitCommandError as error:
            print(f"Error while processing branch {head}: {error}")
            continue

        if is_reachable:
            containing_branches.append(head)

    return containing_branches

In [38]:
def update_accumulator_with(file_state: dict, file_to_remove: str, branch: str):
    """Record a file-commit gram for a file that is leaving the sliding window.

    Reads the module-level `sliding_window_size` and appends to the module-level
    `accumulator`. Nothing is appended when the file was seen fewer than
    `sliding_window_size` times in a row.

    Args:
        file_state: State dict of the file with the keys 'first_commit',
            'last_commit' and 'times_seen_consecutively'.
        file_to_remove: Path of the file the state belongs to.
        branch: Branch on which the state was tracked.
    """
    if file_state['times_seen_consecutively'] < sliding_window_size:
        return

    gram = {'file': file_to_remove, 'branch': branch}
    for key in ('first_commit', 'last_commit', 'times_seen_consecutively'):
        gram[key] = file_state[key]
    accumulator.append(gram)

In [54]:
# Only the first 25 commits are needed for experimentation. Let git stop after 25
# (`max_count` is forwarded to `git rev-list`) instead of loading the whole history
# into a list and slicing it afterwards.
snippet_commits = list(repo_instance.iter_commits(all=True, max_count=25))

In [55]:
# Minimum number of consecutive commits that must touch a file for it to be recorded
sliding_window_size = 2

# Maintains a state for each file currently in scope: state[branch][file] -> file state dict.
# A file stays in scope of a branch as long as every processed commit of that branch touches it;
# once a processed commit does not touch it, it is removed from the state.
state = {}

# Accumulates file-commit grams
# If we detect a series of n consecutive modifications of the same file we append a dict to this list.
# Each dict contains: The associated file (relative path from working directory), first commit for this
# file-commit gram, last commit for this file-commit gram and how many times the file was seen
# consecutively (length of the file-commit gram).
# 'first_commit' is the commit that was processed first, 'last_commit' the one processed last.
# NOTE(review): `iter_commits` presumably yields newest commits first, in which case 'first_commit'
# is the NEWEST commit of the gram - confirm.
# Note that the change_types that are valid are M, MM, A. All other change types are ignored
# (because the file wasn't modified).
accumulator = []

valid_change_types = ['A', 'M', 'MM']
for commit in snippet_commits:  # We will visit each commit exactly once
    branches_with_commit = find_branches_containing_commit(commit.hexsha, repo_instance)

    show_output = repo_instance.git.show(commit, name_status=True, format='oneline').split('\n')
    # Drop the first line (commit hash and message) and any empty lines;
    # what remains are "<change type>\t<file>" entries
    changes_in_commit = [change for change in show_output[1:] if change]

    valid_changes = []
    for change_in_commit in changes_in_commit:
        changes_to_unpack = change_in_commit.split('\t')
        if changes_to_unpack[0] in valid_change_types:
            valid_changes.append(changes_to_unpack)

    # Only touch the state if the commit contains at least one valid change. Otherwise the cleanup
    # phase below would treat every file in scope as stale.
    if not valid_changes:
        continue

    affected_files = set()

    for _change_type, file in valid_changes:
        affected_files.add(file)

        # Update the file state for every branch with this commit
        for branch in branches_with_commit:
            branch_state = state.setdefault(branch, {})

            if file in branch_state:
                # We are maintaining a state for this file on this branch
                file_state = branch_state[file]
                file_state['times_seen_consecutively'] += 1

                if file_state['times_seen_consecutively'] >= sliding_window_size:
                    file_state['last_commit'] = commit.hexsha
            else:
                # We are not currently maintaining a state for this file in this branch,
                # but have detected it. Need to set up the state dict
                branch_state[file] = {'first_commit': commit.hexsha, 'last_commit': commit.hexsha,
                                      'times_seen_consecutively': 1}

    # All files of the commit were added/updated for all affected branches.
    # Now remove stale file states (files that were not found in the commit), recording those
    # that were seen often enough. Only do this for branches affected by the commit.
    for branch in branches_with_commit:
        files_still_in_scope = {}
        for file, file_state in state[branch].items():
            if file in affected_files:
                files_still_in_scope[file] = file_state
            else:
                update_accumulator_with(file_state, file, branch)

        state[branch] = files_still_in_scope

# Files that are still in scope after the last commit never hit the stale-file cleanup above.
# Flush them as well, otherwise every file-commit gram that runs until the end of
# `snippet_commits` is lost.
for branch, files_in_scope in state.items():
    for file, file_state in files_in_scope.items():
        update_accumulator_with(file_state, file, branch)

accumulator

In [93]:
commit = demo_repo.commit('aeeab817a1bd7d146fc7596546e0c98a0ec94dbc')
commit.stats.files

In [107]:
sliding_window_state = []
result = []

commit_history = [commit for commit in demo_repo.iter_commits(all=True)]

def find_files_modified_within_consecutive_commits(commits_to_process, window_size, sliding_window_state):
    """Unfinished sketch of a recursive traversal of the commit history.

    Pops commits from the end of `commits_to_process` until the list is empty and
    recurses whenever a commit has two or more children.

    NOTE(review): this function is incomplete as written:
      * `commit_children` is read here but is not defined in any visible cell.
      * There is no `return` statement, so the value appended to the module-level
        `result` list by the recursive call is always `None`.
      * `window_size` and `sliding_window_state` are only passed on to the
        recursive call and never used otherwise.
      * Commits matching neither branch of the `if` are popped and ignored.

    Args:
        commits_to_process: List of commits; it is consumed (mutated) by this function.
        window_size: Currently unused apart from being forwarded to the recursion.
        sliding_window_state: Currently unused apart from being forwarded to the recursion.
    """
    while commits_to_process:
        # Takes the LAST element of the list; the list is shared with every recursive call
        commit = commits_to_process.pop()
        
        if commit_children[commit.hexsha]['children_count'] == 1 and len(commit.parents) == 1:
            # This is an ordinary commit
            # Stay in this recursion and continue iterating
            pass
        elif commit_children[commit.hexsha]['children_count'] >= 2:
            # This commit is the starting point of at least one other branch
            # The recursion works on the same list object, so it consumes the remaining commits
            result.append(find_files_modified_within_consecutive_commits(commits_to_process, window_size, sliding_window_state))
        
    

    

In [108]:
commit_history.pop().message

# Currently unused: kept only to document what I have already tried

In [81]:
remote_refs = repo_instance.remote().refs

In [100]:
remote = repo_instance.remote()

remote_ref = remote_refs[1]
remote.fetch(remote_ref.remote_head)
repo_instance.branches

In [72]:
for remote in repo_instance.remotes:
    remote.fetch()

In [65]:
for commit in repo_instance.iter_commits():
    print(f"{commit.hexsha} {commit.summary}")