In [1]:
from git import Repo, GitCommandError

import pandas as pd
import os

In [2]:
# Root of the project checkout. It can be overridden with the VCS_ACTIONS_AGENT_ROOT environment
# variable so the notebook is not tied to a single machine; the fallback is the original location.
project_root = os.environ.get(
    "VCS_ACTIONS_AGENT_ROOT",
    "C:\\Users\\tobias.lindenbauer\\PycharmProjects\\vcs-actions-agent\\",
)
# Directory holding the repository metadata CSV / parquet files.
path_to_data = os.path.join(project_root, 'data')
# Directory into which the analysed repositories are cloned.
path_to_repositories = os.path.join(project_root, 'repos')

# Qualitative analysis of metadata

## Python

In [3]:
# 1. Read in repositories from CSV
python_repositories_metadata = pd.read_csv(os.path.join(path_to_data, 'python_repos.csv'))

In [4]:
len(python_repositories_metadata)

Let's take a look at the distribution of relevant numeric columns to get an overview of the dataset.

In [5]:
# Preview only the first rows; rendering the whole metadata table bloats the notebook output.
python_repositories_metadata.head()

In [13]:
python_repositories_metadata[['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']].describe()

All repositories include more than one branch, and most have been forked. Curiously, we note that `min(codeLines) = 1`, i.e. at least one repository reports only a single line of code. Judging by the mean and the 25th percentile (first quartile), however, the repositories seem to be in good shape overall.

In [6]:
python_repositories_metadata[python_repositories_metadata.codeLines < 2]

In [5]:
pd.to_datetime(python_repositories_metadata.updatedAt).describe()

In [14]:
pd.to_datetime(python_repositories_metadata.createdAt).describe()

## Java

In [16]:
# 1. Read in repositories from CSV
java_repositories_metadata = pd.read_csv(os.path.join(path_to_data, 'java_repos.csv'))

In [17]:
len(java_repositories_metadata)

In [18]:
java_repositories_metadata[['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']].describe()

Most have been forked. Again, we note a very small minimum: `min(codeLines) = 2`. Judging by the mean and the 25th percentile (first quartile), however, the repositories seem to be in good shape overall.

In [19]:
pd.to_datetime(java_repositories_metadata.updatedAt).describe()

In [20]:
pd.to_datetime(java_repositories_metadata.createdAt).describe()

## Kotlin

In [21]:
# 1. Read in repositories from CSV
kotlin_repositories_metadata = pd.read_csv(os.path.join(path_to_data, 'kotlin_repos.csv'))

In [22]:
len(kotlin_repositories_metadata)

In [23]:
kotlin_repositories_metadata[['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']].describe()

Most have been forked. Again, we note that `min(codeLines) = 1`. Judging by the mean and the 25th percentile (first quartile), however, the repositories seem to be in good shape overall. For Kotlin, the numbers of contributors and branches appear to be lower in general.

In [24]:
pd.to_datetime(kotlin_repositories_metadata.updatedAt).describe()

In [25]:
pd.to_datetime(kotlin_repositories_metadata.createdAt).describe()

# Development of git history traversal and quantitative analysis of repositories

## Determine ratio of files to branches

In [14]:
import re 

In [21]:
# Fixed schema so the frame has the expected columns even if no repository could be processed.
STATISTICS_COLUMNS = ['branches', 'python_files', 'total_files']


def _count_repository_files(root):
    """Count Python files and regular files below ``root``.

    Hidden directories (name starting with ".", e.g. ``.git``) are pruned from the walk, so
    neither they nor anything below them is visited. Only directories *below* ``root`` are
    checked, so a hidden component in the path leading to ``root`` does not skip everything.

    A file counts as a Python file when its name ends in ``.py``. A file counts towards the
    total when it is not hidden and contains a "." after its first character (i.e. it has
    some kind of file ending).

    Returns:
        tuple: ``(num_python_files, num_total_files)``.
    """
    num_python_files = 0
    num_total_files = 0
    for _directory, subdirs, files in os.walk(root):
        # Assigning to the slice edits the list os.walk iterates over, which prunes the walk.
        subdirs[:] = [subdir for subdir in subdirs if not subdir.startswith('.')]
        for file_name in files:
            if file_name.endswith('.py'):
                num_python_files += 1
            if not file_name.startswith('.') and '.' in file_name[1:]:
                num_total_files += 1
    return num_python_files, num_total_files


# Rows are collected first and turned into a DataFrame once, instead of growing it cell by cell.
statistics_index = []
statistics_records = []

for i, repository_metadata in python_repositories_metadata.iloc[:15].iterrows():
    repository_name = repository_metadata["name"]
    # "owner/repo" becomes the directory name "owner__repo".
    repository_path = os.path.join(path_to_repositories, "__".join(repository_name.split("/")))
    try:
        repo_instance = Repo.clone_from(f'https://github.com/{repository_name}.git', repository_path)
    except GitCommandError as e:
        if 'already exists' not in e.stderr:
            # Best effort: report the failure and move on instead of skipping silently.
            print(f'Skipping {repository_name}: cloning failed: {e.stderr.strip()}')
            continue
        # If already exists, create Repo instance of it
        print('Repository already exists, using local directory instead of cloning.')
        repo_instance = Repo(repository_path)

    num_python_files, num_total_files = _count_repository_files(repository_path)

    statistics_index.append(i)
    statistics_records.append({
        # NOTE(review): Repo.refs presumably also contains tags and remote-tracking references,
        # so this may overestimate the number of branches - TODO confirm the intended measure.
        'branches': len(repo_instance.refs),
        'python_files': num_python_files,
        'total_files': num_total_files,
    })

# Same row labels as before (index of the metadata row); counts are now stored as integers.
repositories_statistics = pd.DataFrame(
    statistics_records, index=statistics_index, columns=STATISTICS_COLUMNS
)

In [25]:
# Average files-per-branch ratio over the repositories that were actually processed.
# .mean() divides by the real number of rows rather than a hard-coded 10, which did not
# match the 15-repository sample used to build repositories_statistics.
(repositories_statistics.total_files / repositories_statistics.branches).mean()

# Exploring results

In [81]:
results = pd.read_parquet(os.path.join(path_to_data, 'python_dummy_subset.parquet'), engine='pyarrow')

In [82]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 1 to 1
Data columns (total 39 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   id                                         1 non-null      int64  
 1   name                                       1 non-null      object 
 2   isFork                                     1 non-null      bool   
 3   commits                                    1 non-null      int64  
 4   branches                                   1 non-null      int64  
 5   releases                                   1 non-null      int64  
 6   forks                                      1 non-null      int64  
 7   mainLanguage                               1 non-null      object 
 8   defaultBranch                              1 non-null      object 
 9   license                                    1 non-null      object 
 10  homepage                           

In [83]:
results.loc[1]

id                                                                                     3275944
name                                                                returntocorp/semgrep-rules
isFork                                                                                   False
commits                                                                                   2234
branches                                                                                   171
releases                                                                                     0
forks                                                                                      217
mainLanguage                                                                            Python
defaultBranch                                                                          develop
license                                                                                  Other
homepage                                          

In [84]:
results['scraped_data']

1    {'cherry_pick_scenarios': [{'cherry_commit': '...
Name: scraped_data, dtype: object

In [85]:
# For every repository row, keep only the 'cherry_pick_scenarios' entry of its scraped payload.
cherry_pick_data = results['scraped_data'].apply(
    lambda scraped: scraped['cherry_pick_scenarios']
)

In [86]:
cherry_pick_data

1    [{'cherry_commit': 'f7dbb2fc3be3060b29345f7801...
Name: scraped_data, dtype: object

In [87]:
# TODO: the cherry-pick scenarios at the end of the list still need to be checked.
# Goal here: find scenarios in which exactly the same commit hexsha shows up more than once
# (duplicates). One row per scenario, holding the two commit hexshas it pairs up.
scenarios = cherry_pick_data[1]
# Built in one go from two column lists; the columns exist even when there are no scenarios,
# so the duplicate check on 'cherry_pick_commits' keeps working for an empty list.
unpacked_commits = pd.DataFrame(
    {
        'cherry_pick_commits': [scenario['cherry_pick_commit'] for scenario in scenarios],
        'cherry_commits': [scenario['cherry_commit'] for scenario in scenarios],
    }
)

In [88]:
unpacked_commits[unpacked_commits.duplicated(subset=['cherry_pick_commits'], keep=False)]

Unnamed: 0,cherry_pick_commits,cherry_commits
1,1740abbce2ac0fee408fbdf22ab96405f3468485,e8aaa7dcceab7db818720445e497f98f97c5d6e3
3,1740abbce2ac0fee408fbdf22ab96405f3468485,ef75f21d13699affb9d53f96de06687847027e4d
4,eec22630b7b3d58956a2adbf8eb6a7a2a49326e6,041b8028c66941ad586e3c2ade99164a6d771fb6
6,eec22630b7b3d58956a2adbf8eb6a7a2a49326e6,248d33b15e19660ed744d68295365f203ffe4510
7,a8227756e7cfa6355c0a390f69be8c379d7a6d6c,15acc8349a2c124b07d196c9c26781da9029a8f4
9,a8227756e7cfa6355c0a390f69be8c379d7a6d6c,002421281b7fc6f915a33d21bea85498f42a17b9
10,7b99d889476790950c477c30456aa86e0f1f348d,a7ba3269507d4261aff7d23a7241c337d511b7f0
12,7b99d889476790950c477c30456aa86e0f1f348d,f1136b08663e62552e3704845eabe741d762374e
13,f1446d3b22ae8d21853d1e921d9fa814647b94c4,2fbace0aad729045bbff494bd231cb9bb9da5ef7
15,f1446d3b22ae8d21853d1e921d9fa814647b94c4,9928b4e7f6fe8c6943ee19eff6cf583a09163007


In [37]:
len(cherry_pick_data[1])

447

In [41]:
# Open the repository described by the second metadata row, cloning it first if necessary.
repository_metadata = python_repositories_metadata.iloc[1]
# "owner/repo" becomes the directory name "owner__repo".
repository_path = os.path.join(path_to_repositories, "__".join(repository_metadata["name"].split("/")))
try:
    repo_instance = Repo.clone_from(f'https://github.com/{repository_metadata["name"]}.git',
                                    repository_path)
except GitCommandError as e:
    # Only the "target directory already exists" failure is expected here. Everything else is
    # re-raised so the following cells do not fail later on a missing repo_instance.
    if 'already exists' not in e.stderr:
        raise
    # If already exists, create Repo instance of it
    print('Repository already exists, using local directory instead of cloning.')
    repo_instance = Repo(repository_path)

Repository already exists, using local directory instead of cloning.


In [99]:
from datetime import datetime
# Commit time of the cherry-pick commit that shows up twice in the duplicate table above.
# committed_date is handed to fromtimestamp() as a POSIX timestamp; without a tz argument the
# result is a naive datetime in the local timezone of the machine running the notebook.
datetime.fromtimestamp(repo_instance.commit('1740abbce2ac0fee408fbdf22ab96405f3468485').committed_date)

datetime.datetime(2020, 9, 23, 0, 26, 24)

In [100]:
datetime.fromtimestamp(repo_instance.commit('e8aaa7dcceab7db818720445e497f98f97c5d6e3').committed_date)

datetime.datetime(2020, 9, 15, 17, 20, 48)

In [101]:
datetime.fromtimestamp(repo_instance.commit('ef75f21d13699affb9d53f96de06687847027e4d').committed_date)


datetime.datetime(2020, 9, 21, 18, 37, 59)

In [93]:
repo_instance.git.show('1740abbce2ac0fee408fbdf22ab96405f3468485').split('\n')

['commit 1740abbce2ac0fee408fbdf22ab96405f3468485',
 'Author: Yoann Padioleau <pad@returntocorp.com>',
 'Date:   Tue Sep 15 17:20:48 2020 +0200',
 '',
 '    * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix (#861)',
 '    ',
 '    * * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix',
 '    ',
 "    Add extra assign to not make 'x' a constant.",
 '    Indeed, Iago recently fixed a bug in the const-analysis which',
 '    now kick-in in this file and leads to more reported errors.',
 '    ',
 '    Related to https://github.com/returntocorp/semgrep/pull/1676',
 '    ',
 '    Test plan:',
 '    make test with latest semgrep',
 '    ',
 '    * Avoid constant propagation by using object() instead of str',
 '    ',
 '    Co-authored-by: Matt Schwager <matt@returntocorp.com>',
 '',
 'diff --git a/python/lang/correctness/common-mistakes/is-comparison-string.py b/python/lang/correctness/common-mistakes/is-comparison-string.py',
 'index 9fb45823..dbb76cf4 

In [92]:
repo_instance.git.show('e8aaa7dcceab7db818720445e497f98f97c5d6e3').split('\n')

['commit e8aaa7dcceab7db818720445e497f98f97c5d6e3',
 'Author: Yoann Padioleau <pad@returntocorp.com>',
 'Date:   Tue Sep 15 17:20:48 2020 +0200',
 '',
 '    * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix (#861)',
 '    ',
 '    * * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix',
 '    ',
 "    Add extra assign to not make 'x' a constant.",
 '    Indeed, Iago recently fixed a bug in the const-analysis which',
 '    now kick-in in this file and leads to more reported errors.',
 '    ',
 '    Related to https://github.com/returntocorp/semgrep/pull/1676',
 '    ',
 '    Test plan:',
 '    make test with latest semgrep',
 '    ',
 '    * Avoid constant propagation by using object() instead of str',
 '    ',
 '    Co-authored-by: Matt Schwager <matt@returntocorp.com>',
 '',
 'diff --git a/python/lang/correctness/common-mistakes/is-comparison-string.py b/python/lang/correctness/common-mistakes/is-comparison-string.py',
 'index 9fb45823..dbb76cf4 

In [94]:
repo_instance.git.show('ef75f21d13699affb9d53f96de06687847027e4d').split('\n')

['commit ef75f21d13699affb9d53f96de06687847027e4d',
 'Author: Yoann Padioleau <pad@returntocorp.com>',
 'Date:   Tue Sep 15 17:20:48 2020 +0200',
 '',
 '    * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix (#861)',
 '    ',
 '    * * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix',
 '    ',
 "    Add extra assign to not make 'x' a constant.",
 '    Indeed, Iago recently fixed a bug in the const-analysis which',
 '    now kick-in in this file and leads to more reported errors.',
 '    ',
 '    Related to https://github.com/returntocorp/semgrep/pull/1676',
 '    ',
 '    Test plan:',
 '    make test with latest semgrep',
 '    ',
 '    * Avoid constant propagation by using object() instead of str',
 '    ',
 '    Co-authored-by: Matt Schwager <matt@returntocorp.com>',
 '',
 'diff --git a/python/lang/correctness/common-mistakes/is-comparison-string.py b/python/lang/correctness/common-mistakes/is-comparison-string.py',
 'index 9fb45823..dbb76cf4 