In [1]:
from git import Repo, GitCommandError

import pandas as pd
import os

In [2]:
# Resolve the project root exactly once and derive all paths from it.
# The '../..' hop assumes the notebook is started two levels below the
# project root — TODO confirm for your checkout.
# Guarding on PROJECT_ROOT keeps this cell idempotent: re-running it (or running
# it after another cell has changed the working directory) no longer climbs two
# more levels or derives paths from an unrelated directory.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../..')
    PROJECT_ROOT = os.getcwd()

path_to_data = os.path.join(PROJECT_ROOT, 'data')
path_to_repositories = os.path.join(PROJECT_ROOT, 'repos')

In [3]:
# Licence names, spelled as in the metadata's `license` column, that we treat as
# permissive. Repositories under any other licence are filtered out below.
PERMISSIVE_LICENSES = [
    'MIT License',
    'Apache License 2.0',
    'BSD 3-Clause New or Revised License',
    'BSD 2-Clause Simplified License',
]

# Qualitative analysis of metadata

## Python

In [4]:
# 1. Load the Python repository metadata from its CSV file
python_repositories_csv = os.path.join(path_to_data, 'python_repos.csv')
python_repositories_metadata = pd.read_csv(python_repositories_csv)

In [5]:
# Keep only the repositories whose licence is on the permissive allow-list
has_permissive_license = python_repositories_metadata['license'].isin(PERMISSIVE_LICENSES)
python_repositories_metadata = python_repositories_metadata.loc[has_permissive_license]
python_repositories_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4968 entries, 0 to 4967
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4968 non-null   int64  
 1   name               4968 non-null   object 
 2   isFork             4968 non-null   bool   
 3   commits            4968 non-null   int64  
 4   branches           4968 non-null   int64  
 5   releases           4968 non-null   int64  
 6   forks              4968 non-null   int64  
 7   mainLanguage       4968 non-null   object 
 8   defaultBranch      4968 non-null   object 
 9   license            4968 non-null   object 
 10  homepage           2930 non-null   object 
 11  watchers           4968 non-null   int64  
 12  stargazers         4968 non-null   int64  
 13  contributors       4968 non-null   int64  
 14  size               4968 non-null   int64  
 15  createdAt          4968 non-null   object 
 16  pushedAt           4968 

In [6]:
# Stricter quality filter: at least 10 contributors AND at least 100 stargazers
has_enough_contributors = python_repositories_metadata['contributors'] >= 10
has_enough_stargazers = python_repositories_metadata['stargazers'] >= 100
stronger_filters_python_repositories_metadata = python_repositories_metadata.loc[
    has_enough_contributors & has_enough_stargazers
]
stronger_filters_python_repositories_metadata.info()

# Persist the filtered metadata next to the raw CSVs (row index is not written)
stronger_filters_python_repositories_metadata.to_csv(
    os.path.join(path_to_data, 'stronger_filters_python_repositories_metadata.csv'),
    index=False,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4968 entries, 0 to 4967
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4968 non-null   int64  
 1   name               4968 non-null   object 
 2   isFork             4968 non-null   bool   
 3   commits            4968 non-null   int64  
 4   branches           4968 non-null   int64  
 5   releases           4968 non-null   int64  
 6   forks              4968 non-null   int64  
 7   mainLanguage       4968 non-null   object 
 8   defaultBranch      4968 non-null   object 
 9   license            4968 non-null   object 
 10  homepage           2930 non-null   object 
 11  watchers           4968 non-null   int64  
 12  stargazers         4968 non-null   int64  
 13  contributors       4968 non-null   int64  
 14  size               4968 non-null   int64  
 15  createdAt          4968 non-null   object 
 16  pushedAt           4968 

We found that the two docarray repositories cause Out-Of-Memory (OOM) errors when scraping the dataset with YTsaurus, although we were unable to reproduce this error locally. Since they are Python repositories, which make up the largest chunk of our dataset with around 5k repositories, losing two of them has a negligible effect. We therefore decided to remove them.

In [11]:
# Drop every repository whose name contains 'docarray'
mentions_docarray = stronger_filters_python_repositories_metadata['name'].str.contains('docarray')
stronger_filters_python_repositories_metadata = stronger_filters_python_repositories_metadata.loc[
    ~mentions_docarray
]
stronger_filters_python_repositories_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4966 entries, 0 to 4967
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4966 non-null   int64  
 1   name               4966 non-null   object 
 2   isFork             4966 non-null   bool   
 3   commits            4966 non-null   int64  
 4   branches           4966 non-null   int64  
 5   releases           4966 non-null   int64  
 6   forks              4966 non-null   int64  
 7   mainLanguage       4966 non-null   object 
 8   defaultBranch      4966 non-null   object 
 9   license            4966 non-null   object 
 10  homepage           2928 non-null   object 
 11  watchers           4966 non-null   int64  
 12  stargazers         4966 non-null   int64  
 13  contributors       4966 non-null   int64  
 14  size               4966 non-null   int64  
 15  createdAt          4966 non-null   object 
 16  pushedAt           4966 non-n

In [12]:
# NOTE(review): this writes the filtered frame back to 'python_repos.csv', the very
# file the Python section reads as its input, so the unfiltered metadata is replaced
# on disk and a later "Run All" starts from already-filtered data. Consider a
# separate output file if the raw metadata must stay reproducible.
python_repos_csv = os.path.join(path_to_data, 'python_repos.csv')
stronger_filters_python_repositories_metadata.to_csv(python_repos_csv, index=False)

In [11]:
# Number of repositories per licence name, most frequent first
python_repositories_metadata['license'].value_counts()

license
MIT License                                                   3489
Apache License 2.0                                            2377
Other                                                         1549
GNU General Public License v3.0                                911
BSD 3-Clause New or Revised License                            747
GNU Affero General Public License v3.0                         330
GNU General Public License v2.0                                209
BSD 2-Clause Simplified License                                185
GNU Lesser General Public License v3.0                         124
Mozilla Public License 2.0                                      69
GNU Lesser General Public License v2.1                          50
Creative Commons Zero v1.0 Universal                            36
ISC License                                                     36
The Unlicense                                                   23
Creative Commons Attribution Share Alike 4.0 Internati

In [9]:
# Number of rows (repositories) currently held in the Python metadata frame
len(python_repositories_metadata)

10246

Let's take a look at the distribution of relevant numeric columns to get an overview of the dataset.

In [10]:
# Column names, non-null counts and dtypes of the Python metadata frame
python_repositories_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10246 entries, 0 to 10245
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10246 non-null  int64  
 1   name               10246 non-null  object 
 2   isFork             10246 non-null  bool   
 3   commits            10246 non-null  int64  
 4   branches           10246 non-null  int64  
 5   releases           10246 non-null  int64  
 6   forks              10246 non-null  int64  
 7   mainLanguage       10246 non-null  object 
 8   defaultBranch      10246 non-null  object 
 9   license            10246 non-null  object 
 10  homepage           5687 non-null   object 
 11  watchers           10246 non-null  int64  
 12  stargazers         10246 non-null  int64  
 13  contributors       10246 non-null  int64  
 14  size               10246 non-null  int64  
 15  createdAt          10246 non-null  object 
 16  pushedAt           102

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of interest
numeric_columns = ['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']
python_repositories_metadata[numeric_columns].describe()

All repositories include more than one branch. Most include some forks. Curiously, we note that `min(codeLines) = 1`. Looking at the mean and the 25th percentile though, we see that overall the repos seem to be in good shape.

In [6]:
# Inspect the repositories that report fewer than two lines of code
has_almost_no_code = python_repositories_metadata['codeLines'] < 2
python_repositories_metadata.loc[has_almost_no_code]

In [5]:
# Parse `updatedAt` as timestamps and summarise their distribution
pd.to_datetime(python_repositories_metadata.updatedAt).describe()

In [14]:
# Parse `createdAt` as timestamps and summarise their distribution
pd.to_datetime(python_repositories_metadata.createdAt).describe()

## Java

In [8]:
# 1. Load the Java repository metadata from its CSV file
java_repositories_csv = os.path.join(path_to_data, 'java_repos.csv')
java_repositories_metadata = pd.read_csv(java_repositories_csv)

In [9]:
# Keep only the repositories whose licence is on the permissive allow-list
has_permissive_license = java_repositories_metadata['license'].isin(PERMISSIVE_LICENSES)
java_repositories_metadata = java_repositories_metadata.loc[has_permissive_license]
java_repositories_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4542 entries, 0 to 6839
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4542 non-null   int64  
 1   name               4542 non-null   object 
 2   isFork             4542 non-null   bool   
 3   commits            4542 non-null   int64  
 4   branches           4542 non-null   int64  
 5   releases           4542 non-null   int64  
 6   forks              4542 non-null   int64  
 7   mainLanguage       4542 non-null   object 
 8   defaultBranch      4542 non-null   object 
 9   license            4542 non-null   object 
 10  homepage           2123 non-null   object 
 11  watchers           4542 non-null   int64  
 12  stargazers         4542 non-null   int64  
 13  contributors       4542 non-null   int64  
 14  size               4542 non-null   int64  
 15  createdAt          4542 non-null   object 
 16  pushedAt           4542 non-n

In [10]:
# Stricter quality filter: at least 10 contributors AND at least 100 stargazers
has_enough_contributors = java_repositories_metadata['contributors'] >= 10
has_enough_stargazers = java_repositories_metadata['stargazers'] >= 100
stronger_filters_java_repositories_metadata = java_repositories_metadata.loc[
    has_enough_contributors & has_enough_stargazers
]
stronger_filters_java_repositories_metadata.info()

# Persist the filtered metadata next to the raw CSVs (row index is not written)
stronger_filters_java_repositories_metadata.to_csv(
    os.path.join(path_to_data, 'stronger_filters_java_repositories_metadata.csv'),
    index=False,
)

<class 'pandas.core.frame.DataFrame'>
Index: 2470 entries, 0 to 6839
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2470 non-null   int64  
 1   name               2470 non-null   object 
 2   isFork             2470 non-null   bool   
 3   commits            2470 non-null   int64  
 4   branches           2470 non-null   int64  
 5   releases           2470 non-null   int64  
 6   forks              2470 non-null   int64  
 7   mainLanguage       2470 non-null   object 
 8   defaultBranch      2470 non-null   object 
 9   license            2470 non-null   object 
 10  homepage           1340 non-null   object 
 11  watchers           2470 non-null   int64  
 12  stargazers         2470 non-null   int64  
 13  contributors       2470 non-null   int64  
 14  size               2470 non-null   int64  
 15  createdAt          2470 non-null   object 
 16  pushedAt           2470 non-n

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of interest
numeric_columns = ['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']
java_repositories_metadata[numeric_columns].describe()

Most have been forked. Again, we note that `min(codeLines) = 2`. Looking at the mean and the 25th percentile though, we see that overall the repos seem to be in good shape.

In [19]:
# Parse `updatedAt` as timestamps and summarise their distribution
pd.to_datetime(java_repositories_metadata.updatedAt).describe()

In [20]:
# Parse `createdAt` as timestamps and summarise their distribution
pd.to_datetime(java_repositories_metadata.createdAt).describe()

## Kotlin

In [11]:
# 1. Load the Kotlin repository metadata from its CSV file
kotlin_repositories_csv = os.path.join(path_to_data, 'kotlin_repos.csv')
kotlin_repositories_metadata = pd.read_csv(kotlin_repositories_csv)

In [12]:
# Keep only the repositories whose licence is on the permissive allow-list
has_permissive_license = kotlin_repositories_metadata['license'].isin(PERMISSIVE_LICENSES)
kotlin_repositories_metadata = kotlin_repositories_metadata.loc[has_permissive_license]
kotlin_repositories_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1225 entries, 2 to 1703
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1225 non-null   int64  
 1   name               1225 non-null   object 
 2   isFork             1225 non-null   bool   
 3   commits            1225 non-null   int64  
 4   branches           1225 non-null   int64  
 5   releases           1225 non-null   int64  
 6   forks              1225 non-null   int64  
 7   mainLanguage       1225 non-null   object 
 8   defaultBranch      1225 non-null   object 
 9   license            1225 non-null   object 
 10  homepage           528 non-null    object 
 11  watchers           1225 non-null   int64  
 12  stargazers         1225 non-null   int64  
 13  contributors       1225 non-null   int64  
 14  size               1225 non-null   int64  
 15  createdAt          1225 non-null   object 
 16  pushedAt           1225 non-n

In [13]:
# Stricter quality filter: at least 10 contributors AND at least 100 stargazers
has_enough_contributors = kotlin_repositories_metadata['contributors'] >= 10
has_enough_stargazers = kotlin_repositories_metadata['stargazers'] >= 100
stronger_filters_kotlin_repositories_metadata = kotlin_repositories_metadata.loc[
    has_enough_contributors & has_enough_stargazers
]
stronger_filters_kotlin_repositories_metadata.info()

# Persist the filtered metadata next to the raw CSVs (row index is not written)
stronger_filters_kotlin_repositories_metadata.to_csv(
    os.path.join(path_to_data, 'stronger_filters_kotlin_repositories_metadata.csv'),
    index=False,
)

<class 'pandas.core.frame.DataFrame'>
Index: 496 entries, 2 to 1694
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 496 non-null    int64  
 1   name               496 non-null    object 
 2   isFork             496 non-null    bool   
 3   commits            496 non-null    int64  
 4   branches           496 non-null    int64  
 5   releases           496 non-null    int64  
 6   forks              496 non-null    int64  
 7   mainLanguage       496 non-null    object 
 8   defaultBranch      496 non-null    object 
 9   license            496 non-null    object 
 10  homepage           261 non-null    object 
 11  watchers           496 non-null    int64  
 12  stargazers         496 non-null    int64  
 13  contributors       496 non-null    int64  
 14  size               496 non-null    int64  
 15  createdAt          496 non-null    object 
 16  pushedAt           496 non-nul

In [27]:
# Number of rows (repositories) currently held in the Kotlin metadata frame
len(kotlin_repositories_metadata)

1704

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of interest
numeric_columns = ['branches', 'releases', 'forks', 'watchers', 'contributors', 'codeLines']
kotlin_repositories_metadata[numeric_columns].describe()

Most have been forked. Again, we note that `min(codeLines) = 1`. Looking at the mean and the 25th percentile though, we see that overall the repos seem to be in good shape. For Kotlin, we note that the number of contributors and branches seems lower in general.

In [24]:
# Parse `updatedAt` as timestamps and summarise their distribution
pd.to_datetime(kotlin_repositories_metadata.updatedAt).describe()

In [25]:
# Parse `createdAt` as timestamps and summarise their distribution
pd.to_datetime(kotlin_repositories_metadata.createdAt).describe()

# Development of git history traversal and quantitative analysis of repositories

## Determine ratio of files to branches

In [14]:
import re 

In [21]:
def _count_repository_files(root):
    """Count files below ``root``, ignoring hidden directories.

    Args:
        root: Path of the checked-out repository to walk.

    Returns:
        Tuple ``(num_python_files, num_total_files)`` where
        ``num_python_files`` counts files whose name ends in ``.py`` and
        ``num_total_files`` counts non-hidden files that have a file ending.
    """
    num_python_files = 0
    num_total_files = 0
    for _directory, subdirs, files in os.walk(root):
        # Prune in place so os.walk never descends into hidden folders (e.g. .git).
        # Only folders inside the repository are checked, so a dot-folder somewhere
        # in the path leading to the repos directory cannot hide the whole checkout.
        subdirs[:] = [d for d in subdirs if not d.startswith('.')]

        # Suffix match: a substring test would also count '.pyc', '.pyx', 'x.py.bak', ...
        num_python_files += sum(1 for f in files if f.endswith('.py'))
        # Skip hidden files and files without a file ending
        num_total_files += sum(1 for f in files if not f.startswith('.') and '.' in f[1:])
    return num_python_files, num_total_files


# Collect one record per repository and build the frame once at the end instead
# of enlarging it cell by cell with .loc.
statistics_index = []
statistics_records = []

# Only the first 15 repositories are sampled.
for i, repository_metadata in python_repositories_metadata.iloc[:15].iterrows():
    repo_instance = None
    # 'owner/project' is stored locally as 'owner__project'
    repository_path = os.path.join(path_to_repositories, "__".join(repository_metadata["name"].split("/")))
    try:
        repo_instance = Repo.clone_from(f'https://github.com/{repository_metadata["name"]}.git',
                                        f'{repository_path}')
    except GitCommandError as e:
        # If already exists, create Repo instance of it
        if 'already exists' in (e.stderr or ''):
            print('Repository already exists, using local directory instead of cloning.')
            repo_instance = Repo(repository_path)
        else:
            # Still best-effort (the repository is skipped), but no longer silent.
            print(f'Skipping {repository_metadata["name"]}: cloning failed: {e}')

    if repo_instance is None:
        continue

    # No os.chdir() here: repository_path is handed to os.walk directly, and changing
    # the working directory would leak into every later cell.
    num_python_files, num_total_files = _count_repository_files(repository_path)

    statistics_index.append(i)
    statistics_records.append({
        # NOTE(review): `refs` presumably also contains tags and remote refs, not
        # only branches — confirm against the GitPython documentation.
        'branches': len(repo_instance.refs),
        'python_files': num_python_files,
        'total_files': num_total_files,
    })

repositories_statistics = pd.DataFrame(
    statistics_records,
    index=statistics_index,
    columns=['branches', 'python_files', 'total_files'],
)

In [25]:
# Average number of files per counted reference across the analysed repositories.
# .mean() divides by the number of repositories actually present in the frame; the
# previous hard-coded divisor of 10 did not match the 15 sampled repositories and
# ignored repositories that were skipped.
(repositories_statistics.total_files / repositories_statistics.branches).mean()

# Exploring results

In [1]:
# Load the scraped Kotlin results (Parquet, read through the pyarrow engine)
kotlin_results_path = os.path.join(path_to_data, 'kotlin.parquet')
results = pd.read_parquet(kotlin_results_path, engine='pyarrow')

NameError: name 'pd' is not defined

In [None]:
# Column names, non-null counts and dtypes of the loaded results
results.info()

## Duplicate cherry pick scenarios

In [85]:
# For every result row, pull the list of cherry-pick scenarios out of its scraped data
scraped_data = results['scraped_data']
cherry_pick_data = scraped_data.apply(lambda scraped: scraped['cherry_pick_scenarios'])

In [86]:
# Display the extracted cherry-pick scenarios per result row
cherry_pick_data

1    [{'cherry_commit': 'f7dbb2fc3be3060b29345f7801...
Name: scraped_data, dtype: object

In [87]:
# Need to check cherry pick scenarios at the end of the list.
# Check whether a scenario with exactly the same commit hexsha shows up another time (duplicated).
scenarios = cherry_pick_data[1]  # scenarios of the result row labelled 1

# Build the frame in one go instead of enlarging it cell by cell with .loc. Both
# columns also exist when there are no scenarios, so later column lookups do not fail.
unpacked_commits = pd.DataFrame(
    [(scenario['cherry_pick_commit'], scenario['cherry_commit']) for scenario in scenarios],
    columns=['cherry_pick_commits', 'cherry_commits'],
)

In [88]:
# Show every scenario whose cherry-pick commit occurs more than once (all occurrences kept)
is_repeated_pick = unpacked_commits.duplicated(subset=['cherry_pick_commits'], keep=False)
unpacked_commits.loc[is_repeated_pick]

Unnamed: 0,cherry_pick_commits,cherry_commits
1,1740abbce2ac0fee408fbdf22ab96405f3468485,e8aaa7dcceab7db818720445e497f98f97c5d6e3
3,1740abbce2ac0fee408fbdf22ab96405f3468485,ef75f21d13699affb9d53f96de06687847027e4d
4,eec22630b7b3d58956a2adbf8eb6a7a2a49326e6,041b8028c66941ad586e3c2ade99164a6d771fb6
6,eec22630b7b3d58956a2adbf8eb6a7a2a49326e6,248d33b15e19660ed744d68295365f203ffe4510
7,a8227756e7cfa6355c0a390f69be8c379d7a6d6c,15acc8349a2c124b07d196c9c26781da9029a8f4
9,a8227756e7cfa6355c0a390f69be8c379d7a6d6c,002421281b7fc6f915a33d21bea85498f42a17b9
10,7b99d889476790950c477c30456aa86e0f1f348d,a7ba3269507d4261aff7d23a7241c337d511b7f0
12,7b99d889476790950c477c30456aa86e0f1f348d,f1136b08663e62552e3704845eabe741d762374e
13,f1446d3b22ae8d21853d1e921d9fa814647b94c4,2fbace0aad729045bbff494bd231cb9bb9da5ef7
15,f1446d3b22ae8d21853d1e921d9fa814647b94c4,9928b4e7f6fe8c6943ee19eff6cf583a09163007


In [37]:
# Number of cherry-pick scenarios recorded for the result row labelled 1
len(cherry_pick_data[1])

447

In [41]:
# Open (or clone) the second repository of the Python metadata frame for manual inspection.
# NOTE(review): `results` above is loaded from 'kotlin.parquet' while this repository is
# taken from the Python metadata — confirm that both refer to the same repository.
repository_metadata = python_repositories_metadata.iloc[1]
repo_instance = None
# 'owner/project' is stored locally as 'owner__project'
repository_path = os.path.join(path_to_repositories, "__".join(repository_metadata["name"].split("/")))
try:
    repo_instance = Repo.clone_from(f'https://github.com/{repository_metadata["name"]}.git',
                                    f'{repository_path}')
except GitCommandError as e:
    # If already exists, create Repo instance of it
    if 'already exists' in (e.stderr or ''):
        print('Repository already exists, using local directory instead of cloning.')
        repo_instance = Repo(repository_path)
    else:
        # Any other git failure would leave repo_instance as None and make the following
        # cells fail with an unrelated AttributeError, so surface the real error here.
        raise

Repository already exists, using local directory instead of cloning.


In [99]:
from datetime import datetime
# Commit date of the duplicated cherry-pick commit (rows 1 and 3 of the duplicates table).
# fromtimestamp() without a tz argument yields a naive datetime in the machine's local time zone.
datetime.fromtimestamp(repo_instance.commit('1740abbce2ac0fee408fbdf22ab96405f3468485').committed_date)

datetime.datetime(2020, 9, 23, 0, 26, 24)

In [100]:
# Commit date of the first cherry commit paired with that cherry-pick commit (row 1 of the table)
datetime.fromtimestamp(repo_instance.commit('e8aaa7dcceab7db818720445e497f98f97c5d6e3').committed_date)

datetime.datetime(2020, 9, 15, 17, 20, 48)

In [101]:
# Commit date of the second cherry commit paired with that cherry-pick commit (row 3 of the table)
datetime.fromtimestamp(repo_instance.commit('ef75f21d13699affb9d53f96de06687847027e4d').committed_date)


datetime.datetime(2020, 9, 21, 18, 37, 59)

In [93]:
# `git show` output (header, message, diff) of the cherry-pick commit, split into lines for display
repo_instance.git.show('1740abbce2ac0fee408fbdf22ab96405f3468485').split('\n')

['commit 1740abbce2ac0fee408fbdf22ab96405f3468485',
 'Author: Yoann Padioleau <pad@returntocorp.com>',
 'Date:   Tue Sep 15 17:20:48 2020 +0200',
 '',
 '    * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix (#861)',
 '    ',
 '    * * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix',
 '    ',
 "    Add extra assign to not make 'x' a constant.",
 '    Indeed, Iago recently fixed a bug in the const-analysis which',
 '    now kick-in in this file and leads to more reported errors.',
 '    ',
 '    Related to https://github.com/returntocorp/semgrep/pull/1676',
 '    ',
 '    Test plan:',
 '    make test with latest semgrep',
 '    ',
 '    * Avoid constant propagation by using object() instead of str',
 '    ',
 '    Co-authored-by: Matt Schwager <matt@returntocorp.com>',
 '',
 'diff --git a/python/lang/correctness/common-mistakes/is-comparison-string.py b/python/lang/correctness/common-mistakes/is-comparison-string.py',
 'index 9fb45823..dbb76cf4 

In [92]:
# `git show` output of the first paired cherry commit, for comparison with the cherry-pick commit
repo_instance.git.show('e8aaa7dcceab7db818720445e497f98f97c5d6e3').split('\n')

['commit e8aaa7dcceab7db818720445e497f98f97c5d6e3',
 'Author: Yoann Padioleau <pad@returntocorp.com>',
 'Date:   Tue Sep 15 17:20:48 2020 +0200',
 '',
 '    * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix (#861)',
 '    ',
 '    * * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix',
 '    ',
 "    Add extra assign to not make 'x' a constant.",
 '    Indeed, Iago recently fixed a bug in the const-analysis which',
 '    now kick-in in this file and leads to more reported errors.',
 '    ',
 '    Related to https://github.com/returntocorp/semgrep/pull/1676',
 '    ',
 '    Test plan:',
 '    make test with latest semgrep',
 '    ',
 '    * Avoid constant propagation by using object() instead of str',
 '    ',
 '    Co-authored-by: Matt Schwager <matt@returntocorp.com>',
 '',
 'diff --git a/python/lang/correctness/common-mistakes/is-comparison-string.py b/python/lang/correctness/common-mistakes/is-comparison-string.py',
 'index 9fb45823..dbb76cf4 

In [94]:
# `git show` output of the second paired cherry commit, for comparison with the cherry-pick commit
repo_instance.git.show('ef75f21d13699affb9d53f96de06687847027e4d').split('\n')

['commit ef75f21d13699affb9d53f96de06687847027e4d',
 'Author: Yoann Padioleau <pad@returntocorp.com>',
 'Date:   Tue Sep 15 17:20:48 2020 +0200',
 '',
 '    * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix (#861)',
 '    ',
 '    * * python/lang/correctness/common-mistakes/is-comparison-string.py: Fix',
 '    ',
 "    Add extra assign to not make 'x' a constant.",
 '    Indeed, Iago recently fixed a bug in the const-analysis which',
 '    now kick-in in this file and leads to more reported errors.',
 '    ',
 '    Related to https://github.com/returntocorp/semgrep/pull/1676',
 '    ',
 '    Test plan:',
 '    make test with latest semgrep',
 '    ',
 '    * Avoid constant propagation by using object() instead of str',
 '    ',
 '    Co-authored-by: Matt Schwager <matt@returntocorp.com>',
 '',
 'diff --git a/python/lang/correctness/common-mistakes/is-comparison-string.py b/python/lang/correctness/common-mistakes/is-comparison-string.py',
 'index 9fb45823..dbb76cf4 