In [1]:
import json

In [2]:
# Manifest files listing the repositories under analysis.
repo_sets = [
    "eval/resources/python_repos_5k-1k.json",
    "eval/resources/python_repos_10k-5k.json",
    "eval/resources/python_repos_20k+.json"
]


def _load_json(path):
    """Parse the JSON document stored at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon
    as parsing finishes, rather than being left for the garbage collector.
    """
    with open(path, 'r') as handle:
        return json.load(handle)


# Flatten all manifests into a single list of repo records, preserving the
# order of `repo_sets` and the order of entries inside each manifest.
repos = [repo for repo_set in repo_sets for repo in _load_json(repo_set)]

In [3]:
# Display the flattened list of repo records loaded from the manifests.
repos

[{'url': 'https://github.com/mandarons/icloud-drive-docker.git',
  'categories': [1],
  'test_type': 'pytest',
  'relevant_docs': [],
  'tags': ['requirements', 'requirements-extra', 'bash-extra', 'pytest'],
  'working': True},
 {'url': 'https://github.com/typeddjango/django-stubs.git',
  'categories': [1],
  'test_type': 'pytest',
  'relevant_docs': ['CONTRIBUTING.md'],
  'tags': ['requirements', 'pytest'],
  'working': True},
 {'url': 'https://github.com/Pennyw0rth/NetExec.git',
  'categories': [3],
  'test_type': 'pytest',
  'relevant_docs': ['tests/README.md'],
  'working': True,
  'tags': ['pipx', 'poetry', 'poetry-extra', 'pytest']},
 {'url': 'https://github.com/CVHub520/X-AnyLabeling.git',
  'categories': [1],
  'test_type': 'unittest',
  'relevant_docs': ['docs/en/get_started.md'],
  'working': False,
  'tags': ['pip-extra', 'requirements', 'unittest']},
 {'url': 'https://github.com/open-compass/opencompass.git',
  'categories': [1],
  'test_type': 'unittest',
  'relevant_docs'

In [4]:
# Unique documentation paths referenced by any repo.
doc_files = {doc for repo in repos for doc in repo['relevant_docs']}

# Unique normalised document names: keep only the file name, cut it at the
# first dot, lower-case it, and treat '-' and '_' as the same character.
doc_files_clean = {
    doc.rsplit('/', 1)[-1].partition('.')[0].lower().replace('-', '_')
    for doc in doc_files
}


In [5]:
# Summary statistics about the repos and their relevant documentation,
# assembled first and then written out one per line.
summary_lines = (
    f"total number of repos: {len(repos)}",
    f"number of unique paths to relevant documentation: {len(doc_files)}",
    f"number of unique names of relevant documentation: {len(doc_files_clean)}",
    f"number of repos with no relevant documentation: {len([r for r in repos if len(r['relevant_docs']) == 0])}",
)
print("\n".join(summary_lines))

total number of repos: 36
number of unique paths to relevant documentation: 22
number of unique names of relevant documentation: 12
number of repos with no relevant documentation: 7


In [6]:
import os

dockerfiles_dir = 'resources/working_dockerfiles'
subdirs = os.listdir(dockerfiles_dir)
dockerfile_paths = []
dockerfiles = []
# One frozenset of tags per repo record that has a 'tags' key; frozensets
# are hashable, so they can be de-duplicated with set() later.
dockerfile_tags = [frozenset(repo['tags']) for repo in repos if 'tags' in repo]

# Line prefixes dropped from every Dockerfile: comments plus the FROM,
# WORKDIR and COPY instructions.
_SKIPPED_PREFIXES = ('#', 'FROM', 'WORKDIR', 'COPY')


def df_filter(x):
    """Return True if the stripped Dockerfile line ``x`` should be kept.

    A line is kept when it is non-empty and does not start with any of the
    prefixes in ``_SKIPPED_PREFIXES``.
    """
    return x != '' and not x.startswith(_SKIPPED_PREFIXES)


for subdir in subdirs:
    subdir_path = os.path.join(dockerfiles_dir, subdir)
    # Skip stray plain files at the top level; os.listdir() would raise
    # NotADirectoryError on them.
    if not os.path.isdir(subdir_path):
        continue
    for dockerfile in os.listdir(subdir_path):
        df = os.path.join(subdir_path, dockerfile)
        if os.path.isfile(df):
            # Read inside a `with` block so the handle is closed right away
            # instead of leaking one open file per Dockerfile.
            with open(df, 'r') as handle:
                stripped_lines = [line.strip() for line in handle]
            dockerfile_paths.append(df)
            dockerfiles.append(list(filter(df_filter, stripped_lines)))
print(len(dockerfile_paths))

24


In [7]:
# Number of tag sets collected, then how many of them are distinct.
print(len(dockerfile_tags), len(set(dockerfile_tags)), sep="\n")

24
16


In [13]:
# Imported here because this cell comes before the later cell that imports
# pprint; without it a top-to-bottom run on a fresh kernel raises NameError.
from pprint import pprint

# Show each distinct combination of tags exactly once.
pprint(set(dockerfile_tags))

{frozenset({'poetry', 'pytest', 'pytest-extra', 'poetry-extra'}),
 frozenset({'make-test', 'poetry', 'make-install'}),
 frozenset({'poetry', 'pytest', 'compile'}),
 frozenset({'install-pytest', 'poetry', 'pytest'}),
 frozenset({'poetry', 'pytest', 'poetry-extra'}),
 frozenset({'requirements', 'compile', 'unittest'}),
 frozenset({'requirements', 'pytest', 'compile'}),
 frozenset({'poetry', 'pytest', 'poetry-extra', 'pipx'}),
 frozenset({'make-test', 'requirements', 'requirements-extra'}),
 frozenset({'requirements', 'unittest', 'pip-extra'}),
 frozenset({'requirements', 'pytest'}),
 frozenset({'make-test', 'extra'}),
 frozenset({'requirements', 'pytest', 'requirements-extra', 'bash-extra'}),
 frozenset({'make-test', 'make-install'}),
 frozenset({'requirements', 'pytest', 'requirements-extra'}),
 frozenset({'poetry', 'pytest'})}


In [8]:
from pprint import pprint
# Show the filtered instruction lines kept for the first Dockerfile read.
pprint(dockerfiles[0])

['RUN pip install --no-cache-dir -r requirements.txt',
 'RUN pip install wheel && pip install --no-build-isolation --editable .',
 'RUN pip install pytest',
 'RUN pytest']


In [9]:
# Collapse each Dockerfile's kept lines into one newline-joined string so
# that Dockerfiles with identical instructions de-duplicate in the set.
df_set = {'\n'.join(instructions) for instructions in dockerfiles}

In [10]:
# Total Dockerfiles read, then how many have distinct filtered contents.
print(len(dockerfiles), len(df_set), sep="\n")

24
19


In [11]:
# List every collected Dockerfile path, in the order the files were read.
pprint(dockerfile_paths)

['resources/working_dockerfiles/20k+/thefuck.dockerfile',
 'resources/working_dockerfiles/20k+/spleeter.dockerfile',
 'resources/working_dockerfiles/20k+/textual.dockerfile',
 'resources/working_dockerfiles/20k+/open-interpreter.dockerfile',
 'resources/working_dockerfiles/20k+/fastapi.dockerfile',
 'resources/working_dockerfiles/20k+/spaCy.dockerfile',
 'resources/working_dockerfiles/20k+/rich.dockerfile',
 'resources/working_dockerfiles/5k-1k/icloud-drive-docker.dockerfile',
 'resources/working_dockerfiles/5k-1k/django-stubs.dockerfile',
 'resources/working_dockerfiles/5k-1k/torch-pruning.dockerfile',
 'resources/working_dockerfiles/5k-1k/r2r.dockerfile',
 'resources/working_dockerfiles/5k-1k/opencompass.dockerfile',
 'resources/working_dockerfiles/5k-1k/warehouse.dockerfile',
 'resources/working_dockerfiles/5k-1k/sabnzbd.dockerfile',
 'resources/working_dockerfiles/5k-1k/netexec.dockerfile',
 'resources/working_dockerfiles/5k-1k/dlt.dockerfile',
 'resources/working_dockerfiles/5k-1k