In [1]:
import json
from typing import Any, Dict

In [2]:
# Paths of the repository-list JSON files, one per size bucket. The bucket
# suffix (the part between the last '_' and '.json') is what get_dockerfile
# uses as the sub-directory name when looking up a dockerfile.
REPO_SETS = [
    f'eval/resources/python_repos_{bucket}.json'
    for bucket in ('5k-1k', '10k-5k', '20k-10k', '20k+')
]
# Root directory under which get_dockerfile looks for per-repo dockerfiles.
DOCKERFILE_PATH = "resources/working_dockerfiles"
def load_repos():
    """Load every repository list named in ``REPO_SETS``.

    Returns:
        dict mapping each path in ``REPO_SETS`` to the parsed JSON content of
        that file.
    """
    repos = {}
    for repo_set in REPO_SETS:
        # The context manager closes the handle as soon as parsing is done
        # instead of leaving it open until garbage collection.
        with open(repo_set, 'r') as f:
            repos[repo_set] = json.load(f)
    return repos

In [3]:
# Load all repository sets once; later cells read these records and annotate
# them in place with the computed heuristics.
repos = load_repos()

In [4]:
def calc_visbility(repo: Dict[str, Any]):
    """Score the "visibility" of a repo's documentation.

    Heuristic:
        1 / (num directories to navigate + num files to find)

    Directories shared by several entries of ``repo['relevant_docs']`` are
    counted once. Returns 0 when the repo lists no relevant docs.
    """
    visited = set()
    for doc_path in repo['relevant_docs']:
        parts = doc_path.split('/')
        # Record every prefix of the path: each directory on the way down,
        # and finally the file itself.
        for depth in range(1, len(parts) + 1):
            visited.add('/'.join(parts[:depth]))

    if not visited:
        return 0
    return 1 / len(visited)

In [5]:
# Preview the visibility score for the last repo set only; the score is also
# stored on each repo record under the 'visivility' key.
for repo in repos[REPO_SETS[-1]]:
    print(repo['url'])
    print(f" - {', '.join(repo['relevant_docs'])}")
    visivility = calc_visbility(repo)
    repo['visivility'] = visivility
    print(f" - {visivility}")


https://github.com/tiangolo/fastapi.git
 - docs/en/docs/contributing.md, docs/en/docs/tutorial/testing.md
 - 0.16666666666666666
https://github.com/psf/black.git
 - docs/contributing/the_basics.md
 - 0.3333333333333333
https://github.com/tqdm/tqdm.git
 - 
 - 0
https://github.com/Textualize/rich.git
 - CONTRIBUTING.md
 - 1.0
https://github.com/OpenInterpreter/open-interpreter.git
 - docs/CONTRIBUTING.md
 - 0.5
https://github.com/home-assistant/core.git
 - 
 - 0
https://github.com/sherlock-project/sherlock.git
 - 
 - 0
https://github.com/explosion/spaCy.git
 - README.md
 - 1.0
https://github.com/soimort/you-get.git
 - README.md
 - 1.0
https://github.com/Textualize/textual.git
 - CONTRIBUTING.md
 - 1.0


In [5]:
import os
from difflib import get_close_matches
from pprint import pprint
from tqdm import tqdm
from doc_test.agent.functions import _get_file_contents, get_api_url

def get_dockerfile(repo, set_name):
    """Read the reference dockerfile stored for ``repo``.

    The file is looked up at
    ``DOCKERFILE_PATH/<bucket>/<name>.dockerfile`` where ``<bucket>`` is the
    part of ``set_name`` after the last ``_`` with its last five characters
    (``.json``) removed, and ``<name>`` is the lower-cased last segment of
    ``repo['url']`` with its last four characters (``.git``) removed.

    Args:
        repo: repository record; only ``repo['url']`` is read.
        set_name: path of the repo-set JSON file the repo belongs to.

    Returns:
        The dockerfile as a list of lines with their line endings preserved,
        or the sentinel tuple ``(-2, [])`` when the file does not exist.
        Callers must check for the tuple before treating the result as lines.
    """
    name = repo['url'].split('/')[-1][:-4]
    set_name = set_name.split('_')[-1][:-5]
    dockerfile_path = os.path.join(DOCKERFILE_PATH, set_name, name.lower()+'.dockerfile')
    if not os.path.exists(dockerfile_path):
        print(f"path {dockerfile_path} does not exist")
        return -2, []

    # Close the handle deterministically instead of leaking it to the GC.
    with open(dockerfile_path, 'r') as f:
        dockerfile = f.readlines()

    return dockerfile

def calc_informativity(repo: Dict[str, Any], set_name: str):
    """Score the "informativity" of a repo's documentation.

    Heuristic:
        (Num lines in model dockerfile that appear in documentation)
        / (Num lines in model dockerfile total)

    Args:
        repo: repository record; reads ``url``, ``ref`` and ``relevant_docs``.
        set_name: path of the repo-set JSON file the repo belongs to.

    Returns:
        ``(score, matched_lines)`` where ``matched_lines`` maps each matched
        dockerfile command to a list of ``(doc line, doc path)`` pairs.
        Sentinels instead of a score:
          * ``(-1, [])`` when the repo lists no relevant docs,
          * ``(-2, [])`` when there is no dockerfile (propagated from
            ``get_dockerfile``) or it holds no commands to score.
    """
    # step 1: load dockerfile
    dockerfile = get_dockerfile(repo, set_name)
    if isinstance(dockerfile, tuple):
        # get_dockerfile reports a missing file as (-2, []), which already has
        # the (score, matches) shape of this function's result. Iterating it
        # as lines would fail on the integer.
        return dockerfile

    # Skip COPY/WORKDIR/blank lines and drop the leading instruction keyword
    # (FROM, RUN, ...) so only the command text is compared with the docs.
    # NOTE(review): '#' comment lines are kept and count towards the total;
    # confirm whether that is intended.
    commands = [
        ' '.join(line.split(' ')[1:]).strip()
        for line in dockerfile
        if line.strip() and not line.startswith(('COPY', 'WORKDIR'))
    ]
    # A single-token line (e.g. a lone '#') leaves an empty command, which is
    # a substring of every documentation line and would always "match".
    commands = [command for command in commands if command]

    # step 2: load documents
    doc_paths = repo['relevant_docs']
    if len(doc_paths) == 0:
        return -1, []
    if not commands:
        # Nothing to score; avoids dividing by zero below.
        return -2, []
    api_url = get_api_url(repo['url'])
    documents = [_get_file_contents(api_url, path, repo['ref']).split('\n') for path in doc_paths]

    # step 3: find dockerfile lines that occur in documentation
    matched_lines = {}
    for doc, doc_name in zip(documents, doc_paths):
        for line in doc:
            matches = get_close_matches(line, commands, cutoff=0.8)
            if matches:
                # Fuzzy hit: credit only the closest command.
                hits = matches[:1]
            else:
                # Fallback: the command appears verbatim inside a longer line.
                hits = [command for command in commands if command in line]
            for command in hits:
                matched_lines.setdefault(command, []).append((line, doc_name))

    return (len(matched_lines) / len(commands), matched_lines)

In [7]:
# CALCULATE BOTH HEURISTICS FOR ALL REPOS
# Each score is printed and also stored on the repo record ('info' and
# 'visivility' keys) so it can be written back to disk afterwards.
for repo_set in REPO_SETS:
    for repo in repos[repo_set]:
        print(repo['url'])
        print(f" - {', '.join(repo['relevant_docs'])}")

        # calc_informativity returns (score, matched_lines); only the score is kept.
        info = calc_informativity(repo, repo_set)
        repo['info'] = info[0]
        print(f" - informativity = {info[0]}")

        visivility = calc_visbility(repo)
        repo['visivility'] = visivility
        print(f" - visibility = {visivility}")

https://github.com/mandarons/icloud-drive-docker.git
 - 
 - informativity = -1
 - visibility = 0
https://github.com/typeddjango/django-stubs.git
 - CONTRIBUTING.md
 - informativity = 0.3333333333333333
 - visibility = 1.0
https://github.com/Pennyw0rth/NetExec.git
 - tests/README.md
 - informativity = 0.2
 - visibility = 0.5
https://github.com/CVHub520/X-AnyLabeling.git
 - docs/en/get_started.md
 - informativity = 0.3333333333333333
 - visibility = 0.3333333333333333
https://github.com/open-compass/opencompass.git
 - README.md, docs/en/get_started/installation.md
 - informativity = 0.4
 - visibility = 0.2
https://github.com/SciPhi-AI/R2R.git
 - 
 - informativity = -1
 - visibility = 0
https://github.com/VainF/Torch-Pruning.git
 - README.md
 - informativity = 0.0
 - visibility = 1.0
https://github.com/pypi/warehouse.git
 - docs/dev/development/getting-started.rst
 - informativity = 0.3333333333333333
 - visibility = 0.25
https://github.com/sabnzbd/sabnzbd.git
 - README.md
 - informativit

KeyboardInterrupt: 

In [None]:
# WRITE HEURISTICS
# Persist the annotated repo records back to the JSON files they were loaded from.
for repo_set in REPO_SETS:
    # Serialise first: open(..., 'w') truncates the file immediately, so a
    # failure while encoding would otherwise leave the source JSON empty or
    # half-written.
    serialised = json.dumps(repos[repo_set])
    with open(repo_set, 'w') as f:
        f.write(serialised)

In [6]:
from vm_control import VMController, test_dockerfile
from doc_test.utils import notify
# TEST DOCKERFILES
# Build and test every repo's reference dockerfile; `results` maps repo name
# to whatever test_dockerfile returned (None when no dockerfile was found).
results = {}
i = 0
# Derive the progress denominator from the loaded data instead of hard-coding it.
total = sum(len(repos[repo_set]) for repo_set in REPO_SETS)
for repo_set in REPO_SETS:
    print(repo_set)
    for repo in repos[repo_set]:
        # Last URL segment without its 4-character '.git' suffix.
        name = repo['url'].split('/')[-1][:-4]
        i += 1

        dockerfile = get_dockerfile(repo, repo_set)
        if isinstance(dockerfile, tuple):
            # get_dockerfile returns the (-2, []) sentinel for a missing file;
            # record it and carry on rather than aborting the whole batch.
            results[name] = None
            notify(f" - ({i}/{total}) {name}: no dockerfile found, skipped")
            continue

        vmc = VMController(f"working_logs/{name}.log")
        results[name] = test_dockerfile(
            repo['url'],
            # readlines() keeps each line's terminator, so join with '' to
            # reproduce the file exactly instead of doubling every newline.
            "".join(dockerfile),
            name,
            vmc,
            repo['ref']
        )
        vmc.clear_cache()
        notify(f" - ({i}/{total}) {name}: {results[name]}")

eval/resources/python_repos_5k-1k.json
# Doesnt work with python 3.8

FROM python:3.10



COPY . /app/



WORKDIR /app



RUN pip install -r requirements.txt

RUN pip install -r requirements-test.txt



# Commands to create required directories, taken from repo's workflows

RUN mkdir /config /icloud

RUN chown $(id -u) /config /icloud



RUN pytest

attempting to build using dockerfile, logs written to working_logs/icloud-drive-docker.log.
/tmp/tmp.7GpV2w7rcK/icloud-drive-docker




In [None]:
# Look up the recorded test outcome for the repo named 'pymc'.
results['pymc']

True

In [6]:
from vm_control import VMController, test_dockerfile

def get_repo(name):
    """Find repos whose URL contains ``name``, ignoring case.

    Returns:
        list of ``(repo, repo_set)`` tuples, ordered by ``REPO_SETS`` and then
        by position within each set; empty when nothing matches.
    """
    found = []
    for repo_set in REPO_SETS:
        for candidate in repos[repo_set]:
            if name.lower() in candidate['url'].lower():
                found.append((candidate, repo_set))
    return found

def test_repo(repo, repo_set):
    name = repo['url'].split('/')[-1][:-4]
    vmc = VMController(f"working_logs/{name}.log")
    result = test_dockerfile(
        repo['url'],
        "\n".join(get_dockerfile(repo, repo_set)),
        name,
        vmc,
        repo['ref']
    )
    vmc.clear_cache()
    return result

In [19]:
# Show the first (repo record, repo-set path) pair whose URL contains "modelscope".
get_repo("modelscope")[0]

({'url': 'https://github.com/modelscope/modelscope.git',
  'categories': [1],
  'test_type': 'make',
  'relevant_docs': ['docs/source/develop.md'],
  'working': False,
  'tags': ['requirements', 'requirements-extra', 'make-test'],
  'ref': '469159d',
  'info': 0.4,
  'visivility': 0.3333333333333333},
 'eval/resources/python_repos_10k-5k.json')

In [20]:
# Run the dockerfile test for the first repo whose URL contains "modelscope";
# the cell displays test_repo's return value.
test_repo(*get_repo("modelscope")[0])

FROM python:3.10



COPY . /app/



WORKDIR /app



RUN apt-get update && apt-get install libgl1  -y

RUN pip install onnxruntime

RUN pip install -r requirements.txt



RUN python -m unittest tests/test_utils/test_general.py

attempting to build using dockerfile, logs written to working_logs/X-AnyLabeling.log.
/tmp/tmp.0ZWaTB55zZ/X-AnyLabeling




At least 1 test passed.
Docker build completed successfully on virtual machine.
Untagged: temp_image:latest
Deleted: sha256:b4931d337adddb635a5f77a80de50460eb71e69979299c95f99159b722f4ee41
Deleted build cache objects:
l2phfnzfveor1hg9sp8bz40ee
w6ptkhhvfwvrmleaohx8fc7vw
ctobcj8q5tr1inhdxyt91ht7b
l692i196n3ru895mr7l7qjago
rmxbesy425rfdwdhgn97ftyuz
1x6i0acopzuytp9qdqluhmxv0
68aeuvb6s1warw1pscf48shqc
1ieo2qjoqtbd8obtinani6d5w
p35xzuqjp2hdym88hgfoy0w90
nktmof10bieim1jb7gjmmqye8
dktxi6qg4bgj5pkz7zvi8eyyu
zi1cpz9vfmtmes2ci76hk1cck
x8c9cwh55sot5jaucmbyiud92
pkoz7se2p01riw755w43b8v3w
zshu673w946ex9ltio12u9cn8
eqngrf8dec542jjdp6n54l7un

Total reclaimed space: 1.703GB


rm: cannot remove '/var/lib/docker/overlay2/*': Permission denied
Failed to restart docker.service: Interactive authentication required.
See system logs and 'systemctl status docker.service' for details.


True