In [1]:
import json
from typing import Any, Dict

In [2]:

from doc_test.consts import REPO_SETS


DOCKERFILE_PATH = "resources/dataset/dockerfiles"
def load_repos():
    repos = {repo_set: json.load(open(repo_set, 'r')) for repo_set in REPO_SETS.values()}
    return repos

In [3]:
repos = load_repos()

In [4]:
def calc_visbility(repo: Dict[str, Any]):
    # Calculate the "visibility" of a repo's documentation based on the following heursitic
    # 1 / (num directories to navigate + num files to find)
    dirs = [relevant.split('/') for relevant in repo['relevant_docs']]
    paths = [['/'.join(repo_dirs[:i+1]) for i in range(len(repo_dirs))] for repo_dirs in dirs]
    visibility = set([d for directories in paths for d in directories])
    return 1 / len(visibility) if len(visibility) > 0 else 0

In [5]:
for repo in repos[REPO_SETS[-1]]:
    print(repo['url'])
    print(" - " + ', '.join(repo['relevant_docs']))
    visibility = calc_visbility(repo)
    repo['visibility'] = visibility
    print(" - " + str(visibility))


https://github.com/tiangolo/fastapi.git
 - docs/en/docs/contributing.md, docs/en/docs/tutorial/testing.md
 - 0.16666666666666666
https://github.com/psf/black.git
 - docs/contributing/the_basics.md
 - 0.3333333333333333
https://github.com/tqdm/tqdm.git
 - 
 - 0
https://github.com/Textualize/rich.git
 - CONTRIBUTING.md
 - 1.0
https://github.com/OpenInterpreter/open-interpreter.git
 - docs/CONTRIBUTING.md
 - 0.5
https://github.com/home-assistant/core.git
 - 
 - 0
https://github.com/sherlock-project/sherlock.git
 - 
 - 0
https://github.com/explosion/spaCy.git
 - README.md
 - 1.0
https://github.com/soimort/you-get.git
 - README.md
 - 1.0
https://github.com/Textualize/textual.git
 - CONTRIBUTING.md
 - 1.0


In [8]:
import os
from difflib import get_close_matches
from pprint import pprint
from tqdm import tqdm
from doc_test.agent.functions import _get_file_contents, get_api_url

def get_dockerfile(repo, set_name):
    name = repo['url'].split('/')[-1][:-4]
    set_name = set_name.split('_')[-1][:-5]
    dockerfile_path = os.path.join(DOCKERFILE_PATH, set_name, name.lower()+'.dockerfile')
    if not os.path.exists(dockerfile_path):
        print(f"path {dockerfile_path} does not exist")
        return -2, []
    dockerfile = open(dockerfile_path, 'r').readlines()

    return dockerfile

def calc_informativity(repo: Dict[str, Any], set_name: str):
    # Calculate the "informativity" of a repo's documentation based on the following heuristic
    # (Num lines in model dockerfile that appear in documentation) / (Num lines in model dockerfile total)

    # step 1: load dockerfile
    dockerfile = get_dockerfile(repo, set_name)

    dockerfile = [
        ' '.join(line.split(' ')[1:]).strip()
        for line in dockerfile
        if not (
            line.startswith('COPY') or line.startswith('WORKDIR') or line.strip() == ''
        )
    ]

    # step 2: load documents
    if len(repo['relevant_docs']) == 0:
        return -1, []
    api_url = get_api_url(repo['url'])
    documents = [_get_file_contents(api_url, path, repo['ref']).split('\n') for path in repo['relevant_docs']]

    # step 3: find dockerfile lines that occur in documentation
    matched_lines = {}
    for doc, doc_name in zip(documents, repo['relevant_docs']):
        for line in doc:
            matches = get_close_matches(line, dockerfile, cutoff=0.8)
            if len(matches) > 0:
                if matches[0] not in matched_lines:
                    matched_lines[matches[0]] = []
                matched_lines[matches[0]].append((line, doc_name))
            else:
                for df_line in dockerfile:
                    if df_line in line:
                        if df_line not in matched_lines:
                            matched_lines[df_line] = []
                        matched_lines[df_line].append((line, doc_name))

    return (len(matched_lines) / len(dockerfile), matched_lines)

In [9]:
# CALCULATE BOTH HEURISTICS FOR ALL REPOS
for repo_set in REPO_SETS:
    for repo in repos[repo_set]:
        print(repo['url'], repo['ref'])
        print(" - " + ', '.join(repo['relevant_docs']))
        if 'info' not in repo or repo['info'] is None:
            info = calc_informativity(repo, repo_set)
            repo['info'] = info[0]
            print(" - informativity = " + str(info[0]))
        if 'visibility' not in repo or repo['visibility'] is None:
            visibility = calc_visbility(repo)
            repo['visibility'] = visibility
            print(" - visibility = " + str(visibility))

https://github.com/mandarons/icloud-drive-docker.git 8cbcc2c
 - 
https://github.com/typeddjango/django-stubs.git b325955
 - CONTRIBUTING.md
https://github.com/PennyLaneAI/pennylane.git b78565c
 - .github/CONTRIBUTING.md, doc/development/guide/installation.rst, doc/development/guide/tests.rst
https://github.com/CVHub520/X-AnyLabeling.git 7c62ffb
 - docs/en/get_started.md
https://github.com/open-compass/opencompass.git 7c7fa36
 - README.md, docs/en/get_started/installation.md
https://github.com/SciPhi-AI/R2R.git 712defb
 - docs/documentation/installation/local-system.mdx
https://github.com/VainF/Torch-Pruning.git 88e7472
 - README.md
https://github.com/scverse/scvi-tools.git e8f9b36
 - docs/developer/code.md
 - informativity = 0.3333333333333333
 - visibility = 0.3333333333333333
https://github.com/sabnzbd/sabnzbd.git 07250aa
 - README.md
https://github.com/dlt-hub/dlt.git 6451bd7
 - CONTRIBUTING.md
https://github.com/camel-ai/camel.git b8e1f5c
 - README.md, CONTRIBUTING.md, docs/get_sta

In [10]:
# WRITE HEURISTICS  
for repo_set in REPO_SETS:
    with open(repo_set, 'w') as f:
        json.dump(repos[repo_set], f)

In [6]:
from vm_control import VMController, test_dockerfile
from doc_test.utils import notify

def get_repo(name):
    return [(r, repo_set) for repo_set in REPO_SETS for r in repos[repo_set] if name.lower() in r['url'].lower()]

def test_repo(repo, repo_set):
    name = repo['url'].split('/')[-1][:-4]
    vmc = VMController(f"working_logs/{name}.log")
    result = test_dockerfile(
        repo['url'],
        "\n".join(get_dockerfile(repo, repo_set)),
        name,
        vmc,
        repo['ref']
    )
    vmc.clear_cache()
    return result

In [7]:
get_repo("scvi")[0]

({'url': 'https://github.com/scverse/scvi-tools.git',
  'categories': [4],
  'test_type': 'pytest',
  'relevant_docs': ['docs/developer/code.md'],
  'tags': ['install-self', 'pytest', 'pytest-extra'],
  'ref': 'e8f9b36'},
 'eval/resources/python_repos_5k-1k.json')

In [12]:
test_repo(*get_repo("scvi")[0])

FROM python:3.10

WORKDIR /app

COPY . /app/



RUN pip install -e ".[dev]"



RUN python -m pytest -x

attempting to build using dockerfile, logs written to working_logs/scvi-tools.log.
/tmp/tmp.CkSCrHVwDa/scvi-tools




At least 1 test passed.
Docker build completed successfully on virtual machine.


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
iassgcruzgc3tr9j8uxen3c1s
bxnyo2fy2b42p2x86gqqxxoak
jibxs9u3elymjz9325tsx5rfw
wkrbtczvom6l5fl6vfnwy3062
azozrnz2699wlkwmqpxet1yj9
lk3hjhsg9bh57yswo8yj47wdf
rwxt085x2mxz21ffyx0fkf1yg
w5vy3n6likwi5r30edt94p8fq
pkn4o6w93y4rymxovivu4m4r6
nzgqp30qca5c486uny9ydxuh5
ga5b4uixh96ibxkxscvpphof0
blklpj2trz4fukgsw9t51asu0
gui7hekjk7jjr9j0nbu1ymnpb
plgwn0lios7kgin2a021oy709
fq6hnrhruuud3eltxobr0qybq
a6qyu3d8c6flryi575zctcphd

Total reclaimed space: 22.9GB


True

In [21]:
notify("come back!")

come back!


In [12]:
repeat = [
    'r2r',
    'torch-pruning',
    'boto3',
    'cloud-custodian',
    'aim',
    'instructor',
    'yfinance',
    'datasets',
    'sympy',
    'core'
    ]

In [17]:
for i, r in enumerate(repeat):
    success = test_repo(*get_repo(r)[0])
    notify(f"({i}/{len(repeat)}) {r}: {success}")

FROM python:3.10



COPY . /app/



WORKDIR /app



RUN pip install -e .



RUN python -m unittest discover

attempting to build using dockerfile, logs written to working_logs/opencompass.log.
/tmp/tmp.62i7JxsKSP/opencompass




At least 1 test passed.
Docker build completed successfully on virtual machine.
Untagged: temp_image:latest
Deleted: sha256:6f6a48576883e124fcf1b6d8248a045959c99e32fb54022c30f337fceb95439e
Deleted build cache objects:
vfo1myje4kkm1ioo4sh8xvxn9
zrcw7cdv3ibetgcq0j96nx0v7
ianbb455cmioxvex93ex6ypu6
1snzd7lbvzd16yrvw05getlzh
m3j4xe2h7n9x2qq4c68u0a0v0
sdy4nab14jd7zaeeasteil4ie
9v8erlkohv21mkns95i63tpjd
mf8fakufezsauc9qspu4l4fvc
qr0zfd6xmzt3nrul5lo0x84wb
r0f9705td7i63xwy8truwglvi
yke383yzije6hnbq0v54832gc
afvvlxd21zneach04tqdisvbz
pusm9qoxy9b73zp7qfpzh57jg
nj75dy0710nd7q5hgwqlrf6vl

Total reclaimed space: 9.247GB
(0/12) opencompass: True
FROM python:3.11

WORKDIR /app

COPY . /app/

RUN pip install poetry

RUN poetry install

RUN poetry run pytest


attempting to build using dockerfile, logs written to working_logs/R2R.log.
/tmp/tmp.wLJJKS70R3/R2R


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
1q3rnz8iljmxngv1z2nieadru
ufm2ychjoakvqk3z1dj7vhysn
yt3yo5d7butcc9h3c4hdkny88
k1u32mzafpuwm3bcphubs1jjy
bm8xpqrb7aufl6tn6fylrps01
c884nju7wnitb3nzgjpv2ekjy
8o13nohj4w0k9m9wdiamhg7lb
u8ylnjwqabkep1sbftz31gt4l
n9f7myt6dd9ip2scgbkk4lw22
nar5d4hvzsgl4tqf87kn4qt9d
hy2b5twi5ze41q35m1ze1kghb
mz4pix5yhbaoh0tgg43dzamb3
rjrh3r6nngztkagx1erids3os

Total reclaimed space: 209.8MB
(1/12) r2r: False
FROM python:3.10

WORKDIR /app

COPY . /app/



RUN pip install -r requirements.txt

RUN pip install torchvision

RUN pip install pytest



RUN pytest

attempting to build using dockerfile, logs written to working_logs/Torch-Pruning.log.
/tmp/tmp.eHF0tkpwhB/Torch-Pruning


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
piju5f38qwl69gs39x9qog99u
um63ffpqssmizsyo3ei2xo23u
rjqh38lnjwsjev9mggf996psl
7afvec7cexzz0sf29hbmnmuow
aye6u3vvzo8k199qwmjjau8t4
udyhmxpxwssks9eqampatdupf
vab3briduekfhvjvtre7dnozo
7fulxdcs7euqahc47blrtwge9
1hhwhkqp2nlq1ml5nlxr0wytz
rzr506fv2tfq4px4wjle8gdy4
2br8ke99hs05tyy8voum2yead
tjdwk6s5nedqmgtnesnidl2zb

Total reclaimed space: 28.24MB
(2/12) torch-pruning: False
FROM python:3.10

WORKDIR /app

COPY . /app/

RUN pip install -r requirements.txt

RUN make tests

attempting to build using dockerfile, logs written to working_logs/warehouse.log.
/tmp/tmp.T4cP83QUVy/warehouse


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
em92784d5t8oonbc6mvu8sym8
g0pixpgskpxbog14an50v3uw9
zuw4ak7eyr3f4llx3pt6ulanz
sw0q97ommhfnyrzqhnjvd9sic
dpfvjxd57qxv7liqpaea3lwk8
etvghcuab25pkm5o008f7ig3k
jru1kmcb985bxcqpkfuxyu4ea
848uow113uor29sw8vdh8yr1u
qhdol0yos1v55j8jxjlildtuj
3qpra6tnvw0x1cos3l64chgsb
p6yc74d2xp4d3xlya3n483epu
jqlvcku4woer9xkszqffzdua0

Total reclaimed space: 1.013GB
(3/12) warehouse: False
FROM python:3.8



COPY . /app/



WORKDIR /app



RUN pip install -r requirements.txt

RUN pip install -r requirements-dev.txt



RUN pytest

attempting to build using dockerfile, logs written to working_logs/boto3.log.
/tmp/tmp.aa5XYPZtq7/boto3


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
xxbg83chndijb344jz8eptuei
m84v8atyyisxttzrdd7wilbgs
t4atwtrk4n4ryenwg7pwbjeo8
7u05vmmsyiixymvj1divqv3qy
i7u8cuh45lifv1yvdo1b0ycqb
v7i4uu8688v1b7jzz3fla0235
8pn4egnam52w3pbibv3zzibdw
dzymwiecxdflt6tm87dclw2xe
7hcxb31hiki6th2urms8yglxk
kxlva6yfs9tjtbvbb67u84z18
uwcc2v06jin4tirk68unhgeun
mba2userhh2c1k3dn5x35d6nl
a1oifhgl586p6bvu9rgjxo7wn
ia1aq39wdhxrne75ctrdkqrj5

Total reclaimed space: 188.4MB
(4/12) boto3: False
# Use the official Python image as base

FROM python:3.8



# Set the working directory in the container

WORKDIR /app



# Copy the entire repository into the container

COPY . /app/



# Install Poetry

RUN pip install poetry



# Installing project dependencies

RUN poetry install



RUN poetry run make install



RUN poetry run make test

attempting to build using dockerfile, logs written to working_logs/cloud-custodian.log.
/tmp/tmp.cLnMiixV8M/cloud-custodian


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
oa7bisif55t04d9n79h0wxr1r
gcrg3sy37mnaqubzzf9g7cbdv
qmunjp0udyc9y97xxgc34vdvm
t3ehayivhn4e2ok5bpsu0o78a
ktdekmywgh2wt4qtckrobia1o
qbczf9swqfk7biytimxdgwrij
ajf01nvxkcbdbvd2ip9ugktka
xoz7d3jpjwjz1wa9mk90esjuw
qjmvh2o743ny6b40na1h5w9xs
kai32nwo86edg16zyokxab5sy
p8lf0y8i0bacgud0ogqgbkxwg
nkr38hrdchp03w3pd5rfrpyys
hitc7ig4l8inccim9ry3kuk4v

Total reclaimed space: 91.75MB
(5/12) cloud-custodian: False
FROM python:3.8



COPY . /app/



WORKDIR /app



RUN pip install -r tests/requirements.txt



RUN pytest

attempting to build using dockerfile, logs written to working_logs/aim.log.
/tmp/tmp.1GOQ9TZvLV/aim


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
sffauzuuxtmt4pzey3ttnccd2
0gzb03gmd0ai6stheszd64tlx
x3h3pnh2q9qr4jn9wwya3srwv
8hw6pvs0cqurdy53t8km8g4xr
c3ux8uj5o5dfg1f52szvjntc3
lw2sbze3jfgpzrk2ttjo76fyz
tolufom0uodh1gwx8k5zq1i78
h4q0b4hdkspcrt2uv2d3jlov4
x4tum9f9fltujb86obibzg99f
qdnqbvua8coc45bv1n5rli6lk
ops3rlr5186fpkflbxr92rab4
31zp6anrdp12zxv583k2fuwl9

Total reclaimed space: 224.6MB
(6/12) aim: False
# Use the official Python image as base

FROM python:3.8



# Set the working directory in the container

WORKDIR /app



# Copy the entire repository into the container

COPY . /app/



# Install Poetry

RUN pip install poetry



# Installing project dependencies

RUN poetry install



# Run the test suite

RUN poetry run pytest

attempting to build using dockerfile, logs written to working_logs/instructor.log.
/tmp/tmp.zho2nvuzFp/instructor


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
ndbt3091ypg8s2mfkgrk5rh4v
23ufrf32wf6mu8oeg4ahvq7c4
z7wffmesvo2m2x415r2x3anto
k7ai69si4ltvw3e3utt1b3tpn
wjf5quzj2uollf1spwwwotaf6
v1ht2pxfbjjd6vnjda5fc0yxk
9ehtirgyx3lkgj503l8p1di1t
yootybus45pke6lpwwfy2t2vm
twqe8b7hm5esn6ir6y8r4hu7r
1ppm411bwh4bvs862o2e0jg4u
q8ilafim4pc8oxvto1vweallv
0s3gmxywf4sr3v53dw6koyus5
jqng18j5fnzgd4dehxq7zpcuj
7rqwmtrt4qqelj063wij8qpaq

Total reclaimed space: 1.288GB
(7/12) instructor: False
FROM python:3.8



COPY . /app/



WORKDIR /app



RUN pip install -r requirements.txt

RUN pip install requests_cache requests_ratelimiter



RUN python -m unittest discover

attempting to build using dockerfile, logs written to working_logs/yfinance.log.
/tmp/tmp.Ll2ZcZliDb/yfinance


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
dj8w3umexs1q2gib7betx8awu
jb2urzy15nlm64rifg0y1s3qs
2u5zy1rh0gqjdve7vl0gvl76g
4rqdmc3eokf4lay8roqde8a72
s1bgflq5v1z6gydwlwioxox8s
z3ke2k79ylclkon62c5p23x45
gctuhxa6ayasg4f62hncpskb5
0v9wtzc7sj9aeaxfcsggukfoi
curb5025av3z8sx1lk7ohrogw
m8sh4xwudezidpo4pfipbx8cp
y23o3mmigijmz5wtpexjgflgs
9actk1xl2v2wm68zfwnxap6d9
opxu92u7m6b09a96y9w4alug1
r0c2ftk1ol9vn5otm44l73x8o

Total reclaimed space: 214.8MB
(8/12) yfinance: False
FROM python:3.11



COPY . /app/



WORKDIR /app



RUN pip install -r ".[dev]"



RUN make test

attempting to build using dockerfile, logs written to working_logs/datasets.log.
/tmp/tmp.J9aXtzZ21K/datasets


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
t2wgofijwhdiawg5onkptr75s
yjnb6vuxn6gw5vryxvkvfx7wk
3didvs8t0wzjwp7vtd0sgc3pc
qes09ydwnhex9p9ffsp0nenjt
6b0dvltb0sgv9ay7k2byg2sg3
frtzxmzotbk5m58m89vdn23rw
4ditgk1rc85ezn66og1uww9nl
bwz3sb49w9emre5y380jalc5b
y10na5d2i7vghq9xq83jt2s85
lz6ufalrqgnnrmwkos8ake446
s4kleph0yj6tpybmowhr2l1n7
wra1479jedwc221lp3p2utmai

Total reclaimed space: 196MB
(9/12) datasets: False
FROM python:3.11



COPY . /app/



WORKDIR /app



RUN pip install mpmath



RUN python setup.py test

attempting to build using dockerfile, logs written to working_logs/sympy.log.
/tmp/tmp.Z8reZlwEE8/sympy




INTERRUPTING


Error response from daemon: No such image: temp_image:latest


Total reclaimed space: 0B
(10/12) sympy: False
FROM python:3.12



COPY . /app/



WORKDIR /app



RUN pip install -r requirements_all.txt

RUN pip install -r requirements_test_all.txt



RUN python -m unittest discover

attempting to build using dockerfile, logs written to working_logs/core.log.
/tmp/tmp.jHEGzQ6SO9/core


Error response from daemon: No such image: temp_image:latest


Deleted build cache objects:
w23rbu8i9j9931maodgopm4lb
xvxpj4rz1zd9dt7teqxsuut11
skf6lpnzp5w77szvqcjdnicpx
lc97xm45p4xy07exzvl9pyb3u
mdpi3oyfg150q6jczchq74yn4
23ep90s604evqt2v7uy731zoy
f927f7t35q401t1dpb6kzzkep
rabrkerayla8x95uuy21l25xh
l6qgwc4f1vy79g3nkayi5x6hp
uz95q3wbxpgvqaopeajhdgbw0
k9rguazt3pqenm6bssur1v3db
lexaozqxz5y7w714i8dxbpz8r
16br5uvi1ix2jqg2lr5338rzc
xxjxbtaeynm9l65c27lffytzs
yixgax0oyr4tvmavdxaw7o5um
jvslqbg6hzhf5f6b6owja8ifw
gpjfiynwu0a7vwf111ph8h45z
hesecll5o3ivs09nwhpo61iu3
pjigdjjcjg1c0gdo1he552ryt
2vfbes6c30g9448spirtoryfn
xu7sq7pa4vxc197ubck18xeya
xa7cwx49wjhs1x8tqoq5e8kl0
n3tcqfqy6suhbai8zgxaysmxo

Total reclaimed space: 4.283GB
(11/12) core: False
