In [7]:
import json
from typing import Any, Dict

In [8]:
# Paths of the JSON files that hold the repo records, one file per repo set.
# Each path doubles as the key of the dict returned by load_repos().
REPO_SETS = [
    'eval/resources/python_repos_5k-1k.json',
    'eval/resources/python_repos_10k-5k.json',
    'eval/resources/python_repos_20k-10k.json',
    'eval/resources/python_repos_20k+.json'
]
# Root directory searched by calc_informativity for `<set>/<repo>.dockerfile` files.
DOCKERFILE_PATH = "resources/working_dockerfiles"
def load_repos():
    """Parse every file listed in REPO_SETS.

    Returns:
        A dict mapping each path in REPO_SETS to the parsed JSON content of
        that file, in REPO_SETS order.

    Raises:
        OSError: if one of the files cannot be opened.
        json.JSONDecodeError: if one of the files is not valid JSON.
    """
    repos = {}
    for repo_set in REPO_SETS:
        # Use a context manager so the handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(repo_set, 'r') as f:
            repos[repo_set] = json.load(f)
    return repos

In [9]:
# Parse all repo-set files once; later cells read these records and add
# score keys to them in place before writing them back to disk.
repos = load_repos()

In [10]:
def calc_visbility(repo: Dict[str, Any]):
    """Score how easy a repo's relevant documentation is to find.

    Heuristic: 1 / (number of directories to navigate + number of files to
    find). Every distinct path prefix of every entry in
    ``repo['relevant_docs']`` counts once, so directories shared by several
    documents are not counted twice.

    Args:
        repo: Repo record; only the 'relevant_docs' list of '/'-separated
            paths is read.

    Returns:
        ``1 / <number of distinct prefixes>`` as a float, or the int ``0``
        when there are no relevant docs.
    """
    prefixes = set()
    for doc_path in repo['relevant_docs']:
        parts = doc_path.split('/')
        # 'a/b/c.md' contributes 'a', 'a/b' and 'a/b/c.md'.
        for depth in range(1, len(parts) + 1):
            prefixes.add('/'.join(parts[:depth]))
    if not prefixes:
        return 0
    return 1 / len(prefixes)

In [43]:
# Preview the visibility score for the last repo set only, storing the score
# on each record as it is printed.
for repo in repos[REPO_SETS[-1]]:
    print(repo['url'])
    docs_listing = ', '.join(repo['relevant_docs'])
    print(" - " + docs_listing)
    # The key spelling is kept as-is because it ends up in the JSON files
    # written back to disk.
    repo['visivility'] = visivility = calc_visbility(repo)
    print(f" - {visivility}")


https://github.com/tiangolo/fastapi.git
 - docs/en/docs/contributing.md, docs/en/docs/tutorial/testing.md
 - 0.16666666666666666
https://github.com/psf/black.git
 - docs/contributing/the_basics.md
 - 0.3333333333333333
https://github.com/tqdm/tqdm.git
 - 
 - 0
https://github.com/Textualize/rich.git
 - CONTRIBUTING.md
 - 1.0
https://github.com/OpenInterpreter/open-interpreter.git
 - docs/CONTRIBUTING.md
 - 0.5
https://github.com/home-assistant/core.git
 - 
 - 0
https://github.com/sherlock-project/sherlock.git
 - 
 - 0
https://github.com/explosion/spaCy.git
 - README.md
 - 1.0
https://github.com/soimort/you-get.git
 - README.md
 - 1.0
https://github.com/Textualize/textual.git
 - CONTRIBUTING.md
 - 1.0


In [11]:
import os
from difflib import get_close_matches
from pprint import pprint
from tqdm import tqdm
from doc_test.agent.functions import _get_file_contents, get_api_url

def calc_informativity(repo: Dict[str, Any], set_name: str):
    """Score how much of a repo's model Dockerfile is reflected in its docs.

    Heuristic: (number of distinct Dockerfile lines that appear in the
    documentation) / (number of Dockerfile lines considered).

    Args:
        repo: Repo record; reads 'url', 'relevant_docs' and 'ref'.
        set_name: Path of the repo-set file the repo belongs to. The part of
            its file name after the last underscore, without the extension,
            selects the sub-directory of DOCKERFILE_PATH.

    Returns:
        A ``(score, matches)`` tuple:

        * ``(-2, [])`` when no Dockerfile exists for the repo.
        * ``(-1, [])`` when the repo lists no relevant docs.
        * ``(0, {})`` when the Dockerfile has no comparable lines.
        * otherwise ``(ratio, matched_lines)``, where ``matched_lines`` maps
          each matched Dockerfile line to a list of
          ``(documentation line, document path)`` pairs.
    """
    # step 1: load dockerfile
    name = repo['url'].split('/')[-1]
    # Only drop a real '.git' suffix; blindly slicing four characters would
    # corrupt the name of a URL that does not end in '.git'.
    if name.endswith('.git'):
        name = name[:-4]
    set_dir = os.path.splitext(os.path.basename(set_name))[0].split('_')[-1]
    dockerfile_path = os.path.join(DOCKERFILE_PATH, set_dir, name+'.dockerfile')
    if not os.path.exists(dockerfile_path):
        print(f"path {dockerfile_path} does not exist")
        return -2, []
    with open(dockerfile_path, 'r') as f:
        raw_lines = f.readlines()

    # Keep only the argument part of each instruction (drop the leading
    # keyword). COPY/WORKDIR and blank lines are excluded from the comparison.
    dockerfile = []
    for line in raw_lines:
        if line.startswith(('COPY', 'WORKDIR')) or line.strip() == '':
            continue
        command = ' '.join(line.split(' ')[1:]).strip()
        # A single-token line leaves an empty string, which is a substring of
        # every documentation line and would count as always matched.
        if command:
            dockerfile.append(command)

    # step 2: load documents
    if len(repo['relevant_docs']) == 0:
        return -1, []
    if not dockerfile:
        # Nothing to compare against; avoid dividing by zero below.
        return 0, {}
    api_url = get_api_url(repo['url'])
    documents = [_get_file_contents(api_url, path, repo['ref']).split('\n') for path in repo['relevant_docs']]

    # step 3: find dockerfile lines that occur in documentation
    matched_lines = {}
    for doc, doc_name in zip(documents, repo['relevant_docs']):
        for line in doc:
            matches = get_close_matches(line, dockerfile, cutoff=0.8)
            if len(matches) > 0:
                # Only the best fuzzy match is credited.
                matched_lines.setdefault(matches[0], []).append((line, doc_name))
            else:
                # Fall back to substring search so a command embedded in a
                # longer documentation line still counts.
                for df_line in dockerfile:
                    if df_line in line:
                        matched_lines.setdefault(df_line, []).append((line, doc_name))

    return (len(matched_lines) / len(dockerfile), matched_lines)

In [13]:
# Score every repo of every set and store both scores on the record itself.
for repo_set in REPO_SETS:
    for repo in repos[repo_set]:
        print(repo['url'])
        docs_listing = ', '.join(repo['relevant_docs'])
        print(" - " + docs_listing)

        # calc_informativity returns (score, matches); only the score is stored.
        info = calc_informativity(repo, repo_set)
        repo['info'] = info[0]
        print(f" - informativity = {info[0]}")

        repo['visivility'] = visivility = calc_visbility(repo)
        print(f" - visibility = {visivility}")

https://github.com/mandarons/icloud-drive-docker.git
 - 
path resources/working_dockerfiles/20k+/icloud-drive-docker.dockerfile does not exist
 - informativity = -2
 - visibility = 0
https://github.com/typeddjango/django-stubs.git
 - CONTRIBUTING.md
path resources/working_dockerfiles/20k+/django-stubs.dockerfile does not exist
 - informativity = -2
 - visibility = 1.0
https://github.com/Pennyw0rth/NetExec.git
 - tests/README.md
path resources/working_dockerfiles/20k+/NetExec.dockerfile does not exist
 - informativity = -2
 - visibility = 0.5
https://github.com/CVHub520/X-AnyLabeling.git
 - docs/en/get_started.md
path resources/working_dockerfiles/20k+/X-AnyLabeling.dockerfile does not exist
 - informativity = -2
 - visibility = 0.3333333333333333
https://github.com/open-compass/opencompass.git
 - README.md, docs/en/get_started/installation.md
path resources/working_dockerfiles/20k+/opencompass.dockerfile does not exist
 - informativity = -2
 - visibility = 0.2
https://github.com/SciPhi

In [65]:
# Write the annotated records back over the files they were loaded from.
for repo_set in REPO_SETS:
    # Serialize first: opening with 'w' truncates the file immediately, so an
    # error raised while encoding would otherwise destroy the source data.
    serialized = json.dumps(repos[repo_set])
    with open(repo_set, 'w') as f:
        f.write(serialized)