# Doc and code similarity experiment

## 1. Install inspect4py

In [1]:
!pip3 install inspect4py



In [2]:
# Check that the inspect4py command-line tool is callable from the shell and
# record which version the rest of the notebook runs against.
!inspect4py --version

inspect4py, version 0.0.8


## 2. Download the dataset repositories and run `inspect4py` on them

In [3]:
# Repositories under study, grouped by their hand-assigned topic label.
# Both the topic order and the order inside each topic are significant: REPOS
# keeps this insertion order and later cells iterate over it.
_REPOS_BY_TOPIC = {
    'Algorithms': [
        'keon/algorithms',
        'prabhupant/python-ds',
        'grantjenks/python-sortedcontainers',
        'TheAlgorithms/Python',
    ],
    'Audio': [
        'beetbox/audioread',
        'worldveil/dejavu',
        'keunwoochoi/kapre',
        'librosa/librosa',
        'sergree/matchering',
        'tyiannak/pyAudioAnalysis',
        'jiaaro/pydub',
        'Parisson/TimeSide',
    ],
    'OAuth': [
        'lepture/authlib',
        'pennersr/django-allauth',
        'evonove/django-oauth-toolkit',
        'idan/oauthlib',
        'joestump/python-oauth2',
        'omab/python-social-auth',
    ],
    'Cryptography': [
        'paramiko/paramiko',
        'pyca/pynacl',
    ],
    'Downloader': [
        'jindaxiang/akshare',
        's3tools/s3cmd',
        'bloomreach/s4cmd',
    ],
    'PDF': [
        'euske/pdfminer',
        'mstamy2/PyPDF2',
    ],
    'Markdown': [
        'lepture/mistune',
        'waylan/Python-Markdown',
    ],
}

# Flat "owner/name" -> topic lookup used by the rest of the notebook.
REPOS = {
    repo: topic
    for topic, repos in _REPOS_BY_TOPIC.items()
    for repo in repos
}

In [4]:
!mkdir -p output
for repo in REPOS:
    !mkdir -p {repo} & & git clone {f"https://github.com/{repo}.git"} {repo}
    !inspect4py -i {repo} -o output/ {repo} -sc -rm

Cloning into 'keon/algorithms'...
remote: Enumerating objects: 5162, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 5162 (delta 11), reused 16 (delta 3), pack-reused 5136[K
Receiving objects: 100% (5162/5162), 1.42 MiB | 3.58 MiB/s, done.
Resolving deltas: 100% (3230/3230), done.
Creating jsDir:output/keon/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/tree/json_files
Error when processing invert_tree.py:  (<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute '_fields'"), <traceback object at 0x10291db80>)
Error when processing longest_consecutive.py:  (<class 'AttributeError'>, AttributeError("'NoneType' object has no attribute '_fields'"), <traceback object at 0x102a31780>)
Error when processing deepest_left.py:  (<class 'AttributeError'>, AttributeError("'

## 3. Extract docstrings and source code from the repositories

In [1]:
import json


def funcs_to_lists(funcs, func_codes, docs):
    """Collect source code and docstring text from a mapping of functions.

    Both output lists are extended in place; nothing is returned.

    Args:
        funcs: Mapping of function name to an info dict. The optional keys
            "source_code" and "doc" are read from each info dict.
        func_codes: List that receives every non-None "source_code" value.
        docs: List that receives one "<function name> <doc text>" string per
            function that has usable documentation.
    """
    # Preference order: take the first of these doc fields that is not None.
    doc_keys = ("full", "long_description", "short_description")

    for name in funcs:
        info = funcs[name]

        source = info.get("source_code")
        if source is not None:
            func_codes.append(source)

        doc = info.get("doc")
        if doc is None:
            continue

        text = next((doc.get(k) for k in doc_keys if doc.get(k) is not None), None)
        if text is not None:
            docs.append(f"{name} {text}")


def file_to_lists(filename):
    """Read a JSON dump and gather all function/method code and docstrings.

    The notebook passes the ``directory_info.json`` file found under each
    repository's output directory.

    Args:
        filename: Path of the JSON file to load.

    Returns:
        A ``(func_codes, docs)`` tuple of lists, filled by ``funcs_to_lists``
        for every file's "functions" and for the "methods" of every class.
    """
    with open(filename, "r") as handle:
        repo_dump = json.load(handle)

    # "readme_files" is dropped so that every remaining top-level value can be
    # iterated as a list of per-file dicts.
    repo_dump.pop("readme_files", None)

    func_codes, docs = [], []
    for files in repo_dump.values():
        for file_info in files:
            functions = file_info.get("functions")
            if functions is not None:
                funcs_to_lists(functions, func_codes, docs)

            classes = file_info.get("classes")
            if classes is None:
                continue
            for class_info in classes.values():
                methods = class_info.get("methods")
                if methods is not None:
                    funcs_to_lists(methods, func_codes, docs)

    return func_codes, docs

In [7]:
# Per-repository record: extracted docstrings, extracted function/method
# source code, and the topic label from REPOS.
repo_info = {}
for repo, topic in REPOS.items():
    function_list, docstring_list = file_to_lists(f"output/{repo}/directory_info.json")
    repo_info[repo] = {
        "docs": docstring_list,
        "funcs": function_list,
        "topic": topic,
    }

## 4. Download pre-trained model

In [14]:
!pip3 install sentence-transformers
!pip3 install transformers
!curl -o unixcoder.py https: // raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10352  100 10352    0     0   198k      0 --:--:-- --:--:-- --:--:--  215k


## 5. Generate embeddings for all repositories

In [16]:
import torch
from unixcoder import UniXcoder
from sentence_transformers import SentenceTransformer

# Pick the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# doc_model embeds docstrings; code_model embeds source code. Both are placed
# on the device chosen above.
doc_model = SentenceTransformer("all-mpnet-base-v2", device=device)
code_model = UniXcoder("Lazyhope/unixcoder-nine-advtest")
code_model.to(device)


def get_code_embeddings(code):
    """Embed a single code snippet with the module-level ``code_model``.

    Args:
        code: Source code string; it is tokenized as a one-element batch with
            ``max_length=512`` in ``"<encoder-only>"`` mode.

    Returns:
        The second of the two values returned by ``code_model``.
        NOTE(review): presumably the snippet-level embedding with one row per
        input (the caller concatenates these and averages over dim 0) -
        confirm against unixcoder.py.
    """
    token_batch = code_model.tokenize([code], max_length=512, mode="<encoder-only>")
    id_tensor = torch.tensor(token_batch).to(device)

    # The model returns a pair; only the second element is kept.
    _unused, snippet_embedding = code_model(id_tensor)
    return snippet_embedding


def get_repo_embeddings(lst, input_type):
    """Average the embeddings of every item in ``lst`` into a single tensor.

    Args:
        lst: Code snippets (for ``input_type="code"``) or docstring texts
            (for ``input_type="doc"``).
        input_type: ``"code"`` embeds each item with ``get_code_embeddings``;
            ``"doc"`` encodes the whole list with ``doc_model``.

    Returns:
        The mean over dim 0 of the stacked embeddings, or ``None`` when
        ``lst`` is empty so callers can tell "nothing to embed" apart.

    Raises:
        ValueError: If ``input_type`` is neither ``"code"`` nor ``"doc"``.
    """
    if not lst:
        return None

    # Fail with a clear message instead of an UnboundLocalError further down.
    if input_type not in ("code", "doc"):
        raise ValueError(
            f"input_type must be 'code' or 'doc', got {input_type!r}"
        )

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        if input_type == "code":
            # Each call yields one tensor; they are joined along dim 0.
            embeddings_list = torch.cat([get_code_embeddings(code) for code in lst])
        else:
            embeddings_list = doc_model.encode(lst, convert_to_tensor=True)

        mean_embeddings = torch.mean(embeddings_list, dim=0)

    return mean_embeddings

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/444k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

In [17]:
from tqdm import tqdm

# Fill in the code and docstring embeddings of every repository. An entry that
# already holds a non-None value is kept, so re-running the cell only computes
# what is still missing.
for repo_name, repo_dict in tqdm(repo_info.items()):
    print(f" - Generating embeddings for {repo_name} - ")
    for target_key, source_key, kind in (
        ("code_embeddings", "funcs", "code"),
        ("doc_embeddings", "docs", "doc"),
    ):
        if repo_dict.get(target_key) is None:
            repo_dict[target_key] = get_repo_embeddings(repo_dict[source_key], input_type=kind)

  0%|          | 0/27 [00:00<?, ?it/s]

 - Generating embeddings for keon/algorithms - 


  4%|▎         | 1/27 [02:17<59:25, 137.15s/it]

 - Generating embeddings for prabhupant/python-ds - 


  7%|▋         | 2/27 [02:42<29:48, 71.53s/it] 

 - Generating embeddings for grantjenks/python-sortedcontainers - 


 11%|█         | 3/27 [03:04<19:34, 48.92s/it]

 - Generating embeddings for TheAlgorithms/Python - 


 15%|█▍        | 4/27 [10:32<1:19:05, 206.31s/it]

 - Generating embeddings for beetbox/audioread - 


 19%|█▊        | 5/27 [10:40<49:25, 134.81s/it]  

 - Generating embeddings for worldveil/dejavu - 


 22%|██▏       | 6/27 [10:52<32:38, 93.24s/it] 

 - Generating embeddings for keunwoochoi/kapre - 


 26%|██▌       | 7/27 [11:08<22:35, 67.76s/it]

 - Generating embeddings for librosa/librosa - 


 30%|██▉       | 8/27 [12:46<24:30, 77.37s/it]

 - Generating embeddings for sergree/matchering - 


 33%|███▎      | 9/27 [12:54<16:42, 55.68s/it]

 - Generating embeddings for tyiannak/pyAudioAnalysis - 


 37%|███▋      | 10/27 [13:19<13:08, 46.37s/it]

 - Generating embeddings for jiaaro/pydub - 


 41%|████      | 11/27 [14:40<15:08, 56.79s/it]

 - Generating embeddings for Parisson/TimeSide - 


 44%|████▍     | 12/27 [16:07<16:31, 66.12s/it]

 - Generating embeddings for lepture/authlib - 


 48%|████▊     | 13/27 [18:55<22:36, 96.90s/it]

 - Generating embeddings for pennersr/django-allauth - 


 52%|█████▏    | 14/27 [20:46<21:55, 101.18s/it]

 - Generating embeddings for evonove/django-oauth-toolkit - 


 56%|█████▌    | 15/27 [22:05<18:54, 94.57s/it] 

 - Generating embeddings for idan/oauthlib - 


 59%|█████▉    | 16/27 [23:39<17:17, 94.31s/it]

 - Generating embeddings for joestump/python-oauth2 - 


 63%|██████▎   | 17/27 [24:06<12:21, 74.20s/it]

 - Generating embeddings for omab/python-social-auth - 


 67%|██████▋   | 18/27 [24:26<08:39, 57.75s/it]

 - Generating embeddings for paramiko/paramiko - 


 70%|███████   | 19/27 [25:34<08:08, 61.00s/it]

 - Generating embeddings for pyca/pynacl - 


 74%|███████▍  | 20/27 [26:41<07:19, 62.79s/it]

 - Generating embeddings for jindaxiang/akshare - 


 78%|███████▊  | 21/27 [29:25<09:18, 93.08s/it]

 - Generating embeddings for s3tools/s3cmd - 


 81%|████████▏ | 22/27 [29:51<06:05, 73.02s/it]

 - Generating embeddings for bloomreach/s4cmd - 


 85%|████████▌ | 23/27 [29:53<03:26, 51.58s/it]

 - Generating embeddings for euske/pdfminer - 


 89%|████████▉ | 24/27 [29:56<01:51, 37.06s/it]

 - Generating embeddings for mstamy2/PyPDF2 - 


 93%|█████████▎| 25/27 [31:01<01:30, 45.37s/it]

 - Generating embeddings for lepture/mistune - 


 96%|█████████▋| 26/27 [31:55<00:48, 48.10s/it]

 - Generating embeddings for waylan/Python-Markdown - 


100%|██████████| 27/27 [33:46<00:00, 75.06s/it]


## 6. Similarity calculation

In [18]:
from torch.nn import CosineSimilarity
from itertools import combinations

cossim = CosineSimilarity(dim=0, eps=1e-8)


def _pair_similarity(first, second, key):
    """Cosine similarity of two repositories' embeddings stored under ``key``.

    Returns None when either repository has no embedding for that key.
    """
    vec_a = repo_info[first][key]
    vec_b = repo_info[second][key]
    if vec_a is None or vec_b is None:
        return None
    return cossim(vec_a, vec_b).cpu().detach().numpy().item()


num_of_repos = len(repo_info)
# Number of unordered pairs, n * (n - 1) / 2; only used as the progress-bar total.
num_of_rows = num_of_repos * (num_of_repos - 1) // 2

# One tuple per pair: (repo1, repo2, topic1, topic2, code similarity, doc similarity).
res = []
for repo1, repo2 in tqdm(combinations(repo_info.keys(), 2), total=num_of_rows):
    res.append((
        repo1,
        repo2,
        repo_info[repo1]["topic"],
        repo_info[repo2]["topic"],
        _pair_similarity(repo1, repo2, "code_embeddings"),
        _pair_similarity(repo1, repo2, "doc_embeddings"),
    ))

100%|██████████| 351/351 [00:01<00:00, 208.58it/s]


In [19]:
import pandas as pd

sim_columns = ["code_sim", "doc_sim"]

# One row per repository pair. avg_sim is the row-wise mean of the two
# similarity columns; a similarity that is missing (the repository had no code
# or no docstrings) shows up as NaN and is skipped, so avg_sim then equals the
# similarity that is present.
df = pd.DataFrame(
    res,
    columns=["repo1", "repo2", "topic1", "topic2"] + sim_columns,
).assign(avg_sim=lambda frame: frame[sim_columns].mean(axis=1, skipna=True))
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,keon/algorithms,prabhupant/python-ds,Algorithms,Algorithms,0.776927,0.822550,0.799739
1,keon/algorithms,grantjenks/python-sortedcontainers,Algorithms,Algorithms,0.727400,0.707484,0.717442
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897334,0.846333,0.871834
3,keon/algorithms,beetbox/audioread,Algorithms,Audio,0.009483,0.279662,0.144572
4,keon/algorithms,worldveil/dejavu,Algorithms,Audio,0.176745,0.410920,0.293833
...,...,...,...,...,...,...,...
346,euske/pdfminer,lepture/mistune,PDF,Markdown,0.297969,0.251666,0.274818
347,euske/pdfminer,waylan/Python-Markdown,PDF,Markdown,0.305368,0.266002,0.285685
348,mstamy2/PyPDF2,lepture/mistune,PDF,Markdown,0.331682,0.438587,0.385135
349,mstamy2/PyPDF2,waylan/Python-Markdown,PDF,Markdown,0.525616,0.608366,0.566991


In [20]:
# Sort the table based on code semantic similarity, highest first.
# The sorted copy is only displayed; `df` itself is not modified here.
df.sort_values("code_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.936373,0.952218,0.944295
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.919535,0.871041,0.895288
2,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897334,0.846333,0.871834
3,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.889849,0.813740,0.851794
4,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.880004,0.884434,0.882219
...,...,...,...,...,...,...,...
346,tyiannak/pyAudioAnalysis,pennersr/django-allauth,Audio,OAuth,-0.127913,0.133521,0.002804
347,sergree/matchering,pennersr/django-allauth,Audio,OAuth,-0.128052,,-0.128052
348,jindaxiang/akshare,s3tools/s3cmd,Downloader,Downloader,-0.128295,0.174601,0.023153
349,librosa/librosa,pennersr/django-allauth,Audio,OAuth,-0.134422,0.117186,-0.008618


In [21]:
# Sort the table based on docstring semantic similarity, highest first.
# The sorted copy is only displayed; `df` itself is not modified here.
df.sort_values("doc_sim", ascending=False).reset_index(drop=True)

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.936373,0.952218,0.944295
1,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.880004,0.884434,0.882219
2,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.919535,0.871041,0.895288
3,lepture/authlib,joestump/python-oauth2,OAuth,OAuth,0.835101,0.849447,0.842274
4,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897334,0.846333,0.871834
...,...,...,...,...,...,...,...
346,sergree/matchering,bloomreach/s4cmd,Audio,Downloader,0.003938,,0.003938
347,sergree/matchering,euske/pdfminer,Audio,PDF,0.146911,,0.146911
348,sergree/matchering,mstamy2/PyPDF2,Audio,PDF,0.192575,,0.192575
349,sergree/matchering,lepture/mistune,Audio,Markdown,0.021387,,0.021387


In [22]:
# Sort the table based on average similarity, highest first, and keep the
# sorted frame by re-binding `df` (nothing is written to disk in this cell).
df = df.sort_values("avg_sim", ascending=False).reset_index(drop=True)
df

Unnamed: 0,repo1,repo2,topic1,topic2,code_sim,doc_sim,avg_sim
0,lepture/authlib,idan/oauthlib,OAuth,OAuth,0.936373,0.952218,0.944295
1,evonove/django-oauth-toolkit,idan/oauthlib,OAuth,OAuth,0.919535,0.871041,0.895288
2,idan/oauthlib,joestump/python-oauth2,OAuth,OAuth,0.880004,0.884434,0.882219
3,keon/algorithms,TheAlgorithms/Python,Algorithms,Algorithms,0.897334,0.846333,0.871834
4,lepture/authlib,evonove/django-oauth-toolkit,OAuth,OAuth,0.889849,0.813740,0.851794
...,...,...,...,...,...,...,...
346,jindaxiang/akshare,bloomreach/s4cmd,Downloader,Downloader,-0.171228,0.085705,-0.042762
347,sergree/matchering,omab/python-social-auth,Audio,OAuth,-0.048035,,-0.048035
348,sergree/matchering,evonove/django-oauth-toolkit,Audio,OAuth,-0.058744,,-0.058744
349,sergree/matchering,s3tools/s3cmd,Audio,Downloader,-0.062717,,-0.062717
