# Graph creation

In [4]:
%load_ext autoreload
%autoreload 2
import requests
import os
from dotenv import load_dotenv
from datetime import datetime
import time
import pandas as pd
from datasets import load_dataset
import sys
import json
load_dotenv()


from utils import get_sql_query

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Base URL of the graphs API; swap in the local URL when testing against a dev server.
GRAPHS_URL = "https://graphs-api.codegpt.co" # http://localhost:8002
# Bearer token for the API; raises KeyError right away if the variable is not set.
SECRET_KEY = os.environ['SECRET_KEY']

# Payload skeleton for graph-creation requests. The repo_org / repo_name / branch
# placeholders are overwritten per repository before each request is sent.
data_template = {
    "git_provider": "github",
    "repo_org": "myOrg",
    "repo_name": "myRepo",
    "branch": "myBranch",
    "connection_id": os.environ['CONNECTION_ID'],
    "generate_documentation": True,
}

# TEST RESPONSE
# requests never times out on its own, so bound the health check explicitly;
# otherwise an unreachable API would hang the notebook on this cell.
response = requests.get(GRAPHS_URL, timeout=30)
# Surface HTTP errors directly instead of failing later while decoding the body.
response.raise_for_status()
response.json()

{'message': 'Hello, World!'}

In [6]:
# SWE-bench Lite is the benchmark used here; the download is cached under ../data.
swe_ds = load_dataset("princeton-nlp/SWE-bench_Lite", cache_dir="../data")

# Only the 'test' split is turned into a DataFrame for the rest of the notebook.
swe_test_split = swe_ds['test']
swe_df = pd.DataFrame(swe_test_split)

print(swe_df.shape)
swe_df.head()

Generating dev split: 100%|██████████| 23/23 [00:00<00:00, 397.61 examples/s]
Generating test split: 100%|██████████| 300/300 [00:00<00:00, 12804.30 examples/s]


(300, 12)


Unnamed: 0,repo,instance_id,base_commit,patch,test_patch,problem_statement,hints_text,created_at,version,FAIL_TO_PASS,PASS_TO_PASS,environment_setup_commit
0,astropy/astropy,astropy__astropy-12907,d16bfe05a744909de4b27f5875fe0d4ed41ce607,diff --git a/astropy/modeling/separable.py b/a...,diff --git a/astropy/modeling/tests/test_separ...,Modeling's `separability_matrix` does not comp...,,2022-03-03T15:14:54Z,4.3,"[""astropy/modeling/tests/test_separable.py::te...","[""astropy/modeling/tests/test_separable.py::te...",298ccb478e6bf092953bca67a3d29dc6c35f6752
1,astropy/astropy,astropy__astropy-14182,a5917978be39d13cd90b517e1de4e7a539ffaa48,diff --git a/astropy/io/ascii/rst.py b/astropy...,diff --git a/astropy/io/ascii/tests/test_rst.p...,Please support header rows in RestructuredText...,,2022-12-16T11:13:37Z,5.1,"[""astropy/io/ascii/tests/test_rst.py::test_rst...","[""astropy/io/ascii/tests/test_rst.py::test_rea...",5f74eacbcc7fff707a44d8eb58adaa514cb7dcb5
2,astropy/astropy,astropy__astropy-14365,7269fa3e33e8d02485a647da91a5a2a60a06af61,diff --git a/astropy/io/ascii/qdp.py b/astropy...,diff --git a/astropy/io/ascii/tests/test_qdp.p...,ascii.qdp Table format assumes QDP commands ar...,Welcome to Astropy 👋 and thank you for your fi...,2023-02-06T19:20:34Z,5.1,"[""astropy/io/ascii/tests/test_qdp.py::test_rou...","[""astropy/io/ascii/tests/test_qdp.py::test_get...",5f74eacbcc7fff707a44d8eb58adaa514cb7dcb5
3,astropy/astropy,astropy__astropy-14995,b16c7d12ccbc7b2d20364b89fb44285bcbfede54,diff --git a/astropy/nddata/mixins/ndarithmeti...,diff --git a/astropy/nddata/mixins/tests/test_...,"In v5.3, NDDataRef mask propagation fails when...",Welcome to Astropy 👋 and thank you for your fi...,2023-06-27T19:48:18Z,5.2,"[""astropy/nddata/mixins/tests/test_ndarithmeti...","[""astropy/nddata/mixins/tests/test_ndarithmeti...",362f6df12abf9bd769d4915fabf955c993ea22cf
4,astropy/astropy,astropy__astropy-6938,c76af9ed6bb89bfba45b9f5bc1e635188278e2fa,diff --git a/astropy/io/fits/fitsrec.py b/astr...,diff --git a/astropy/io/fits/tests/test_checks...,Possible bug in io.fits related to D exponents...,It is tested with `astropy/io/fits/tests/test_...,2017-12-07T00:01:14Z,1.3,"[""astropy/io/fits/tests/test_checksum.py::Test...","[""astropy/io/fits/tests/test_checksum.py::Test...",848c8fa21332abd66b44efe3cb48b72377fb32cc


In [14]:
# Fetch every graph owned by the swe-bench user, joined with its repository row.
# NOTE(review): USER_ID is interpolated directly into the SQL text. It comes from the
# local environment here; prefer bound parameters if get_sql_query supports them.
sql = f"""SELECT g.id, g.repo_id, g.status, r.commit_hash, r.repo_org, r.repo_name, r.created_at, r.size
FROM graphs g
JOIN repositories r
ON r.id = g.repo_id
WHERE user_id = '{os.environ['USER_ID']}'
--AND r.language = 'py'
"""

# Labels passed to get_sql_query alongside the query, in SELECT order
# (presumably used as the resulting column names - confirm in utils).
query_columns = ['graph_id', 'repo_id', 'status', 'base_commit', 'repo_org', 'repo_name', 'created_at', 'size']
# Columns kept in the final frame, including the derived "org/name" repo key.
output_columns = ['graph_id', 'repo_id', 'base_commit', 'repo', 'status', 'created_at', 'size']

graphs_raw = get_sql_query(sql, query_columns)
# Build the same "org/name" identifier that the SWE-bench dataset uses in its `repo` column.
graphs_with_repo = graphs_raw.assign(repo=graphs_raw['repo_org'] + '/' + graphs_raw['repo_name'])
db_info = graphs_with_repo[output_columns]

# Keep a snapshot of the current DB state next to the cached dataset.
db_info.to_csv('../data/db_user_info.csv', index=False)
print(db_info.shape)
db_info

(417, 7)


Unnamed: 0,graph_id,repo_id,base_commit,repo,status,created_at,size
0,e89f5a7e-7d80-4de9-9bd2-d866f5b47f82,6742ecc5-d44b-4b8d-b3ac-037ebb641659,cebe3fb299a4ea6787a11d8700a9933cc838bcfb,simdjson/simdjson,completed,2025-07-01 17:20:08.793221+00:00,4.057
1,3bf35b06-1492-4ba0-9d91-510c597c7530,2a408baf-7e8d-4f64-acf6-3f4554376d01,ccea338070c795fd966a4dc08b19268b6fbad5ef,fmtlib/fmt,completed,2025-07-01 17:21:41.768607+00:00,2.545
2,5f078a97-3f2c-4cdb-8bc4-d4d1eebc0882,1a04b2c9-854d-4cc1-a684-fc80751bbb47,22cdfb0c93f8ec78492d87edb810f10cb7f57a31,mwaskom/seaborn,completed,2025-02-25 14:50:39.155791+00:00,1.779
3,5264b99d-336a-432c-b88d-6fcff4166541,487acfb5-8e7d-49a4-8322-1a95344ccd46,a7e38c5c61928033a2dc1915cbee8caa8544a4d0,pytest-dev/pytest,completed,2025-02-26 15:50:52.933833+00:00,2.926
4,18fafc0c-df8b-4051-884c-cd700d801857,695517ce-f198-43bc-9ae4-346c056555cb,b2571883cae4290b4ef0dd0a583d22c14116c4a3,zeromicro/go-zero,completed,2025-07-01 17:22:16.019422+00:00,2.628
...,...,...,...,...,...,...,...
412,37cc23a3-8c3b-4769-b6ca-158ee3975015,97cbf5fc-0a84-40aa-8dd4-52a3b83dce2d,991883df4d5910851130e3dc0e21fcbce604ea7d,darkreader/darkreader,completed,2025-07-01 15:29:50.066593+00:00,0.887
413,b61130d6-aaca-4098-a9d8-be73fb926bb3,dc34834b-b926-4ff8-bef7-6750300476df,2fdbf07978813ff40bea88f9ca9961cece59467c,fasterxml/jackson-core,completed,2025-07-01 15:35:27.942492+00:00,3.970
414,4f3b92d8-dac3-4134-b76a-146b57b77241,9892222f-e368-4382-9773-eea3b94d02bb,74bb7b2533af3f063d0794fe3962fb5226c27751,simdjson/simdjson,completed,2025-07-01 15:26:56.556412+00:00,4.554
415,360cf27c-3590-4737-ae32-7bf681067079,481782a6-29d9-4c8b-af4e-437bf3b87c8a,3ed7f4572534383e54f9fd0d2521131f64283410,fasterxml/jackson-databind,failed,2025-07-01 17:28:07.392314+00:00,8.330


In [15]:
# Number of graphs per processing status
db_info.value_counts(subset='status')

status
completed    415
failed         2
Name: count, dtype: int64

In [19]:
# Number of graphs per (repo, status) pair - the 12 most frequent combinations
db_info.value_counts(subset=['repo', 'status']).head(12)

repo                       status   
django/django              completed    121
sympy/sympy                completed     81
matplotlib/matplotlib      completed     31
scikit-learn/scikit-learn  completed     29
pytest-dev/pytest          completed     26
sphinx-doc/sphinx          completed     24
astropy/astropy            completed     15
pydata/xarray              completed     13
pylint-dev/pylint          completed     13
psf/requests               completed     13
mwaskom/seaborn            completed      6
pallets/flask              completed      4
Name: count, dtype: int64

In [20]:
# Left-join the SWE-bench instances onto the graphs already stored in the DB,
# matching on repository and commit.
swe_keys = swe_df[['repo', 'instance_id', 'base_commit']]
merged_df = swe_keys.merge(db_info, how='left', on=['repo', 'base_commit'])

# Instances whose graph_id is still missing after the join have no graph yet.
# NOTE(review): a graph with status 'failed' still has a graph_id, so its instance
# is not listed here - confirm that this is intended.
has_no_graph = merged_df['graph_id'].isna()
repos = merged_df[has_no_graph]
print("Repos without graph ID:", repos.shape[0])

Repos without graph ID: 0


In [21]:
# The graph creation process uses previous commits as reference to accelerate the
# process, so the pending requests are sent in batches and this notebook is re-run
# once the submitted graphs exist.
# The pending rows are shuffled (unseeded, so the order changes on every run) and
# reduced to one row per instance_id. De-duplication per repository
# (drop_duplicates(subset=['repo'])) is currently not applied.
shuffled_repos = repos.reset_index(drop=True).sample(frac=1)
repos_unique = shuffled_repos.drop_duplicates(subset=['instance_id'])
repos_unique

Unnamed: 0,repo,instance_id,base_commit,graph_id,repo_id,status,created_at,size


In [22]:
# Submit one graph-creation request per pending row of `repos_unique`.
# A possible error here is that the secret_key is not valid or expired.
max_attempts = 1
# Pause between repositories so the API is not flooded with creation jobs.
wait_seconds = 120

# enumerate() gives the real 1-based position; the DataFrame index labels are
# shuffled and therefore useless as a progress counter.
for position, (_, row) in enumerate(repos_unique.iterrows(), start=1):
    repo_org, repo_name = row['repo'].split('/')

    # Fresh copy per repository so the shared template is never mutated.
    data = data_template.copy()
    data['repo_org'] = repo_org
    data['repo_name'] = repo_name
    # The API's "branch" field is given the exact commit the SWE-bench instance is based on.
    data['branch'] = row['base_commit']

    # NOTE(review): no client-side timeout is set on this POST; the 504 branch below
    # suggests the server/gateway enforces its own limit - confirm before adding one.
    for attempt in range(max_attempts):
        response = requests.post(f"{GRAPHS_URL}/v1/repo",
                                 headers={"Authorization": f"Bearer {SECRET_KEY}"},
                                 json=data)
        if response.ok:
            print(row['repo'], response.json())
            break
        if response.status_code == 504: # Timeout, the graph is still being created
            break
        print(response)
    else:
        # Reached only when every attempt failed without a success or a 504.
        # The original check for this lived inside `while attempts < max_attempts`
        # and could never be true, so the message was never printed.
        print(f"Failed to create graph {position} out of {len(repos_unique)}")

    # wait 120 seconds
    time.sleep(wait_seconds)