In [None]:
import requests
import re
import time
import pandas as pd
from tqdm import tqdm  # For progress tracking

# OpenRouter chat-completions endpoint; summarization requests are POSTed to this URL
API_URL = "https://openrouter.ai/api/v1/chat/completions"

In [None]:
# Read the summarized dataset and keep only the rows at positions 6100..6799
test_df = pd.read_csv('summarized_java_code_8000_rows_gimini.csv').iloc[6100:6800]
test_df

In [None]:
# Keep only the rows that do not have a summary yet
test_df = test_df.loc[test_df['Summaries'].isna()]

In [None]:
# Remove the 'Summaries' column from the frame before summaries are generated again
test_df = test_df.drop(columns='Summaries')
test_df

Unnamed: 0,func_code_string,func_documentation_string
6100,@Override\n public void memberRemoved(final M...,Removes the node map entry.
6101,@Override\n public String electNewLockManager...,Elects a new server as coordinator. The electi...
6102,protected static OObjectDatabaseTx getDatabase...,Gets the current thread database as a ODatabas...
6103,public String toCreateIndexDDL(final String in...,{@inheritDoc}\n\n@param indexName\n@param inde...
6104,protected void init(final Configuration config...,Builds a OrientGraph instance passing a config...
...,...,...
6795,@Override\n @Deprecated\n public <REC extend...,{@inheritDoc}
6796,public OCommandRequest command(final OCommandR...,{@inheritDoc}
6797,public <RET extends List<?>> RET query(final O...,{@inheritDoc}
6798,@Override\n public long countClusterElements(...,{@inheritDoc}


In [None]:
import requests
import time
import pandas as pd
from tqdm import tqdm
import itertools

# OpenRouter API keys to rotate between. These are placeholders: substitute real
# keys locally and do NOT expose real keys in public repos.
API_KEYS = [
    "<YOUR_API_KEY_1>",
    "<YOUR_API_KEY_2>",
    "<YOUR_API_KEY_3>",
    "<YOUR_API_KEY_4>",
    "<YOUR_API_KEY_5>",
    "<YOUR_API_KEY_6>",
]

# Chat-completions endpoint used for every request
API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Per-key bookkeeping: how many requests have been issued with each key so far
api_key_usage = dict.fromkeys(API_KEYS, 0)
# Keys still eligible for use; an independent copy so API_KEYS itself is never mutated
active_keys = list(API_KEYS)

# Pick the first key, in list order, that is still under its request quota
def get_next_api_key():
    """Return the first key in ``active_keys`` used fewer than 150 times.

    Reads the module-level ``active_keys`` list and ``api_key_usage`` counters
    without modifying either.

    Returns:
        The first eligible key, or ``None`` when no key has quota left.
    """
    return next(
        (candidate for candidate in active_keys if api_key_usage[candidate] < 150),
        None,
    )

# Function to summarize a single Java function using a specific API key
def summarize_single_code(code, api_key, timeout=60):
    """Request a three-sentence summary of one Java function from the chat API.

    Args:
        code: Source text of the Java function; it is embedded verbatim in the prompt.
        api_key: API key sent as the ``Bearer`` token in the Authorization header.
        timeout: Seconds to wait for the HTTP request before giving up. Without a
            timeout a single stalled connection would block a batch run forever.

    Returns:
        The stripped summary text, or ``None`` when the request fails, the
        response contains no usable text, or the text looks generic (it contains
        "Java code" or has fewer than 10 words).
    """
    data = {
        "model": "deepseek/deepseek-r1-distill-llama-70b:free",
        "messages": [
            {
                "role": "user",
                "content": (
                    "Analyze the following Java function and provide an accurate, standalone summary in exactly three sentences. "
                    "Ensure the summary precisely describes this function’s purpose, parameters, and key functionality "
                    "without including any unrelated information. The summary should be concise but detailed enough to reflect "
                    "the function's core operations.\n\n"
                    f"Function:\n```java\n{code}\n```"
                )
            }
        ]
    }

    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    try:
        response = requests.post(API_URL, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()

        # The payload may carry an empty/missing 'choices' list or a null
        # 'content'; treat all of those as "no summary" instead of relying on an
        # IndexError/AttributeError being swallowed further down.
        choices = response.json().get('choices') or [{}]
        message = choices[0].get('message') or {}
        summary_text = (message.get('content') or "").strip()

        # Check if the response is generic
        if not summary_text or "Java code" in summary_text or len(summary_text.split()) < 10:
            print("Warning: Possible generic response, needs manual review.")
            return None

        return summary_text

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None
    except Exception as e:
        # Deliberate best-effort: any other failure (timeout, connection error,
        # malformed JSON) is reported and the row is left without a summary.
        print(f"An error occurred: {e}")
        return None

# Function to distribute summarization across API keys
def summarize_code_df(df, delay=1):
    """Summarize each row's Java function and return a copy with a 'Summaries' column.

    Rows are processed in order. Each request uses the key returned by
    ``get_next_api_key()``; the module-level ``api_key_usage`` counter is
    incremented per request and a key is removed from ``active_keys`` once it
    has been used 150 times.

    Args:
        df: DataFrame with a 'func_code_string' column. It is not modified.
        delay: Seconds to sleep after each request to avoid hitting rate limits.

    Returns:
        A copy of ``df`` with a 'Summaries' column. Rows whose request failed,
        and rows that were never reached because every key ran out of quota,
        hold ``None``.
    """
    df = df.copy()
    summaries = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Functions"):
        api_key = get_next_api_key()  # Get an API key with quota left

        if api_key is None:
            print("All API keys exhausted. Stopping execution.")
            break  # Stop processing if all keys are used up

        summary = summarize_single_code(str(row['func_code_string']), api_key)
        summaries.append(summary)
        api_key_usage[api_key] += 1  # Track usage (failed requests count too)

        # If key reaches 150 requests, remove it from active keys; the membership
        # check keeps this safe if the key has already been removed.
        if api_key_usage[api_key] >= 150 and api_key in active_keys:
            active_keys.remove(api_key)

        time.sleep(delay)  # Avoid hitting rate limits

    # When the loop stops early the list is shorter than the frame; pad the
    # unprocessed rows with None so the column assignment cannot fail with a
    # length mismatch and the summaries already fetched are kept.
    summaries.extend([None] * (len(df) - len(summaries)))

    df['Summaries'] = summaries
    return df


In [None]:
# Summarize every row of test_df (one API request per row, followed by a 1-second
# pause); the returned copy gains a 'Summaries' column.
# NOTE(review): an earlier comment asked for a DataFrame with at least 600 rows;
# the reason is not evident from the code, so confirm whether that still applies.
test_df = summarize_code_df(test_df)

# Display the DataFrame with summaries
test_df

Processing Functions:   6%|▌         | 42/700 [09:47<2:41:36, 14.74s/it]



Processing Functions:   6%|▋         | 45/700 [10:25<2:27:58, 13.56s/it]



Processing Functions:   7%|▋         | 52/700 [11:56<2:21:25, 13.09s/it]



Processing Functions:  16%|█▋        | 114/700 [28:36<2:40:42, 16.45s/it]



Processing Functions:  22%|██▏       | 154/700 [38:46<2:13:43, 14.70s/it]



Processing Functions:  24%|██▎       | 165/700 [40:23<1:09:33,  7.80s/it]



Processing Functions:  24%|██▎       | 166/700 [40:25<55:01,  6.18s/it]  



Processing Functions:  25%|██▌       | 177/700 [42:54<2:03:48, 14.20s/it]



Processing Functions:  26%|██▌       | 182/700 [43:39<1:18:20,  9.07s/it]



Processing Functions:  29%|██▉       | 202/700 [48:28<1:24:55, 10.23s/it]



Processing Functions:  33%|███▎      | 231/700 [54:37<1:47:29, 13.75s/it]



Processing Functions:  34%|███▎      | 235/700 [55:19<1:31:08, 11.76s/it]



Processing Functions:  46%|████▋     | 325/700 [1:14:06<2:14:13, 21.48s/it]



Processing Functions:  48%|████▊     | 337/700 [1:17:40<2:21:21, 23.36s/it]



Processing Functions:  50%|█████     | 350/700 [1:20:23<1:29:00, 15.26s/it]



Processing Functions:  52%|█████▏    | 362/700 [1:22:29<57:44, 10.25s/it]  



Processing Functions:  53%|█████▎    | 372/700 [1:24:54<1:08:15, 12.49s/it]



Processing Functions:  61%|██████    | 424/700 [1:37:21<43:26,  9.44s/it]



Processing Functions:  62%|██████▏   | 435/700 [1:39:26<53:34, 12.13s/it]  



Processing Functions:  68%|██████▊   | 475/700 [1:48:01<39:12, 10.46s/it]



Processing Functions:  68%|██████▊   | 479/700 [1:48:57<46:16, 12.56s/it]



Processing Functions:  69%|██████▉   | 483/700 [1:49:35<36:32, 10.10s/it]



Processing Functions:  70%|███████   | 493/700 [1:52:05<57:36, 16.70s/it]



Processing Functions:  72%|███████▏  | 501/700 [1:54:06<50:40, 15.28s/it]



Processing Functions:  73%|███████▎  | 513/700 [1:56:26<44:47, 14.37s/it]



Processing Functions:  76%|███████▌  | 533/700 [2:00:03<23:52,  8.58s/it]



Processing Functions:  76%|███████▋  | 535/700 [2:00:30<33:36, 12.22s/it]



Processing Functions:  77%|███████▋  | 539/700 [2:01:45<47:04, 17.54s/it]



Processing Functions:  79%|███████▉  | 553/700 [2:05:03<33:01, 13.48s/it]



Processing Functions:  80%|███████▉  | 559/700 [2:06:36<41:00, 17.45s/it]



Processing Functions:  86%|████████▌ | 602/700 [2:17:27<22:46, 13.94s/it]



Processing Functions:  88%|████████▊ | 617/700 [2:21:24<15:01, 10.87s/it]



Processing Functions:  91%|█████████ | 636/700 [2:26:34<19:17, 18.08s/it]



Processing Functions:  92%|█████████▏| 641/700 [2:27:35<13:26, 13.67s/it]



Processing Functions:  95%|█████████▍| 663/700 [2:32:15<08:11, 13.28s/it]



Processing Functions:  95%|█████████▌| 668/700 [2:33:26<07:17, 13.67s/it]



Processing Functions:  96%|█████████▌| 670/700 [2:33:41<05:18, 10.63s/it]



Processing Functions:  99%|█████████▉| 693/700 [2:39:29<01:13, 10.54s/it]



Processing Functions: 100%|██████████| 700/700 [2:40:54<00:00, 13.79s/it]


Unnamed: 0,func_code_string,func_documentation_string,Summaries
6100,@Override\n public void memberRemoved(final M...,Removes the node map entry.,"This function, `memberRemoved`, is designed to..."
6101,@Override\n public String electNewLockManager...,Elects a new server as coordinator. The electi...,"This function, `electNewLockManager()`, is des..."
6102,protected static OObjectDatabaseTx getDatabase...,Gets the current thread database as a ODatabas...,The function `getDatabase()` retrieves the cur...
6103,public String toCreateIndexDDL(final String in...,{@inheritDoc}\n\n@param indexName\n@param inde...,"This Java function, `toCreateIndexDDL`, genera..."
6104,protected void init(final Configuration config...,Builds a OrientGraph instance passing a config...,The function `init` initializes various config...
...,...,...,...
6795,@Override\n @Deprecated\n public <REC extend...,{@inheritDoc},"The `browseCluster` method, which is deprecate..."
6796,public OCommandRequest command(final OCommandR...,{@inheritDoc},The function `command` performs security check...
6797,public <RET extends List<?>> RET query(final O...,{@inheritDoc},The `query` function executes a query using th...
6798,@Override\n public long countClusterElements(...,{@inheritDoc},The function `countClusterElements` calculates...


In [None]:
# Save the summarized DataFrame to CSV and hand the file to google.colab's
# files.download (this import is only available inside Google Colab)
from google.colab import files

# index=False: the DataFrame index is not written to the file
test_df.to_csv("deep_seek_6100_to_6800_with_summaries.csv", index=False)
files.download("deep_seek_6100_to_6800_with_summaries.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved results so rows without a summary can be retried
merged_deep_seek_summaries = pd.read_csv("deep_seek_6100_to_6800_with_summaries.csv")

# Select the rows whose 'Summaries' value is still missing
missing_summaries_df = merged_deep_seek_summaries.loc[
    merged_deep_seek_summaries['Summaries'].isnull()
]

# Retry only when there is at least one such row
if len(missing_summaries_df.index) > 0:
    updated_summaries_df = summarize_code_df(missing_summaries_df)

    # Write the new non-missing values back into the full frame (aligned on index)
    merged_deep_seek_summaries.update(updated_summaries_df)

    # Overwrite the CSV with the merged result
    merged_deep_seek_summaries.to_csv("deep_seek_6100_to_6800_with_summaries.csv", index=False)