In [None]:
# 📌 Step 1: Install & Import
!pip install cohere datasets tqdm

import cohere
from datasets import load_dataset, get_dataset_config_names
import json
import os
import time
from tqdm import tqdm

Collecting cohere
  Downloading cohere-5.15.0-py3-none-any.whl.metadata (3.4 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20250328-py3-none-any.whl.metadata (2.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1

In [None]:
# 🔐 Step 2: Set Up Cohere Client
# The API key must never be written into the notebook itself (it would be saved
# and shared with the file). Read it from the CO_API_KEY environment variable
# and fall back to a hidden interactive prompt when the variable is not set.
import os
from getpass import getpass

cohere_api_key = (os.environ.get("CO_API_KEY") or "").strip()
if not cohere_api_key:
    cohere_api_key = getpass("Cohere API key: ").strip()
if not cohere_api_key:
    # Fail here with a clear message instead of at the first API request.
    raise ValueError("A Cohere API key is required (set CO_API_KEY or enter it at the prompt).")

client = cohere.ClientV2(cohere_api_key)

In [None]:
# 🔐 Step 3: Set Up the folder
# Mount Google Drive at /content/drive. The google.colab package is imported
# here, so this cell is meant to be run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory: the following cells build their
# output paths from os.getcwd(), so summaries and logs end up in this folder.
# Change the path to match your own Google Drive structure.
%cd /content/drive/MyDrive/CodeClarity

In [None]:
# 🔐 Step 4: Smoke-test the Cohere API and the output folder
import cohere
import os

# Outputs live in a "summaries" folder under the current working directory;
# create it up front so the write below cannot fail on a missing folder.
output_dir = os.path.join(os.getcwd(), "summaries")
os.makedirs(output_dir, exist_ok=True)
test_file_path = os.path.join(output_dir, "test.txt")

# One trivial request is enough to prove that the key, the model name and the
# network path all work before the long generation loop is started.
smoke_messages = [{"role": "user", "content": "Say hello in Spanish"}]

try:
    response = client.chat(model="command-a-03-2025", messages=smoke_messages)

    # The reply arrives as a list of content parts; glue their text together.
    output_text = ''.join(part.text for part in response.message.content)
    print("API test succeeded. Response:")
    print(output_text)

    # Also confirm that the summaries folder is writable.
    with open(test_file_path, "w", encoding="utf-8") as test_file:
        test_file.write(output_text + "\n")
    print(f"Output written to: {test_file_path}")
except Exception as exc:
    # Best-effort check: report the problem instead of stopping the notebook.
    print("Cohere API test failed:")
    print(exc)


API test succeeded. Response:
¡Hola!
Output written to: /content/drive/MyDrive/CodeClarity/summaries/test.txt


In [None]:
# Required Libraries
import cohere
from datasets import load_dataset, get_dataset_config_names
import json
import os
import time
from tqdm import tqdm

# Base folder for everything this run writes. This is the notebook's current
# working directory (os.getcwd()), not the location of a script file.
script_dir = os.getcwd()

# Sampling settings
SPLIT = "validation"  # dataset split the functions are drawn from
MAX_SAMPLES = 1  # Adjust as needed (functions summarized per programming language)

# Natural languages the explanations are requested in (from multilingual study)
natural_languages = [
    "Spanish",
    "Mandarin Chinese",
    "Arabic",
    "Swahili",
    "Yoruba",
    "Tamil",
    "Hindi",
    "Portuguese",
    "Filipino",
    "French",
]

# Programming languages = the dataset's config names, minus the "all" config
# (presumably the combined-languages config - it is skipped so every language
# is processed on its own).
code_languages = [
    config_name
    for config_name in get_dataset_config_names("code_search_net")
    if config_name != "all"
]

# Folder that receives one .jsonl file per (programming, natural) language pair
output_dir = os.path.join(script_dir, "summaries")
os.makedirs(output_dir, exist_ok=True)

# Error log next to the summaries folder; a log left over from a previous run
# is deleted so the file only ever describes the current run.
error_log_path = os.path.join(script_dir, "errors.log")
if os.path.exists(error_log_path):
    os.remove(error_log_path)

# Request settings, named once so they are easy to find and tune.
MODEL_NAME = "command-a-03-2025"
TEMPERATURE = 0.2
MAX_ATTEMPTS = 3  # tries per example before giving up
RETRY_BASE_DELAY = 2  # seconds to wait after the first failure; doubled after each further one
FAILURE_PLACEHOLDER = "ERROR: Failed to generate summary"

SYSTEM_PROMPT = (
    "You are helping someone who doesn't know how to code. "
    "Explain what the function does in simple terms."
)


def _log_error(message):
    """Print `message` and append it as one line to the file at `error_log_path`."""
    print(message)
    with open(error_log_path, "a", encoding="utf-8") as err_log:
        err_log.write(message + "\n")


def _request_summary(messages, context):
    """Ask the chat model for a summary, retrying when the request fails.

    Args:
        messages: Chat messages passed unchanged to `client.chat`.
        context: Short label identifying the example; used in error messages.

    Returns:
        The text of all content parts of the reply joined together, or None
        when all MAX_ATTEMPTS attempts raised an exception.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = client.chat(
                model=MODEL_NAME,
                messages=messages,
                temperature=TEMPERATURE,
            )
            return ''.join(part.text for part in response.message.content)
        except Exception as exc:
            _log_error(f"[{context}] Attempt {attempt} failed: {exc}")
            # Back off before the next try, but do not wait after the last
            # attempt: there is nothing left to retry.
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_BASE_DELAY * 2 ** (attempt - 1))
    return None


for code_lang in code_languages:
    print(f"\nLanguage: {code_lang}")
    # NOTE(review): trust_remote_code=True lets the dataset's own loading script
    # run on this machine; keep it only for dataset sources you trust.
    dataset = load_dataset("code_search_net", code_lang, split=SPLIT, trust_remote_code=True)
    # Fixed seed: the same functions are picked on every run, and the same
    # sample is reused for every natural language below.
    samples = dataset.shuffle(seed=42).select(range(min(MAX_SAMPLES, len(dataset))))

    for natural_lang in natural_languages:
        print(f"  Summarizing in: {natural_lang}")

        lang_slug = natural_lang.lower().replace(' ', '_')
        filename = os.path.join(output_dir, f"summary_{code_lang}_{lang_slug}.jsonl")

        skipped_count = 0  # examples without any code to explain
        failed_count = 0   # examples written with the failure placeholder

        with open(filename, "w", encoding="utf-8") as f:
            for i, example in enumerate(tqdm(samples, desc=f"{code_lang} → {natural_lang}", unit="fn")):
                code_snippet = example.get("func_code_string")
                docstring = example.get("func_documentation_string")
                if not code_snippet:
                    skipped_count += 1
                    continue

                messages = [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {
                        "role": "user",
                        "content": f"Can you explain what this function does in {natural_lang}?\n\n{code_snippet}"
                    }
                ]

                summary = _request_summary(messages, f"{code_lang} | {natural_lang} | Example {i}")
                if not summary:
                    # Covers both "every attempt failed" and "empty reply".
                    failed_count += 1

                record = {
                    "code": code_snippet,
                    "docstring": docstring,
                    "summary": summary if summary else FAILURE_PLACEHOLDER
                }

                # One JSON object per line; keep non-ASCII text readable in the file.
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

        print(f"Saved: {filename}")
        # Surface problems in the cell output instead of leaving them only in
        # the log file / inside the .jsonl records.
        if skipped_count or failed_count:
            print(f"    Skipped (no code): {skipped_count} | Failed summaries: {failed_count}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]


Language: java


java.zip:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/454451 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/26909 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15328 [00:00<?, ? examples/s]

  Summarizing in: Spanish


java → Spanish: 100%|██████████| 1/1 [00:07<00:00,  7.66s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_spanish.jsonl
  Summarizing in: Mandarin Chinese


java → Mandarin Chinese: 100%|██████████| 1/1 [00:04<00:00,  4.93s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_mandarin_chinese.jsonl
  Summarizing in: Arabic


java → Arabic: 100%|██████████| 1/1 [00:08<00:00,  8.74s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_arabic.jsonl
  Summarizing in: Swahili


java → Swahili: 100%|██████████| 1/1 [00:08<00:00,  8.69s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_swahili.jsonl
  Summarizing in: Yoruba


java → Yoruba: 100%|██████████| 1/1 [00:16<00:00, 16.47s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_yoruba.jsonl
  Summarizing in: Tamil


java → Tamil: 100%|██████████| 1/1 [00:30<00:00, 30.66s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_tamil.jsonl
  Summarizing in: Hindi


java → Hindi: 100%|██████████| 1/1 [00:12<00:00, 12.94s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_hindi.jsonl
  Summarizing in: Portuguese


java → Portuguese: 100%|██████████| 1/1 [00:11<00:00, 11.07s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_portuguese.jsonl
  Summarizing in: Filipino


java → Filipino: 100%|██████████| 1/1 [00:11<00:00, 11.88s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_filipino.jsonl
  Summarizing in: French


java → French: 100%|██████████| 1/1 [00:09<00:00,  9.50s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_java_french.jsonl

Language: go


go.zip:   0%|          | 0.00/488M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/317832 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14291 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14242 [00:00<?, ? examples/s]

  Summarizing in: Spanish


go → Spanish: 100%|██████████| 1/1 [00:06<00:00,  6.13s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_spanish.jsonl
  Summarizing in: Mandarin Chinese


go → Mandarin Chinese: 100%|██████████| 1/1 [00:05<00:00,  5.10s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_mandarin_chinese.jsonl
  Summarizing in: Arabic


go → Arabic: 100%|██████████| 1/1 [00:08<00:00,  8.60s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_arabic.jsonl
  Summarizing in: Swahili


go → Swahili: 100%|██████████| 1/1 [00:09<00:00,  9.51s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_swahili.jsonl
  Summarizing in: Yoruba


go → Yoruba: 100%|██████████| 1/1 [00:09<00:00,  9.24s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_yoruba.jsonl
  Summarizing in: Tamil


go → Tamil: 100%|██████████| 1/1 [00:17<00:00, 17.70s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_tamil.jsonl
  Summarizing in: Hindi


go → Hindi: 100%|██████████| 1/1 [00:12<00:00, 12.05s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_hindi.jsonl
  Summarizing in: Portuguese


go → Portuguese: 100%|██████████| 1/1 [00:07<00:00,  7.20s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_portuguese.jsonl
  Summarizing in: Filipino


go → Filipino: 100%|██████████| 1/1 [00:07<00:00,  7.30s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_filipino.jsonl
  Summarizing in: French


go → French: 100%|██████████| 1/1 [00:06<00:00,  6.44s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_go_french.jsonl

Language: python


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

  Summarizing in: Spanish


python → Spanish: 100%|██████████| 1/1 [00:08<00:00,  8.35s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_spanish.jsonl
  Summarizing in: Mandarin Chinese


python → Mandarin Chinese: 100%|██████████| 1/1 [00:07<00:00,  7.85s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_mandarin_chinese.jsonl
  Summarizing in: Arabic


python → Arabic: 100%|██████████| 1/1 [00:10<00:00, 10.79s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_arabic.jsonl
  Summarizing in: Swahili


python → Swahili: 100%|██████████| 1/1 [00:13<00:00, 13.15s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_swahili.jsonl
  Summarizing in: Yoruba


python → Yoruba: 100%|██████████| 1/1 [00:15<00:00, 15.61s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_yoruba.jsonl
  Summarizing in: Tamil


python → Tamil: 100%|██████████| 1/1 [00:28<00:00, 28.66s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_tamil.jsonl
  Summarizing in: Hindi


python → Hindi: 100%|██████████| 1/1 [00:16<00:00, 16.54s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_hindi.jsonl
  Summarizing in: Portuguese


python → Portuguese: 100%|██████████| 1/1 [00:09<00:00,  9.51s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_portuguese.jsonl
  Summarizing in: Filipino


python → Filipino: 100%|██████████| 1/1 [00:08<00:00,  8.95s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_filipino.jsonl
  Summarizing in: French


python → French: 100%|██████████| 1/1 [00:09<00:00,  9.82s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_python_french.jsonl

Language: javascript


javascript.zip:   0%|          | 0.00/1.66G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/123889 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6483 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8253 [00:00<?, ? examples/s]

  Summarizing in: Spanish


javascript → Spanish: 100%|██████████| 1/1 [00:08<00:00,  8.33s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_spanish.jsonl
  Summarizing in: Mandarin Chinese


javascript → Mandarin Chinese: 100%|██████████| 1/1 [00:06<00:00,  6.63s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_mandarin_chinese.jsonl
  Summarizing in: Arabic


javascript → Arabic: 100%|██████████| 1/1 [00:10<00:00, 10.52s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_arabic.jsonl
  Summarizing in: Swahili


javascript → Swahili: 100%|██████████| 1/1 [00:11<00:00, 11.84s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_swahili.jsonl
  Summarizing in: Yoruba


javascript → Yoruba: 100%|██████████| 1/1 [00:13<00:00, 13.09s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_yoruba.jsonl
  Summarizing in: Tamil


javascript → Tamil: 100%|██████████| 1/1 [00:24<00:00, 24.54s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_tamil.jsonl
  Summarizing in: Hindi


javascript → Hindi: 100%|██████████| 1/1 [00:14<00:00, 14.84s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_hindi.jsonl
  Summarizing in: Portuguese


javascript → Portuguese: 100%|██████████| 1/1 [00:07<00:00,  7.73s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_portuguese.jsonl
  Summarizing in: Filipino


javascript → Filipino: 100%|██████████| 1/1 [00:09<00:00,  9.37s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_filipino.jsonl
  Summarizing in: French


javascript → French: 100%|██████████| 1/1 [00:09<00:00,  9.09s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_javascript_french.jsonl

Language: ruby


ruby.zip:   0%|          | 0.00/112M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/48791 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2279 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2209 [00:00<?, ? examples/s]

  Summarizing in: Spanish


ruby → Spanish: 100%|██████████| 1/1 [00:06<00:00,  6.83s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_spanish.jsonl
  Summarizing in: Mandarin Chinese


ruby → Mandarin Chinese: 100%|██████████| 1/1 [00:07<00:00,  7.53s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_mandarin_chinese.jsonl
  Summarizing in: Arabic


ruby → Arabic: 100%|██████████| 1/1 [00:11<00:00, 11.10s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_arabic.jsonl
  Summarizing in: Swahili


ruby → Swahili: 100%|██████████| 1/1 [00:16<00:00, 16.69s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_swahili.jsonl
  Summarizing in: Yoruba


ruby → Yoruba: 100%|██████████| 1/1 [00:14<00:00, 14.44s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_yoruba.jsonl
  Summarizing in: Tamil


ruby → Tamil: 100%|██████████| 1/1 [00:28<00:00, 28.52s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_tamil.jsonl
  Summarizing in: Hindi


ruby → Hindi: 100%|██████████| 1/1 [00:09<00:00,  9.13s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_hindi.jsonl
  Summarizing in: Portuguese


ruby → Portuguese: 100%|██████████| 1/1 [00:06<00:00,  6.63s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_portuguese.jsonl
  Summarizing in: Filipino


ruby → Filipino: 100%|██████████| 1/1 [00:10<00:00, 10.79s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_filipino.jsonl
  Summarizing in: French


ruby → French: 100%|██████████| 1/1 [00:08<00:00,  8.29s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_ruby_french.jsonl

Language: php


php.zip:   0%|          | 0.00/852M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/523712 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/28391 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26015 [00:00<?, ? examples/s]

  Summarizing in: Spanish


php → Spanish: 100%|██████████| 1/1 [00:10<00:00, 10.87s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_spanish.jsonl
  Summarizing in: Mandarin Chinese


php → Mandarin Chinese: 100%|██████████| 1/1 [00:10<00:00, 10.88s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_mandarin_chinese.jsonl
  Summarizing in: Arabic


php → Arabic: 100%|██████████| 1/1 [00:10<00:00, 10.98s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_arabic.jsonl
  Summarizing in: Swahili


php → Swahili: 100%|██████████| 1/1 [00:13<00:00, 13.54s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_swahili.jsonl
  Summarizing in: Yoruba


php → Yoruba: 100%|██████████| 1/1 [00:16<00:00, 16.73s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_yoruba.jsonl
  Summarizing in: Tamil


php → Tamil: 100%|██████████| 1/1 [00:26<00:00, 26.02s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_tamil.jsonl
  Summarizing in: Hindi


php → Hindi: 100%|██████████| 1/1 [00:18<00:00, 18.74s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_hindi.jsonl
  Summarizing in: Portuguese


php → Portuguese: 100%|██████████| 1/1 [00:11<00:00, 11.05s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_portuguese.jsonl
  Summarizing in: Filipino


php → Filipino: 100%|██████████| 1/1 [00:12<00:00, 12.01s/fn]


Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_filipino.jsonl
  Summarizing in: French


php → French: 100%|██████████| 1/1 [00:10<00:00, 10.92s/fn]

Saved: /content/drive/MyDrive/CodeClarity/summaries/summary_php_french.jsonl



