### REQUIRED PIP INSTALLS

In [None]:
# Install the third-party packages used by the cells below.
# NOTE(review): later cells also import `torch` and `huggingface_hub`, which are not
# listed here — presumably they are already available in the runtime; confirm.
!pip install transformers kaggle datasets tqdm

### DOWNLOAD THE DATASET

In [None]:
from google.colab import files

# Upload a file from the local machine into the Colab working directory.
# The next cell copies `kaggle.json` from the working directory, so that is the
# file expected to be uploaded here.
files.upload()

In [None]:
# Install the Kaggle API token where the kaggle client looks for it.
# `-p` keeps the cell re-runnable: plain `mkdir` fails when ~/.kaggle already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token to the current user (read/write for the owner only).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import kaggle
from pathlib import Path


def download_dataset_from_kaggle(path="data"):
    """
    Download the CodeSearchNet dataset from Kaggle and unzip it into `path`.

    Make sure to have the Kaggle API token in ~/.kaggle/kaggle.json

    Args:
        path (str, optional): Directory the dataset is downloaded and extracted
            into. It is created (including parents) if missing. Defaults to "data".

    Returns:
        str: Path to the downloaded dataset.
    """
    path = Path(path)
    # Create the target directory itself; the previous `path.parent.mkdir(...)`
    # only created its parent and left `path` missing.
    path.mkdir(parents=True, exist_ok=True)
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files("omduggineni/codesearchnet", path=str(path), unzip=True)
    # Honour the documented contract: hand the location back to the caller.
    return str(path)


# Fetch the dataset using the function's default target path ("data").
download_dataset_from_kaggle()

### LOAD THE DATASET

In [None]:
import glob

from datasets import load_dataset
from pathlib import Path


def load_local_dataset(lang="all", path="data"):
    """
    Load a local dataset from the downloaded Kaggle dataset.

    The JSON-lines files of each split are collected with `glob` and always
    passed to `load_dataset` in sorted order, so the row order of the resulting
    dataset is reproducible across runs and filesystems.

    Args:
        lang (str): The language to be used for the dataset. "all" collects the
            files of every language found (recursively) under `path`; any other
            value only reads `<path>/<lang>/<lang>/final/jsonl`.
        path (str, optional): Path to the downloaded dataset. Defaults to "data".

    Returns:
        Dataset: dataset loaded from local files, with the splits "train",
        "validation" and "test".
    """
    root = Path(path)

    if lang != "all":
        # Single language: the split directories sit directly below this folder.
        root = root / lang / lang / "final/jsonl"
        pattern = "/{split_dir}/*.jsonl"
    else:
        # Every language: search the whole download tree for split directories.
        pattern = "/**/{split_dir}/*.jsonl"

    # Split name exposed by the dataset -> directory name used on disk.
    split_dirs = {"train": "train", "validation": "valid", "test": "test"}

    # glob returns files in arbitrary (filesystem) order, hence the explicit sort.
    # `recursive=True` only affects patterns that contain "**".
    data_files = {
        split: sorted(glob.glob(root.as_posix() + pattern.format(split_dir=split_dir), recursive=True))
        for split, split_dir in split_dirs.items()
    }

    dataset = load_dataset("json", data_files=data_files)

    return dataset


# Load only the "python" subset; later cells index `dataset` by split name
# ("train", "test", "validation").
dataset = load_local_dataset("python")

### LOAD THE MODEL

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# The tokenizer is loaded from the base CodeT5 checkpoint; inputs that need
# truncation are cut on the right-hand side (truncation_side="right").
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-base", truncation_side="right")
# The model weights come from a different checkpoint — judging by its name, one tuned for
# Python code summarization (TODO confirm). The model is moved to "cuda", so this cell
# needs a GPU runtime.
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base-codexglue-sum-python").to("cuda")

### GENERATE THE SUMMARIES AND ANNOTATE THE DATASET

In [None]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import json

# Number of samples the DataLoader groups into each batch passed to model.generate().
BATCH_SIZE = 16
# NOTE(review): this initial value is overwritten by the `for PARTITION in [...]` loop below.
PARTITION = "train"
# all_summaries = []


def tokenization_collator(batch_sample):
    """
    Collate function that turns a batch of dataset rows into model input ids.

    Args:
        batch_sample: Sequence of rows, each providing the source code under
            the "original_string" key.

    Returns:
        The tokenizer's `input_ids` for the whole batch, padded to the longest
        sequence in the batch, truncated where needed and moved to "cuda".
    """
    sources = [row["original_string"] for row in batch_sample]
    encoded = tokenizer(sources, return_tensors="pt", padding="longest", truncation=True)
    input_ids = encoded.input_ids
    return input_ids.to("cuda")


# For every split, generate one summary per dataset row and store the summaries
# as JSON lines in "<split>.jsonl". Line i of that file belongs to row i of the
# split, which is what the merge step relies on.
for PARTITION in ["train", "test", "validation"]:
    dataloader = DataLoader(dataset[PARTITION], batch_size=BATCH_SIZE, collate_fn=tokenization_collator)

    with open(PARTITION + ".jsonl", "w") as f:
        # len(dataloader) is the exact number of batches; the previous
        # `len(dataset) // BATCH_SIZE + 1` over-counted by one whenever the
        # split size was a multiple of BATCH_SIZE.
        for batch_data in tqdm(dataloader, total=len(dataloader)):
            generated_ids = model.generate(batch_data, max_length=512)
            # Decode the (batch, sequence) tensor as is. Do NOT squeeze it: for a
            # final batch holding a single sample, squeeze() removes the batch
            # dimension and batch_decode would then decode every token as its own
            # "summary", writing extra lines and misaligning them with the rows.
            summaries = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            for summary in summaries:
                f.write(json.dumps({"summary": summary}))
                f.write("\n")

In [None]:
import gzip

# Merge every dataset row with its generated summary and write all splits, one
# JSON object per line, into a single gzip-compressed file.
with gzip.open("dataset.jsonl.gz", "w") as w:
    for PARTITION in ["train", "test", "validation"]:
        # Look the split up once instead of once per row.
        partition_rows = dataset[PARTITION]
        with open(PARTITION + ".jsonl") as f:
            # Iterate the file lazily; readlines() would load every summary of
            # the split into memory first.
            for line_pos, line in enumerate(f):
                # Line N of the summaries file corresponds to row N of the split.
                d = partition_rows[line_pos].copy()
                d.update(json.loads(line))
                b = json.dumps(d) + "\n"
                # The gzip file is opened in binary mode, so encode explicitly.
                w.write(b.encode("UTF-8"))

### PUSH THE DATASET INTO HUGGINGFACE'S HUB

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook; the push in the
# next cell presumably relies on this login.
notebook_login()

In [None]:
# Reload the merged file written by the previous cell (all three partitions were
# written into this one file) and publish it to the Hub repository named below.
# `load_dataset` is the function imported in the "LOAD THE DATASET" cell.
processed_dataset = load_dataset("json", data_files="./dataset.jsonl.gz")
processed_dataset.push_to_hub("Nan-Do/codesearchnet_python")