Takes the Kaggle dataset 'leetcode-solutions'
(https://www.kaggle.com/datasets/erichartford/leetcode-solutions) and turns it into basic
dialogue using a preset list of user prompt templates.

In [None]:
# User-prompt templates for single-turn dialogues.
# `${lang}` and `${content}` are literal placeholder text (not Python format
# fields); they are substituted with `str.replace` when the INSTRUCTION column
# is built. Each template ends with a newline followed by `${content}`.
ONE_STEP_TEMPLATES = [
    "Can you write a program in ${lang} where\n${content}",
    "How would you implement a function in ${lang} that\n${content}",
    "Write a ${lang} function for\n${content}",
    "Can you create a ${lang} program that\n${content}",
    "Implement a function in ${lang} to\n${content}",
    "Write a ${lang} script for\n${content}",
    "How would you code a program in ${lang} to\n${content}",
    "Create a ${lang} function for\n${content}",
    "Write a ${lang} program that can\n${content}",
    "Can you implement a function in ${lang} that\n${content}",
]

In [None]:
import os
import kaggle
import pandas as pd
import random
from IPython.display import display
from datasets import Dataset
import requests

# Dataset page URL; recorded in the SOURCE column of the Kaggle-derived rows.
data_source = "https://www.kaggle.com/datasets/erichartford/leetcode-solutions"
# Raw JSON file with the LeetCode contest submissions, fetched over HTTP.
lc_contests_data_source = "https://github.com/Nan-Do/LeetCodeContestsDataset/raw/main/submissions.json"

# Local data directory; created up front, and left untouched if it already exists.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Fetch the Kaggle archive and unzip it into the data directory.
kaggle.api.dataset_download_files("erichartford/leetcode-solutions", path=output_dir, unzip=True)

# Stream the contests JSON to disk.
#  - stream=True keeps the body from being buffered in memory before it is
#    iterated chunk by chunk.
#  - raise_for_status() stops an HTTP error page from being saved as if it
#    were the dataset (the file is only opened after the status check).
#  - timeout keeps the cell from hanging forever on a stalled connection.
lc_contests_path = os.path.join(output_dir, "lc_contests.json")
with requests.get(lc_contests_data_source, allow_redirects=True, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(lc_contests_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip empty keep-alive chunks
                f.write(chunk)

In [None]:
# Load the two downloaded files from the data directory configured above
# (`output_dir`), rather than repeating the directory name in each path.
# The solutions file is JSON Lines (one record per line); the contests file
# is a regular JSON document.
leetcode_solutions = pd.read_json(os.path.join(output_dir, "leetcode-solutions.jsonl"), lines=True)
leetcode_contests = pd.read_json(os.path.join(output_dir, "lc_contests.json"))

# Build the records for the Kaggle-derived dataset. Each record has three fields:
#   INSTRUCTION - a template picked from ONE_STEP_TEMPLATES with the language
#                 and the row's `content` substituted in
#   RESPONSE    - the entry of the row's `answer` for that language
#   SOURCE      - the URL of the dataset
# The template is picked with a dedicated, seeded generator so that re-running
# the notebook selects the same template for every row and the generated
# dataset is reproducible (the global `random` state is left untouched).
TEMPLATE_SEED = 42
template_rng = random.Random(TEMPLATE_SEED)

oa_leet10k = []
for _, row in leetcode_solutions.iterrows():
    content = row["content"]
    answers = row["answer"]
    for lang in ["c++", "java", "javascript", "python"]:
        # Only emit a record for languages this row actually has an answer for.
        if lang not in answers:
            continue
        instruction = (
            template_rng.choice(ONE_STEP_TEMPLATES)
            .replace("${lang}", lang)
            .replace("${content}", content)
        )
        oa_leet10k.append(
            {
                "INSTRUCTION": instruction,
                "RESPONSE": answers[lang],
                "SOURCE": data_source,
            }
        )

# Turn every contest row into one record: the instruction and the input are
# joined with a newline to form INSTRUCTION, the output becomes RESPONSE, and
# SOURCE points at the GitHub repository of the contests dataset.
oa_leetcode_contests = [
    {
        "INSTRUCTION": submission["instruction"] + "\n" + submission["input"],
        "RESPONSE": submission["output"],
        "SOURCE": "https://github.com/Nan-Do/LeetCodeContestsDataset",
    }
    for _, submission in leetcode_contests.iterrows()
]

# Promote both record lists to DataFrames and report how many rows each holds.
oa_leet10k = pd.DataFrame(oa_leet10k)
oa_leetcode_contests = pd.DataFrame(oa_leetcode_contests)

print(f"oa_leet10k: {oa_leet10k.shape[0]}, oa_leetcode_contests: {oa_leetcode_contests.shape[0]}")

# Preview the first 5 rows of each frame. Cells are left-aligned and styled
# with `white-space: pre-wrap` so newline characters in the text show up as
# line breaks. `display.max_colwidth` is set to 80 only for the duration of
# the display call.
preview_css = {"text-align": "left", "white-space": "pre-wrap"}
with pd.option_context("display.max_colwidth", 80):
    display(
        *(
            frame.head(5).style.set_properties(**preview_css)
            for frame in (oa_leet10k, oa_leetcode_contests)
        )
    )

In [None]:
# Write each frame to a Parquet file in the working directory and load it back
# as a `datasets.Dataset`, ready to be pushed to the Hugging Face Hub.
parquet_options = {"row_group_size": 100, "engine": "pyarrow"}

oa_leet10k.to_parquet("oa_leet10k.parquet", **parquet_options)
ds_leet10k = Dataset.from_parquet("oa_leet10k.parquet")

oa_leetcode_contests.to_parquet("oa_leetcode_contests.parquet", **parquet_options)
ds_leetcode_contests = Dataset.from_parquet("oa_leetcode_contests.parquet")

# The upload itself is left disabled; uncomment to push the datasets to HF.
# NOTE(review): both calls below target the same repo id ("ehartford/oa_leet10k");
# confirm whether the contests dataset is meant to go to its own repo.
# ds_leet10k.push_to_hub("ehartford/oa_leet10k")
# ds_leetcode_contests.push_to_hub("ehartford/oa_leet10k")