In [1]:
### ! This notebook has been executed on kaggle

In [2]:
import random

from datasets import concatenate_datasets, load_dataset

In [None]:
# Alpaca data file, read straight from the raw GitHub URL with the generic "json" loader.
ALPACA_URL = "https://raw.githubusercontent.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/main/data/alpaca_gpt4_data.json"
# The result is indexed by split name; later cells read ds_alpaca["train"].
ds_alpaca = load_dataset("json", data_files=ALPACA_URL)

In [None]:
def prompt_no_input(row):
    """Render the Alpaca template for a row that has no additional input.

    Args:
        row: Mapping with ``"instruction"`` and ``"output"`` entries; both are
            coerced with ``str`` before being placed in the template.

    Returns:
        A dict with the single key ``"prompt"`` holding the rendered text.
    """
    instruction = str(row["instruction"])
    answer = str(row["output"])
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
    )
    body = f"### Instruction:\n{instruction}\n\n### Response:\n{answer}"
    return {"prompt": preamble + body}


def prompt_input(row):
    """Render the Alpaca template for a row that carries an extra input field.

    Args:
        row: Mapping with ``"instruction"``, ``"input"`` and ``"output"``
            entries; each is coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"`` holding the rendered text.
    """
    instruction = str(row["instruction"])
    context = str(row["input"])
    answer = str(row["output"])
    preamble = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
    )
    body = f"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n### Response:\n{answer}"
    return {"prompt": preamble + body}


def create_alpaca_prompt(row):
    """Choose the Alpaca template for ``row``.

    Rows whose ``"input"`` equals the empty string use the no-input template;
    every other row uses the template that includes the input section.
    """
    if row["input"] == "":
        return prompt_no_input(row)
    return prompt_input(row)


# Build the "prompt" column, drop the raw fields it was built from, and tag each row's origin.
ds_alpaca = (
    ds_alpaca.map(create_alpaca_prompt)
    .remove_columns(["instruction", "output", "input"])
    .map(lambda example: {"source": "alpaca"})
)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# Load Dolly 15k from the Hugging Face Hub.
ds_dolly = load_dataset("databricks/databricks-dolly-15k")
# Bare expression: shows the splits, column names and row counts as the cell output.
ds_dolly

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 15011
    })
})

In [None]:
# Task sentences for information-extraction rows; one is drawn at random per row.
instruct_information_extraction = [
    "Extract key information from the following context and answer the question provided.",
    "Based on the provided context, answer the following question.",
    "Given the provided context, provide an answer to the following question.",
    "Identify the main details in the provided context and answer the associated question.",
    "From the given context, extract crucial information to respond to the inquiry.",
    "Answer the question based on the context below. Keep the answer short and concise.",
]


def prompt_information_extraction(row):
    """Render an information-extraction prompt for one row.

    Args:
        row: Mapping with ``"instruction"``, ``"context"`` and ``"response"``
            entries; each is coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence followed by the instruction, input and response sections.
    """
    lead_in = random.choice(instruct_information_extraction)
    question = str(row["instruction"])
    passage = str(row["context"])
    answer = str(row["response"])
    sections = f"\n\n### Instruction:\n{question}\n\n### Input:\n{passage}\n\n### Response:\n{answer}"
    return {"prompt": lead_in + sections}


def prompt_general_qa(row):
    """Render a plain instruction/response prompt with no task sentence.

    Args:
        row: Mapping with ``"instruction"`` and ``"response"`` entries; both
            are coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``.
    """
    question = str(row["instruction"])
    answer = str(row["response"])
    return {"prompt": f"### Instruction:\n{question}\n\n### Response:\n{answer}"}


# Task sentences for summarization rows; one is drawn at random per row.
summarization_prompts = [
    "Summarize the following text.",
    "Provide a concise summary of the given passage.",
    "Condense the provided information into a brief summary.",
    "Offer a summary of the following content.",
    "Extract the key points from the text and provide a summary.",
    "Compose a short summary of the provided passage.",
    "Condense the following text into a concise summary.",
]


def prompt_summarization(row):
    """Render a summarization prompt for one row.

    Args:
        row: Mapping with ``"instruction"``, ``"context"`` and ``"response"``
            entries; each is coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence followed by the instruction, input and response sections.
    """
    lead_in = random.choice(summarization_prompts)
    request = str(row["instruction"])
    passage = str(row["context"])
    summary = str(row["response"])
    sections = f"\n\n### Instruction:\n{request}\n\n### Input:\n{passage}\n\n### Response:\n{summary}"
    return {"prompt": lead_in + sections}


# Task sentences for creative-writing rows; one is drawn at random per row.
creative_writing_prompts = [
    "Let your imagination run wild and create a story inspired by the following prompt.",
    "Craft a piece of creative writing based on the provided inspiration.",
    "Use the following prompt as inspiration to create an original story.",
    "Write a short story or poem inspired by the following prompt.",
    "Engage your creativity and compose a piece of writing based on the provided stimulus.",
    "Create an original narrative inspired by the following prompt.",
    "Let your creativity flow and write a story using the following prompt as inspiration.",
]


def prompt_creative_writing(row):
    """Render a creative-writing prompt for one row.

    Args:
        row: Mapping with ``"instruction"`` and ``"response"`` entries; both
            are coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence, a blank line, then the instruction and response sections.
    """
    return {
        "prompt": (
            random.choice(creative_writing_prompts)
            # The blank line keeps the task sentence from running straight into the
            # "### Instruction:" header, matching the other templates in this notebook.
            + f"\n\n### Instruction:\n{str(row['instruction'])}\n\n### Response:\n{str(row['response'])}"
        )
    }


def prompt_generator(row):
    """Dispatch a row to the prompt builder that matches its ``"category"``.

    Returns whatever the selected builder returns. A category with no entry in
    the routing table falls through and yields ``None``.
    """
    category = row["category"]
    routes = (
        ("information_extraction", prompt_information_extraction),
        ("open_qa", prompt_general_qa),
        ("general_qa", prompt_general_qa),
        ("brainstorming", prompt_general_qa),
        ("summarization", prompt_summarization),
        ("creative_writing", prompt_creative_writing),
    )
    for name, builder in routes:
        if category == name:
            return builder(row)
    return None

In [None]:
# Keep only rows whose category is in the list below; every other category is dropped.
# NOTE(review): "brainstorming" is not listed here although prompt_generator has a
# branch for it, so that branch is never reached — confirm which side is intended.
ds_dolly = ds_dolly.filter(
    lambda sample: sample["category"]
    in ["information_extraction", "open_qa", "general_qa", "summarization", "creative_writing"]
)

In [None]:
# Build the "prompt" column for each remaining Dolly row, then drop the raw columns.
ds_dolly = ds_dolly.map(prompt_generator)
ds_dolly = ds_dolly.remove_columns(["instruction", "context", "response", "category"])
# Tag these rows as Dolly so they stay distinguishable from the LaMini rows,
# which carry the "lamini" source tag.
ds_dolly = ds_dolly.map(lambda example: {"source": "dolly"})

In [None]:
# LaMini rows use the plain instruction/response template; the first map runs on 30 workers.
ds_lamini = (
    load_dataset("MBZUAI/LaMini-instruction")
    .map(prompt_general_qa, num_proc=30)
    .remove_columns(["instruction", "response", "instruction_source"])
    .map(lambda example: {"source": "lamini"})
)

Map:   0%|          | 0/2585615 [00:00<?, ? examples/s]

In [None]:
# Task sentences for grammar-correction rows; one is drawn at random per row.
grammar_correction_prompts = [
    "Correct",
    "Identify and correct the grammatical errors in the following passage.",
    "Proofread the following text and correct any grammar mistakes.",
    "Improve the grammar of the provided passage by making necessary corrections.",
    "Correct the grammar errors in the following passage to improve readability.",
    "Edit the following text to ensure proper grammar and sentence structure.",
    "Identify grammatical mistakes in the following passage and provide corrections.",
    "Review the following text and rectify any grammatical errors.",
]


def prompt_grammar_correction(row):
    """Render a grammar-correction prompt for one row.

    Args:
        row: Mapping with ``"input"`` (text placed under the instruction header)
            and ``"output"`` (text placed under the response header); both are
            coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence, a blank line, then the instruction and response sections.
    """
    return {
        "prompt": (
            random.choice(grammar_correction_prompts)
            # Without this blank line the output reads e.g. "Correct### Instruction:".
            + f"\n\n### Instruction:\n{str(row['input'])}\n\n### Response:\n{str(row['output'])}"
        )
    }

In [None]:
## Grammar

In [None]:
# Stream the corpus and keep only the leading records in memory: the loop stops
# after appending the record at index 200_000, so 200_001 records are collected.
ds_grammar = load_dataset("liweili/c4_200m", split="train", streaming=True)
bucket = []
for index, record in enumerate(ds_grammar):
    bucket.append(record)
    if index == int(2e5):
        break

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
import datasets

# Turn the in-memory list of streamed records into a regular Dataset (rebinds ds_grammar).
ds_grammar = datasets.Dataset.from_list(bucket)

In [None]:
# Build prompts on 35 workers, drop the raw columns, and tag each row's origin.
ds_grammar = (
    ds_grammar.map(prompt_grammar_correction, num_proc=35)
    .remove_columns(["input", "output"])
    .map(lambda example: {"source": "grammar"})
)

Map (num_proc=35):   0%|          | 0/200001 [00:00<?, ? examples/s]

Map:   0%|          | 0/200001 [00:00<?, ? examples/s]

In [None]:
# Task sentences for text-rewriting rows; one is drawn at random per row.
text_rewriting_prompts = [
    "Rewrite",
    "Rewrite the following text to improve clarity and conciseness.",
    "Paraphrase the provided passage to convey the same meaning in different words.",
    "Revise the following text to enhance coherence and readability.",
    "Rephrase the provided content while retaining the original meaning.",
    "Create a revised version of the following passage with improved structure and flow.",
    "Reword the following text to make it more engaging and understandable.",
    "Provide an alternative version of the following passage with improved language.",
]


def prompt_text_rewriting(row):
    """Render a text-rewriting prompt for one row.

    Args:
        row: Mapping with ``"dialogue"`` (placed under the instruction header)
            and ``"summary"`` (placed under the response header); both are
            coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence, a blank line, then the instruction and response sections.
    """
    return {
        "prompt": (
            random.choice(text_rewriting_prompts)
            # Without this blank line the output reads e.g. "Rewrite### Instruction:".
            + f"\n\n### Instruction:\n{str(row['dialogue'])}\n\n### Response:\n{str(row['summary'])}"
        )
    }

In [None]:
# SAMSum rows: dialogue goes under the instruction header, summary under the response header.
ds_samsum = (
    load_dataset("samsum")
    .map(prompt_text_rewriting)
    .remove_columns(["id", "dialogue", "summary"])
    .map(lambda example: {"source": "samsum"})
)

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
# Stack every prepared source into one dataset. Order: grammar, then the "train"
# split of Alpaca, Dolly and LaMini, then all three SAMSum splits.
ds = concatenate_datasets(
    [
        ds_grammar,
        *(splits["train"] for splits in (ds_alpaca, ds_dolly, ds_lamini)),
        *(ds_samsum[name] for name in ("train", "test", "validation")),
    ]
)

In [None]:
# Write the combined dataset to the relative directory "finetune-dataset".
ds.save_to_disk("finetune-dataset")

Saving the dataset (0/3 shards):   0%|          | 0/2863323 [00:00<?, ? examples/s]

In [None]:
# Shell escape: recursively archive the saved dataset directory into file.zip.
!zip -r file.zip 'finetune-dataset/'
# NOTE(review): FileLink is imported but not called in any visible cell — presumably
# intended for producing a download link; confirm or remove.
from IPython.display import FileLink

updating: finetune-dataset/ (stored 0%)
updating: finetune-dataset/data-00001-of-00003.arrow (deflated 65%)
updating: finetune-dataset/dataset_info.json (deflated 43%)
updating: finetune-dataset/data-00002-of-00003.arrow (deflated 65%)
updating: finetune-dataset/state.json (deflated 54%)
updating: finetune-dataset/data-00000-of-00003.arrow (deflated 68%)


In [3]:
## Local

In [56]:
from datasets import load_from_disk

In [57]:
# NOTE(review): earlier cells save to "finetune-dataset" while this loads
# ".../finetune/instruct-dataset" — presumably the same data after download and
# rename; confirm.
# NOTE(review): absolute, machine-specific path; a configurable data directory
# would make this cell runnable elsewhere.
ds = load_from_disk(
    "/home/pranav-pc/projects/OpenTransformer/multiformer/data/finetune/instruct-dataset"
)

In [58]:
from bs4 import BeautifulSoup

# Task sentences for sentiment-analysis rows; one is drawn at random per row.
sentiment_analysis_prompts = [
    "Provide your sentiment analysis for the following text.",
    "Analyze the sentiment of the provided passage.",
    "Offer your assessment of the sentiment expressed in the following text.",
    "Share your thoughts on the sentiment conveyed in the provided passage.",
    "Evaluate the sentiment in the following text and provide your analysis.",
    "Assess the sentiment of the provided content and offer your perspective.",
    "Interpret the sentiment of the following passage and provide your analysis.",
]


def prompt_sentiment_analysis(row):
    """Render a sentiment-analysis prompt for one row.

    Args:
        row: Mapping with ``"text"`` (placed under the instruction header) and
            ``"output"`` (placed under the response header); both are coerced
            with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence, a blank line, then the instruction and response sections.
    """
    return {
        "prompt": (
            random.choice(sentiment_analysis_prompts)
            # The blank line keeps the task sentence from running straight into the
            # "### Instruction:" header.
            + f"\n\n### Instruction:\n{str(row['text'])}\n\n### Response:\n{str(row['output'])}"
        )
    }


def _remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

In [59]:
ds_imdb = load_dataset("stanfordnlp/imdb")
# Replace each row's "text" with its tag-free version; runs on 30 worker processes.
ds_imdb = ds_imdb.map(
    lambda example: {"text": _remove_html_tags(example["text"])},
    num_proc=30,
)
# NOTE(review): label_map is not referenced by any visible cell (the label-to-text
# conversion is written inline in a later cell) — confirm whether it can be dropped.
label_map = {0: "neg", 1: "pos"}

In [60]:
# Drop the splits that are not used; only ds_imdb["train"] is concatenated later.
del ds_imdb["unsupervised"]
del ds_imdb["test"]

In [61]:
# Turn the integer label into the text answer used as the response:
# a falsy label (0) becomes "Negative", anything else becomes "Positive".
ds_imdb = ds_imdb.map(
    lambda example: {"output": "Negative" if not example["label"] else "Positive"}, num_proc=30
)
# Build the "prompt" column from the cleaned text and the textual label.
ds_imdb = ds_imdb.map(prompt_sentiment_analysis)

# Only "prompt" and "source" remain, matching the columns of the other sources.
ds_imdb = ds_imdb.remove_columns(["text", "label", "output"])
ds_imdb = ds_imdb.map(lambda example: {"source": "imdb"})

In [62]:
## TinyStories Instruct
# NOTE(review): cache_dir is an absolute, machine-specific path.
ds_stories = load_dataset(
    "skeskinen/TinyStories-Instruct-hf",
    cache_dir="/home/pranav-pc/projects/OpenTransformer/multiformer/data/downloads",
)

In [63]:
def preprocess_text(corpus):
    """Trim a batch of raw texts down to the part after a section marker.

    For each text, the first marker found in the order ``"Random sentence:"``,
    ``"Summary:"`` is used. The segment after the first occurrence of that
    marker (up to its next occurrence, if any) is split into lines; lines that
    start with ``"Features"`` or ``"Words"`` are removed; the remaining lines
    are joined with single spaces and stripped.

    Texts that contain neither marker are skipped, so the returned list can be
    shorter than ``corpus``.

    Args:
        corpus: Iterable of strings.

    Returns:
        List of cleaned strings, in input order, one per text that had a marker.
    """
    # The colon is part of the marker, so the membership test and the split below
    # always agree. Testing for "Random sentence" but splitting on
    # "Random sentence:" raised IndexError when the colon was absent.
    markers = ("Random sentence:", "Summary:")
    dropped_line_prefixes = ("Features", "Words")

    cleaned_texts = []
    for text in corpus:
        marker = next((candidate for candidate in markers if candidate in text), None)
        if marker is None:
            # No usable section in this text; leave it out of the result.
            continue
        segment = text.split(marker)[1]
        kept_lines = [
            line for line in segment.split("\n") if not line.startswith(dropped_line_prefixes)
        ]
        cleaned_texts.append(" ".join(kept_lines).strip())
    return cleaned_texts

In [64]:
# Batched map: preprocess_text receives the batch's list of texts and may return fewer
# items than it was given (texts without a marker are skipped), so the row count can
# shrink — compare the progress-bar totals with the row counts displayed afterwards.
ds_stories = ds_stories.map(
    lambda example: {"text": preprocess_text(example["text"])}, batched=True, num_proc=30
)

Map (num_proc=30):   0%|          | 0/2476533 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/25028 [00:00<?, ? examples/s]

In [65]:
# Bare expression: shows the splits and row counts after preprocessing.
ds_stories

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2476532
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 25027
    })
})

In [66]:
# Task sentences for story-generation rows; one is drawn at random per row.
story_generation_prompts = [
    "Create a story inspired by the following text.",
    "Craft a narrative based on the provided content.",
    "Develop a tale using the following text as inspiration.",
    "Compose a story using the provided passage as a starting point.",
    "Imagine a plot that stems from the following text.",
    "Weave together a tale inspired by the provided content.",
    "Generate a storyline based on the text provided below.",
    "Let your imagination run wild and create a story inspired by the following prompt.",
    "Craft a piece of creative writing based on the provided inspiration.",
    "Use the following prompt as inspiration to create an original story.",
    "Write a short story or poem inspired by the following prompt.",
    "Engage your creativity and compose a piece of writing based on the provided stimulus.",
    "Create an original narrative inspired by the following prompt.",
    "Let your creativity flow and write a story using the following prompt as inspiration.",
]


def prompt_story_generation(text):
    """Render a story-generation prompt from one preprocessed text.

    The text is placed under an ``### Instruction:`` header. Every ``"Story:"``
    occurrence is then replaced by a ``### Response:`` header, and any run of
    three newlines is collapsed to one.

    Args:
        text: The text to convert; coerced with ``str``.

    Returns:
        A dict with the single key ``"prompt"``: a randomly chosen task
        sentence, a blank line, then the converted text.
    """
    # The replacements apply to the instruction/response body only, never to the
    # task sentence or to the separator added below.
    body = (
        f"### Instruction:\n{str(text)}"
        .replace("Story:", "\n\n### Response:")
        .replace("\n\n\n", "\n")
    )
    # The blank line keeps the task sentence from running straight into the
    # "### Instruction:" header.
    return {"prompt": random.choice(story_generation_prompts) + "\n\n" + body}

In [67]:
# prompt_story_generation takes the text itself rather than the row, hence the lambda.
ds_stories = ds_stories.map(lambda example: prompt_story_generation(example["text"]), num_proc=30)

Map (num_proc=30):   0%|          | 0/2476532 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/25027 [00:00<?, ? examples/s]

In [68]:
# Drop the raw text now that "prompt" exists, then tag each row's origin.
ds_stories = ds_stories.remove_columns(["text"]).map(
    lambda example: {"source": "tinny-stories"}
)

Map:   0%|          | 0/2476532 [00:00<?, ? examples/s]

Map:   0%|          | 0/25027 [00:00<?, ? examples/s]

In [71]:
# Append the IMDB train split and both TinyStories splits to the previously loaded dataset.
ds = concatenate_datasets([ds, ds_imdb["train"], ds_stories["train"], ds_stories["validation"]])

In [77]:
# A fixed seed makes the shuffle and the 80/20 train/test split repeatable across
# kernel restarts; without it every run produces a different split.
SPLIT_SEED = 42
ds = ds.shuffle(seed=SPLIT_SEED)
ds = ds.train_test_split(0.2, seed=SPLIT_SEED)

In [78]:
# Persist both splits of the final dataset.
# NOTE(review): absolute, machine-specific output path.
ds.save_to_disk(
    "/home/pranav-pc/projects/OpenTransformer/multiformer/data/finetune/instruct-dataset-v3"
)

Saving the dataset (0/6 shards):   0%|          | 0/4311905 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1077977 [00:00<?, ? examples/s]

In [81]:
# Bare expression: shows the final splits, columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['prompt', 'source'],
        num_rows: 4311905
    })
    test: Dataset({
        features: ['prompt', 'source'],
        num_rows: 1077977
    })
})