In [10]:
from glob import glob
import json
import mwparserfromhell
from tqdm import trange
from transformers import AutoModel, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.tokenize import sent_tokenize
import re

# %pip install -qU langchain-text-splitters

from langchain_text_splitters import RecursiveCharacterTextSplitter

import sys
sys.path.append("..")
from src.dedup import deduplicated_contents, get_duplicates, get_minhash_lsh

In [2]:
def get_num_tokens(text):
    """Estimate the token count of ``text`` as 1.2 times its whitespace-separated word count."""
    words = text.split()
    return len(words) * 1.2

In [3]:
# Collect the records of every JSON dump under ../data/wikimedia into one list.
# Each file is expected to hold a JSON list of page records (TODO confirm for new dumps).
data = []
for filename in glob("../data/wikimedia/*/*.json"):
    # The context manager closes each file right away instead of leaking the handle.
    with open(filename) as f:
        lines = json.load(f)
    data.extend(lines)
# Write to file
# with open('data.json', 'w') as f:
#     json.dump(data, f, indent=4)
# data = json.load(open("data.json"))

In [4]:
# Give every record its list position as "id" and, when raw wikitext is present,
# a markup-free "content" rendering of it.
for i in trange(len(data)):
    record = data[i]
    record["id"] = i
    if "wikitext" not in record:
        continue
    parsed_wikitext = mwparserfromhell.parse(record["wikitext"])
    record["content"] = parsed_wikitext.strip_code().strip()

100%|██████████| 13507/13507 [00:26<00:00, 507.53it/s]


In [5]:
def preview_row(row):
    """Print a record's title, then the first 5000 characters of its content and wikitext."""
    preview_chars = 5000
    print(row["title"])
    for key in ("content", "wikitext"):
        print(row[key][:preview_chars])
    print()

In [2]:
# Write to file
# with open('data.json', 'w') as f:
#     json.dump(data, f, indent=4)
# data = json.load(open("data.json"))

In [149]:
# Section headings whose content is skipped by the extraction functions.
# Headings are compared with `in`, so a match must be exact and case-sensitive.
exclude_sections = [
    "References",
    "See Also",
    "External Links",
    "Credits",
    "Gameplay",
    "Video Games",
    "Naruto the Movie",
    "Road to Ninja",
    "In Other Media",
    "Movies",
]


def get_output_from_section(
    page,
    return_entities=False,
    return_images=False,
    return_infobox=False,
    return_tree=False,
):
    """Extract plain text and metadata from a parsed wiki page, one level-2 section at a time.

    The first section returned by ``get_sections`` is treated as the lead and
    reported as the "Summary". Sections whose heading is listed in
    ``exclude_sections`` are skipped. Only the top-level nodes of each section
    are inspected.

    Args:
        page: Parsed page (the object returned by ``mwparserfromhell.parse``).
        return_entities: Add every collected wikilink node under "entities".
        return_images: Add wikilinks whose title contains "File:" under "images".
        return_infobox: Parse infobox templates and add them under "infobox".
        return_tree: Add the per-section text under "content_headings".

    Returns:
        dict with "text" (all kept text, runs of newlines collapsed),
        "heading" (list of ``{"level", "text"}`` dicts) and "summary", plus
        the optional keys requested through the flags.
    """
    page_content = ""

    entities = []
    images = []
    headings = []
    infobox = {}
    summary = ""
    content_headings = []

    sections = page.get_sections(levels=[2], include_headings=True, include_lead=True)
    for i, section in enumerate(sections):
        content = ""
        # Index 0 is treated as the heading-less lead; later sections start
        # with their own heading node.
        section_heading = (
            section.get(0).title.strip_code().strip() if i > 0 else "Summary"
        )
        if section_heading in exclude_sections:
            continue

        for node in section.nodes:
            if isinstance(node, mwparserfromhell.nodes.heading.Heading):
                headings.append({"level": node.level, "text": node.title.strip()})

            elif isinstance(node, mwparserfromhell.nodes.template.Template):
                is_infobox = "infobox" in node.name or "Infobox" in node.name
                # Parenthesised on purpose: `a and b or c` used to parse infoboxes
                # even when return_infobox was False.
                if return_infobox and is_infobox:
                    box_name = (
                        node.name.strip_code().lower().replace("infobox", "").strip()
                    )
                    tmp_box = {}
                    for param in node.params:
                        # name/value also work for positional parameters, which
                        # have no "=" to split on.
                        tmp_box[str(param.name).strip()] = str(param.value).strip()

                    infobox[box_name] = tmp_box

                if "translation" in node.name or "Translation" in node.name:
                    content += " ".join([n.value.strip_code() for n in node.params])
            elif isinstance(node, mwparserfromhell.nodes.text.Text):
                content += node.value
            elif isinstance(node, mwparserfromhell.nodes.tag.Tag):
                if node.tag == "gallery":
                    continue
                content += " " + node.contents.strip_code()
            elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink):
                entities.append(node)
                title = node.title.strip_code()
                if "File:" in title:
                    images.append(node)
                    continue
                # The link target is used as text; any display text after "|" is dropped.
                content += title

        if i == 0:
            summary = content
        else:
            # NOTE(review): this is a list of single-key dicts, while
            # get_output_from_lowest_section returns one dict; confirm which
            # shape downstream consumers expect.
            content_headings.append({section_heading: content})

        page_content += content

    output = {
        "text": re.sub(r"\n+", "\n", page_content),
        "heading": headings,
        "summary": summary,
    }
    if return_entities:
        output["entities"] = entities

    if return_images:
        output["images"] = images

    if return_infobox:
        output["infobox"] = infobox

    if return_tree:
        output["content_headings"] = content_headings

    return output

In [218]:
import re
from collections import defaultdict


class Node:
    """One heading in a heading tree; ``childrens`` maps a child title to its Node."""

    def __init__(self, name="", level=1):
        self.childrens = {}
        self.name = name
        self.level = level

    def __repr__(self):
        return f"Node({self.name}, level: {self.level} {self.childrens})"

    def find_leaf_nodes(self):
        """Return every node without children below (or equal to) this node, in depth-first order."""
        leaves = []
        pending = [self]
        while pending:
            current = pending.pop()
            if current.childrens:
                # Push in reverse so children are visited in insertion order.
                pending.extend(reversed(list(current.childrens.values())))
            else:
                leaves.append(current)
        return leaves


def parse_headings_to_nodes(headings):
    """Build a heading tree from ``(title, level)`` pairs and return its root Node.

    A heading becomes a child of the closest preceding heading with a strictly
    smaller level. Repeated titles under the same parent share one node.
    """
    root = Node()
    # Path of currently open headings as (node, level); the root counts as level 0.
    open_path = [(root, 0)]

    for title, level in headings:
        # Close every open heading that is not a strict ancestor of this one.
        while open_path and open_path[-1][1] >= level:
            open_path.pop()

        parent = open_path[-1][0]
        child = parent.childrens.get(title)
        if child is None:
            child = Node(name=title, level=level)
            parent.childrens[title] = child

        open_path.append((child, level))

    return root


def get_output_from_lowest_section(
    page: mwparserfromhell.wikicode.Wikicode,
    return_entities=False,
    return_images=False,
    return_infobox=False,
    return_tree=False,
):
    """Extract plain text and metadata from a parsed wiki page, one leaf heading at a time.

    Headings of all levels are arranged into a tree and only the leaves
    (headings without sub-headings) are processed, followed by the lead, which
    is reported as "Summary". Text sitting directly under a heading that has
    sub-headings is therefore not collected. Leaves named in
    ``exclude_sections`` are skipped.

    Args:
        page: Parsed page (the object returned by ``mwparserfromhell.parse``).
        return_entities: Add every collected wikilink node under "entities".
        return_images: Add wikilinks whose title contains "File:" under "images".
        return_infobox: Parse infobox templates and add them under "infobox".
        return_tree: Add the ``{leaf heading: text}`` dict under "content_headings".

    Returns:
        dict with "text" (all kept text, runs of newlines collapsed),
        "heading" (list of ``(title, level)`` tuples) and "summary" (the lead
        text), plus the optional keys requested through the flags.
    """
    page_content = ""

    entities = []
    images = []
    infobox = {}
    summary = ""
    content_headings = {}

    sections = page.get_sections(levels=[], include_headings=True, include_lead=True)
    # The first returned section is treated as the heading-less lead.
    lead_section = sections[0] if sections else None
    headings = [
        (row.get(0).title.strip_code().strip(), row.get(0).level)
        for row in sections[1:]
    ]
    parsed_tree = parse_headings_to_nodes(headings)
    # Synthetic leaf standing for the lead; kept last so the lead text still
    # follows the section text in "text".
    summary_node = Node(name="Summary", level=1)
    # On a page without headings the root is its own only leaf; dropping it
    # keeps the lead from being processed twice.
    leaves_nodes = [
        leaf for leaf in parsed_tree.find_leaf_nodes() if leaf is not parsed_tree
    ] + [summary_node]

    for leave_node in leaves_nodes:
        content = ""
        if leave_node.name in exclude_sections:
            continue

        if leave_node is summary_node:
            section = lead_section
        else:
            wanted = leave_node.name
            # Exact comparison on the stripped title. The previous unescaped
            # "<name>.*" regex could match a different heading sharing the
            # prefix, or fail on regex metacharacters in the name.
            matched = page.get_sections(
                levels=[leave_node.level],
                matches=lambda heading_title, wanted=wanted: (
                    heading_title.strip_code().strip() == wanted
                ),
                include_headings=True,
                include_lead=False,
            )
            # NOTE(review): when several headings share name and level, the
            # first one in the page is used for each of them.
            section = matched[0] if matched else lead_section

        if section is None:
            continue

        for node in section.nodes:
            if isinstance(node, mwparserfromhell.nodes.heading.Heading):
                # Heading text is carried by the leaf name, not by the content.
                continue
            elif isinstance(node, mwparserfromhell.nodes.template.Template):
                is_infobox = "infobox" in node.name or "Infobox" in node.name
                # Parenthesised on purpose: `a and b or c` used to parse infoboxes
                # even when return_infobox was False.
                if return_infobox and is_infobox:
                    box_name = (
                        node.name.strip_code().lower().replace("infobox", "").strip()
                    )
                    tmp_box = {}
                    for param in node.params:
                        # name/value also work for positional parameters, which
                        # have no "=" to split on.
                        tmp_box[str(param.name).strip()] = str(param.value).strip()

                    infobox[box_name] = tmp_box

                if "translation" in node.name or "Translation" in node.name:
                    content += " ".join([n.value.strip_code() for n in node.params])
            elif isinstance(node, mwparserfromhell.nodes.text.Text):
                content += node.value
            elif isinstance(node, mwparserfromhell.nodes.tag.Tag):
                if node.tag == "gallery":
                    continue
                content += " " + node.contents.strip_code()
            elif isinstance(node, mwparserfromhell.nodes.wikilink.Wikilink):
                entities.append(node)
                title = node.title.strip_code()
                if "File:" in title:
                    images.append(node)
                    continue
                # The link target is used as text; any display text after "|" is dropped.
                content += title

        if leave_node is summary_node:
            # Previously "summary" was declared and returned but never filled in.
            summary = content

        if content:
            content_headings[leave_node.name] = content

        page_content += content

    output = {
        "text": re.sub(r"\n+", "\n", page_content),
        "heading": headings,
        "summary": summary,
    }
    if return_entities:
        output["entities"] = entities

    if return_images:
        output["images"] = images

    if return_infobox:
        output["infobox"] = infobox

    if return_tree:
        output["content_headings"] = content_headings

    return output

In [219]:
# Try the leaf-section extractor on a single page; 8955 is a hard-coded index into `data`.
wikipage = mwparserfromhell.parse(data[8955]["wikitext"])
parsed_data = get_output_from_lowest_section(
    wikipage,
    return_images=True,
    return_infobox=True,
    return_entities=True,
    return_tree=True,
)

In [221]:
# Run the level-2 section extractor over every record and keep the page title
# next to the extracted fields.
# NOTE(review): the chunking helpers call `.items()` on "content_headings",
# which matches get_output_from_lowest_section's dict rather than the list
# produced here; confirm which extractor is intended.
extracted_data = []
for i in trange(len(data)):
    page_record = data[i]
    wikipage = mwparserfromhell.parse(page_record["wikitext"])
    parsed_data = get_output_from_section(
        wikipage,
        return_entities=True,
        return_images=True,
        return_infobox=True,
        return_tree=True,
    )
    parsed_data["title"] = page_record["title"]
    extracted_data.append(parsed_data)

100%|██████████| 13507/13507 [00:51<00:00, 262.54it/s]


In [229]:
import pickle

# Cache the extraction results on disk; the context manager guarantees the
# file is flushed and closed.
with open("wikimedia_extracted_data.pkl", "wb") as f:
    pickle.dump(extracted_data, f)

In [4]:
import pickle

# Reload the cached extraction results.
# NOTE: pickle.load can execute arbitrary code; only load a file produced by this notebook.
with open("wikimedia_extracted_data.pkl", "rb") as f:
    extracted_data = pickle.load(f)

### Insert into Elasticsearch

In [29]:
# from elasticsearch import Elasticsearch
# from elasticsearch.helpers import bulk

# es = Elasticsearch()
# es.indices.create(index="naruto_wiki", ignore=400)
# from hashlib import md5
# # doc = {
# #     "id": title + heading,
# #     "title": title,
# #     "heading": heading,
# #     "content": content,
# #     "entities": entities,
# # }

# documents = []
# for i in range(len(extract_data)):
#     entites_names = [entity.title.strip_code().lower() for entity in extract_data[i]["entities"]]
#     for heading, content in extract_data[i]["content_headings"].items():
#         id_ = md5((extract_data[i]["title"] + heading).encode()).hexdigest()
#         doc = {
            
#             "id": id_,
#             "title": extract_data[i]["title"].lower(),
#             "heading": heading.lower(),
#             "content": content.lower(),
#             "entities": entites_names,
#         }
#         documents.append(doc)

# batch_size = 1000
# for i in range(0, len(documents), batch_size):
#     batch = documents[i : i + batch_size]
#     actions = [
#         {"_index": "naruto_wiki", "_id": doc["id"], "_source": doc} for doc in batch
#     ]
#     bulk(es, actions)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'naruto_wiki'}

### Transform

In [5]:
import random

# Spot-check one randomly chosen extracted page.
page_count = len(extracted_data)
idx = random.choice(range(page_count))
print(extracted_data[idx])

{'text': "\nAnko briefs the candidates on their objectives for the second portion of the Chūnin Exams. The test is structured such that at least half the teams will be eliminated. Before entering the Forest of Death, all teams will be given either a heaven or an earth scroll. Once inside the forest, the genin must take the other scroll from another team. Only those that make it to the tower in the centre of the forest with both scrolls will advance to the next stage of the exams. Anko hands out the scrolls and opens the forest gates. Soon after, Team 8 Byakugan for a scroll and manages to take a Amegakure team's scroll after trapping them. Elsewhere, Team Kakashi is attacked by a disguised Oboro.\n", 'heading': [('Synopsis', 2), ('Credits', 2)], 'summary': '', 'entities': ['[[Anko]]', '[[Chūnin Exams]]', '[[Forest of Death]]', '[[scroll]]', '[[genin]]', '[[Team 8]]', '[[Byakugan|searches]]', '[[Amegakure]]', '[[Team Kakashi|Team 7]]', '[[Oboro|Ame ninja]]'], 'images': [], 'infobox': {'

In [5]:
# Filter out empty text, boruto from infobox
def filter_boruto_tag(infobox: dict):
    """Return True if any infobox field whose name contains "boruto" has a value containing "yes".

    Both checks are case-insensitive. ``infobox`` maps a box name to its
    ``{field: value}`` dict.
    """
    return any(
        "boruto" in field.lower() and "yes" in value.lower()
        for fields in infobox.values()
        for field, value in fields.items()
    )


def filter_naruto_videogame(infobox: dict):
    """Return True if any infobox name contains "videogame" (case-insensitive)."""
    return any("videogame" in box_name.lower() for box_name in infobox)

In [7]:
# Print descriptive stats (min, max, mean, std, percentiles) of the estimated
# token count per extracted page.
doc_tokens = np.array([get_num_tokens(row["text"]) for row in extracted_data])
for label, stat in (
    ("Min: ", np.min),
    ("Max: ", np.max),
    ("Mean: ", np.mean),
    ("Std: ", np.std),
):
    print(label, stat(doc_tokens))
for q in (25, 50, 75, 90, 95, 97, 99):
    print(f"{q}th percentile: ", np.percentile(doc_tokens, q))

Min:  0.0
Max:  37966.799999999996
Mean:  1177.5674539127858
Std:  3864.1207522495592
25th percentile:  111.6
50th percentile:  232.79999999999998
75th percentile:  631.8
90th percentile:  1987.1999999999998
95th percentile:  4640.4
97th percentile:  8665.199999999999
99th percentile:  24651.6


In [6]:
# Keep pages that contain text, are not flagged as Boruto content in their
# infobox and are estimated at 50 tokens or more. Index 3778 is dropped by hand.
extracted_data_filtered = [
    row
    for idx, row in enumerate(extracted_data)
    if idx != 3778
    and row["text"] != ""
    and row["text"].split()
    and not filter_boruto_tag(row["infobox"])
    and get_num_tokens(row["text"]) >= 50
]

In [7]:
# Number of pages kept by the filter vs. number of pages extracted.
len(extracted_data_filtered), len(extracted_data)

(12032, 13507)

In [8]:
from collections import Counter

# Count how many infoboxes carry each field name, across all extracted pages.
counter = Counter()
for row in extracted_data:
    for fields in row["infobox"].values():
        counter.update(fields.keys())

# Infobox fields that are kept when an "Info Box" section is rendered for a page.
valid_tags = [
    "kanji",
    "romaji",
    "users",
    "jutsu classification",
    "jutsu media",
    "jutsu class type",
    "jutsu range",
    "parent jutsu",
    "debut manga",
    "jutsu type",
    "jutsu rank",
    "hand signs",
    "teams",
    "tool classification",
    "manga debut",
]

In [55]:
# chunking with heading
import random

# Instruction phrasings; one is drawn at random for every generated prompt record.
instructions = [
    "Please provide a response to the following topic",
    "Write a response to the following topic",
    "Describe about the following topic",
    "Write the paragraph about the following topic",
]
def create_maximum_chunks(row, max_length: int, overlap_chunk_size: int = 400):
    """Pack a page's sections into chunks of roughly ``max_length`` estimated tokens.

    If the whole page (title, summary, text and rendered infobox) is estimated
    below ``max_length`` it becomes a single chunk. Otherwise sections are
    packed greedily, each chunk starting with a heading line. Every chunk is
    then cut into windows of ``max_length - overlap_chunk_size`` whitespace
    tokens, each prefixed with the page title. Despite the parameter name,
    consecutive windows do not share tokens; ``overlap_chunk_size`` only
    shrinks the window.

    Args:
        row: Extracted page with "title", "summary", "text", "infobox" and
            "content_headings" (a ``{heading: text}`` dict or a list of
            single-key dicts). ``row`` is not modified.
        max_length: Target chunk size.
        overlap_chunk_size: Amount subtracted from ``max_length`` to get the window size.

    Returns:
        List of chunk strings.

    Raises:
        ValueError: If ``overlap_chunk_size`` is not smaller than ``max_length``.
    """
    window = max_length - overlap_chunk_size
    if window <= 0:
        raise ValueError("overlap_chunk_size must be smaller than max_length")

    def get_heading():
        return random.choice([row["title"] + "." + row["summary"], row["title"]])

    first_content = ", ".join([row["title"], row["summary"], row["text"]])

    # Work on a copy so the caller's row is not mutated, and accept both
    # shapes produced by the extractors.
    raw_sections = row["content_headings"]
    if isinstance(raw_sections, dict):
        sections = dict(raw_sections)
    else:
        sections = {}
        for entry in raw_sections:
            sections.update(entry)

    if row["infobox"]:
        for fields in row["infobox"].values():
            ib_content = ";".join(
                [
                    " {}: {}".format(tag, value)
                    for tag, value in fields.items()
                    if tag in valid_tags
                ]
            )
            if ib_content:
                sections["Info Box"] = ib_content
                first_content += "\nInfo Box: " + ib_content

    num_tokens = get_num_tokens(first_content)
    heading = get_heading()
    chunks = []

    if num_tokens >= max_length:
        current_chunk = [heading]
        current_sum_tokens = get_num_tokens(current_chunk[-1])
        for current_section_content in sections.values():
            current_section_tokens = get_num_tokens(current_section_content)

            # Start a new chunk when this section would overflow the current
            # one, but never flush a chunk that holds only its heading.
            if (
                len(current_chunk) > 1
                and current_sum_tokens + current_section_tokens > max_length
            ):
                chunks.append("\n".join(current_chunk))
                current_chunk = [get_heading()]
                current_sum_tokens = get_num_tokens(current_chunk[-1])

            current_chunk.append(current_section_content)
            current_sum_tokens += current_section_tokens

        # The separator is irrelevant: chunks are re-split on whitespace below.
        chunks.append("\n".join(current_chunk))
    else:
        chunks.append(first_content)

    new_chunks = []
    for chunk in chunks:
        tokens = chunk.split()
        for start in range(0, len(tokens), window):
            new_chunks.append(
                row["title"] + ", " + " ".join(tokens[start : start + window])
            )

    return new_chunks


# Build prompt records from whole-page chunks; chunks estimated below 5 tokens are dropped.
new_chunk_data = []
max_length = 4096
for row in extracted_data_filtered:
    for chunk in create_maximum_chunks(row, max_length, overlap_chunk_size=700):
        if get_num_tokens(chunk) >= 5:
            new_chunk_data.append({"input": chunk, "response": ""})

In [14]:
# chunking with heading
import random

# Templates combining a section heading (n1) and a page title (n2) into a topic line.
prefix_templates = [
    "{n1} of {n2}",
    "{n2}'s {n1}",
]

# Instruction phrasings; one is drawn at random for every generated prompt record.
instructions = [
    "Please provide a response to the following topic",
    "Write a response to the following topic",
    "Describe about the following topic",
    "Write the paragraph about the following topic",
]


def create_section_chunks(row, max_length: int, overlap_chunk_size: int = 400):
    """Split each section of a page into ``(topic, text)`` chunks.

    Sections come from ``row["content_headings"]``, plus an "Info Box" section
    rendered from the infobox fields listed in ``valid_tags`` and a "Summary"
    section when ``row["summary"]`` is non-empty. Every section is cut into
    windows of ``max_length - overlap_chunk_size`` whitespace tokens. Despite
    the parameter name, consecutive windows do not share tokens;
    ``overlap_chunk_size`` only shrinks the window.

    Args:
        row: Extracted page with "title", "summary", "infobox" and
            "content_headings" (a ``{heading: text}`` dict or a list of
            single-key dicts). ``row`` is not modified.
        max_length: Target chunk size in whitespace tokens.
        overlap_chunk_size: Amount subtracted from ``max_length`` to get the window size.

    Returns:
        List of ``(topic, text)`` tuples; the topic is built from a randomly
        chosen entry of ``prefix_templates``.

    Raises:
        ValueError: If ``overlap_chunk_size`` is not smaller than ``max_length``.
    """
    window = max_length - overlap_chunk_size
    if window <= 0:
        raise ValueError("overlap_chunk_size must be smaller than max_length")

    # Work on a copy so the caller's row is not mutated, and accept both
    # shapes produced by the extractors.
    raw_sections = row["content_headings"]
    if isinstance(raw_sections, dict):
        sections = dict(raw_sections)
    else:
        sections = {}
        for entry in raw_sections:
            sections.update(entry)

    if row["infobox"]:
        for fields in row["infobox"].values():
            ib_content = ";".join(
                [
                    " {}: {}".format(tag, value)
                    for tag, value in fields.items()
                    if tag in valid_tags
                ]
            )
            if ib_content:
                sections["Info Box"] = ib_content

    if row["summary"]:
        sections["Summary"] = row["summary"]

    new_chunks = []
    for heading, section_text in sections.items():
        tokens = section_text.split()
        for start in range(0, len(tokens), window):
            content = " ".join(tokens[start : start + window])
            prefix = random.choice(prefix_templates).format(n1=heading, n2=row["title"])
            new_chunks.append((prefix, content))

    return new_chunks


# Build instruction records from per-section chunks; chunks estimated below
# 30 tokens are dropped.
new_chunk_data = []
max_length = 2048
for row in extracted_data_filtered:
    section_chunks = create_section_chunks(row, max_length, overlap_chunk_size=500)
    for topic, chunk in section_chunks:
        if get_num_tokens(chunk) < 30:
            continue
        record = {
            "input": topic,
            "instruction": random.sample(instructions, 1)[0],
            "response": re.sub("#+", " ", chunk),
        }
        new_chunk_data.append(record)

# Keep only the records whose index survives deduplication of the responses.
responses = [prompt["response"] for prompt in new_chunk_data]
deduplicated_prompt_indices = deduplicated_contents(
    responses,
    ngrams=10,
    threshold=0.7,
    num_perm=32,
    return_indices=True,
)
new_chunk_data = [new_chunk_data[i] for i in deduplicated_prompt_indices]

In [18]:
# Number of prompt records left after deduplication.
len(new_chunk_data)

13193

In [82]:
# Print descriptive stats (min, max, mean, std, percentiles) of the estimated
# token count per prompt record (input + instruction + response).
doc_tokens = np.array(
    [
        get_num_tokens(
            row["input"] + row.get("instruction", "") + row.get("response", "")
        )
        for row in new_chunk_data
    ]
)
for label, stat in (
    ("Min: ", np.min),
    ("Max: ", np.max),
    ("Mean: ", np.mean),
    ("Std: ", np.std),
):
    print(label, stat(doc_tokens))
for q in (25, 50, 75, 90, 95, 97, 99):
    print(f"{q}th percentile: ", np.percentile(doc_tokens, q))

# Number of records estimated at 90 tokens or more.
len(np.where(doc_tokens >= 90)[0])

Min:  42.0
Max:  2005.1999999999998
Mean:  210.6359736223755
Std:  266.681443411528
25th percentile:  68.39999999999999
50th percentile:  111.6
75th percentile:  230.39999999999998
90th percentile:  480.9600000000013
95th percentile:  720.4799999999996
97th percentile:  923.0879999999997
99th percentile:  1466.5919999999996


8002

In [83]:
# Print 10 records (drawn with replacement) among those estimated at 90 tokens or fewer.
short_record_indices = np.where(doc_tokens <= 90)[0]
indices = random.choices(short_record_indices, k=10)
for idx in indices:
    print(new_chunk_data[idx])

{'input': "Turbulent Four Seasons's Info Box", 'instruction': 'Write the paragraph about the following topic', 'response': 'kanji: 乱れ雪月花; romaji: Midare Setsugekka; parent jutsu: Fire Release: Great Dragon Fire Technique, Body Flicker Technique; jutsu classification: Ninjutsu, Shurikenjutsu, Cooperation Ninjutsu; jutsu type: Fire Release; jutsu class type: Offensive; jutsu range: Short, Mid; users: Itachi Uchiha~~with~Shisui Uchiha, Shisui Uchiha~~with~Itachi Uchiha; jutsu media: Game'}
{'input': 'Summary of Four Symbols Puppet Shinobi', 'instruction': 'Write the paragraph about the following topic', 'response': 'Akatsuki members are given a special human-shaped seal created by Nagato that takes a physical form known as the Four Symbols Puppet Shinobi, to fight or stall the enemy. The Puppet Shinobi goes to a faraway location to create numerous copies of itself. The only way to break the technique is to stop the Puppet Shinobi source.'}
{'input': "Log's Personality", 'instruction': 'Wr

In [76]:
# Number of records the stats above were computed over.
len(doc_tokens)

13193

In [86]:
# ignore games
# A record is dropped when its "input" contains any of these phrases (case-insensitive).
tags = [
    "Naruto Mobairu",
    "Naruto: Path of the Ninja",
    "Naruto RPG 2: Chidori vs. Rasengan",
    "side-scrolling action/adventure game in development by Namco Bandai Games and Inti Creates for the Nintendo 3DS platform",
    "Legends: Akatsuki Rising is an adventure game for the Wikipedia:PSP created by Namco Bandai Games America",
    "Legends: Akatsuki Rising is an adventure game",
    "Naruto Hurricane Chronicles: Dragon Sword Chronicles in Japan",
    "Naruto: Ultimate Ninja series",
    "Naruto the Movie: Blood Prison",
    "Naruto: Clash of Ninja (series)",
    "Naruto Shipp\u016bden the Movie: The Will of Fire",
    "Naruto: Konoha Ninp",
    "Ultimate Ninja Storm",
    "Naruto: Ninja Destiny",
    "3D Naruto game",
    "Naruto: Ninja Council",
    "Naruto, the Genie, and the Three Wishes",
    "Clash of Ninja Revolution III",
    "Clash of Ninja 2",
    # "Ultimate Ninja",
    "NARUTOP99",
    "Narutopedia",
    "Clash of Ninja Revolution 3",
    "Ninja Taisen! EX 3",
    "Ninja Taisen EX 2 is the sixth instalment",
    "Kizuna Drive the fourth instalment",
    "Playable Characters",
    "Naruto film",
    "the Movie: Bonds",
    "Naruto Shipp\u016bden the Movie:",
    "Naruto Shippuden the Movie",
    "Naruto Shippuuden Animation Credits",
    "Naruto Shipp\u016bden 3D",
    "adventure game",
    "instalment",
    "Naruto: Shinobi Masters",
    "Naruto: Shippuden the Movie",
    "anime adaptation",
    "Naruto Error",
    # These two were fused into one phrase by a missing comma.
    "Naruto Collectible Card Game",
    "Naruto x UT",
    "Naruto Shippuden: Ultimate Ninja Heroes 3",
    "Naruto Shippuden: Ultimate Ninja 5",
    "Naruto Shippuden: Ultimate Ninja Impact",
    "Naruto Shippuden Animation Credits",
    "Naruto Shippuden Ultimate Ninja Strom 2",
    "Naruto Shippuden Storm 2",
    "Naruto: Shipp\u016bden the Movie 3",
    "Naruto Official Manga App",
    "Nintendo Game Boy",
    "Sony PlayStation",
    "Naruto video games",
    "Arcade of Naruto video games",
    "Downloadable Content of Naruto x Boruto",
    "Naruto x UT's",
    "Naruto: Ultimate Ninja Online",
    "Naruto: Ultimate Ninja Heroes 2",
    "Naruto: The Official Fanbook",
    "Naruto: Shipp\u016bden the Movie",
    "Naruto: Shinobi Collection Shipp",
    "Naruto: Road to Ninja",
    "Naruto Shipp\u016bden the Movie 2",
    "Naruto Shipp\u016bden the Movie 3",
    "Naruto Shipp\u016bden: Clash of Ninja Project",
    "Naruto Shipp\u016bden: Bonds's",
    "Naruto Shipp\u016bden's Dubbed",
    "Naruto Shipp\u016bden 4: The Lost Tower",
    "Naruto Shippuuden: the movie",
    "Naruto Shippuden Movie",
    "Naruto RPG: Uketsugareshi",
    "Naruto Shippuden: Ultimate Ninja 4's Story Mode",
    "Naruto Shippuden: Dragon Sword Chronicles",
    "Naruto Shippuden ultimate ninja storm",
    "Naruto Shipp\u016bden 3: Inheritors of the Will of Fire",
    "Naruto Shipp\u016bden 2: Bonds",
    "Naruto Shipp\u016bden: Dairansen! Kage",
    "Naruto Shipp\u016bden: Gekitou Ninja Taisen",
    "Naruto Shipp\u016bden: The Movie",
    "Naruto The Movie",
    "Naruto Shipp\u016bden: Ultimate Ninja",
    "Naruto Shipp\u016bden the Movie",
    "Naruto Shipp\u016bden Animation Credits",
    "Naruto Shipp\u016bden 5: Blood Prison",
    "Naruto Shippuuden: Narutimetto Akuseru",
    "Naruto the Movie 3",
    "Naruto ninja destiny",
]
version = "v7"
min_tokens = 60 # 1/4 in total
# For each target context length, shrink the chunk window (by raising
# overlap_chunk_size in steps of 200) until no record is estimated above that
# context length, then filter, deduplicate and write one JSONL file.
for context_length in [512, 1024, 2048]:
    overlap_chunk_size = 200
    while True:
        if overlap_chunk_size >= context_length:
            # The window would be empty; stop instead of looping forever.
            raise RuntimeError(
                f"no chunk window fits a context length of {context_length}"
            )
        print("Process ", context_length, overlap_chunk_size)
        new_chunk_data = []
        for i in range(len(extracted_data_filtered)):
            chunks = create_section_chunks(
                extracted_data_filtered[i],
                context_length,
                overlap_chunk_size=overlap_chunk_size,
            )
            for topic, chunk in chunks:
                if get_num_tokens(chunk) < min_tokens:
                    continue
                new_chunk_data.append(
                    {
                        "input": topic,
                        "instruction": random.sample(instructions, 1)[0],
                        "response": re.sub("#+", " ", chunk),
                    }
                )
        doc_tokens = np.array(
            [
                get_num_tokens(
                    row["input"] + row.get("instruction", "") + row.get("response", "")
                )
                for row in new_chunk_data
            ]
        )
        # Compare against this iteration's context length, not the `max_length`
        # global left over from an earlier cell.
        if len(np.where(doc_tokens > context_length)[0]) == 0:
            break

        overlap_chunk_size += 200

    # Drop records whose topic mentions an ignored phrase or whose response
    # has fewer than min_tokens words.
    new_chunk_data = [
        row
        for row in new_chunk_data
        if not any(tag.lower() in row["input"].lower() for tag in tags)
        and len(row["response"].split()) >= min_tokens
    ]
    deduplicated_prompt_indices = deduplicated_contents(
        [prompt["response"] for prompt in new_chunk_data],
        ngrams=10,
        threshold=0.7,
        num_perm=32,
        return_indices=True,
    )
    new_chunk_data = [new_chunk_data[i] for i in deduplicated_prompt_indices]
    with open(f"prompt_wiki_with_chunk_{version}_{context_length}.jsonl", "w") as f:
        for prompt in new_chunk_data:
            f.write(json.dumps(prompt) + "\n")

Process  512 200
Process  1024 200
Process  2048 200
Process  2048 400


In [349]:
# Inspect the section chunks produced for one page (hard-coded index 4636).
# NOTE(review): the saved output of this cell is a list of strings, while
# create_section_chunks as defined in this notebook returns (topic, text)
# tuples; the output looks stale. Re-run to confirm.
create_section_chunks(extracted_data[4636], max_length=2047, overlap_chunk_size=600)

["Summary of Kakashi\nKakashi Hatake はたけカカシ Hatake Kakashi is a shinobi of Konohagakure's Hatake clan. Famed as Kakashi of the Sharingan 写輪眼のカカシ Sharingan no Kakashi and the Copy Ninja コピー忍者 Kopī Ninja, he is one of Konoha's most talented ninja, regularly looked to for advice and leadership despite his personal dislike of responsibility. To his students on Team Kakashi, Kakashi emphasises the importance of teamwork; he himself received this lesson, along with the Sharingan, from his childhood friend, Obito Uchiha. After the Fourth Shinobi World War, Kakashi becomes Konoha's Sixth Hokage 六代目火影 Rokudaime Hokage Sixth Fire Shadow.",
 'Kakashi\'s Background\nBecause his mother died when he was very young, Naruto chapter 449, page 7 Kakashi was raised during his early years by his father, Sakumo. Sakumo was famed throughout the shinobi world, having saved Konoha on at least one occasion; Kakashi in particular revered his father. During one of Sakumo\'s missions - after Kakashi was enrolled 