In [12]:
from glob import glob
import json
import mwparserfromhell
from tqdm import trange
from transformers import AutoModel, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from nltk.tokenize import sent_tokenize
import re
# %pip install -qU langchain-text-splitters

from langchain_text_splitters import RecursiveCharacterTextSplitter


In [27]:
# Load every JSON dump matching ../data/wikimedia/*/*.json and concatenate
# their records into one flat list.
# Assumes each file holds a JSON array of record dicts - TODO confirm.
data = []
# glob() returns paths in arbitrary filesystem order; sorting keeps the record
# order (and the positional ids derived from it later) stable between runs.
for filename in sorted(glob('../data/wikimedia/*/*.json')):
    # The context manager closes each handle promptly instead of leaking it,
    # and the explicit encoding avoids depending on the platform default.
    with open(filename, encoding='utf-8') as f:
        data.extend(json.load(f))
# Write to file
# with open('data.json', 'w') as f:
#     json.dump(data, f, indent=4)
# data = json.load(open("data.json"))

In [28]:
# Tag every record with its list position and, when raw wikitext is present,
# store a markup-free rendering of it under 'content'.
for i in trange(len(data)):
    record = data[i]
    record["id"] = i
    if 'wikitext' not in record:
        # Nothing to convert; 'content' is left as it was (possibly absent).
        continue
    parsed = mwparserfromhell.parse(record['wikitext'])
    record['content'] = parsed.strip_code().strip()

100%|██████████| 13507/13507 [00:26<00:00, 519.28it/s]


In [29]:
# Extract file image names: every wikilink whose title starts with "File:"
# contributes the part after that prefix to the record's 'images' list.
for i in trange(len(data)):
    data[i]['images'] = []
    # The cell that builds 'content' treats 'wikitext' as optional, so do the
    # same here: records without it keep an empty image list instead of
    # raising KeyError.
    wikitext = data[i].get('wikitext')
    if not wikitext:
        continue
    node = mwparserfromhell.parse(wikitext)
    for wikilink in node.filter_wikilinks():
        # Work on a plain str so the stored names are ordinary strings.
        title = str(wikilink.title)
        if title.startswith('File:'):
            data[i]["images"].append(title.split('File:', 1)[-1])

100%|██████████| 13507/13507 [00:27<00:00, 487.59it/s]


In [30]:
# Count number of images across all records
image_count = sum(len(record["images"]) for record in data)
print(image_count)
# Write image names to file, one per line, each prefixed with the wiki's
# Special:Redirect/file endpoint.
base_url = "https://naruto.fandom.com/wiki/Special:Redirect/file"
# Explicit UTF-8 so names with non-ASCII characters are written the same way
# on every platform instead of depending on the locale's default encoding.
# NOTE(review): names are written verbatim (not percent-encoded) - confirm the
# downloader that consumes this list copes with spaces and special characters.
with open('image_names.txt', 'w', encoding='utf-8') as f:
    for record in data:
        for image in record["images"]:
            f.write(base_url + "/" + image + "\n")

40476


In [31]:
# Write to file
# with open('data.json', 'w') as f:
#     json.dump(data, f, indent=4)
# data = json.load(open("data.json"))    

In [32]:
def remove_thumb_substring(text):
    """Remove every span that runs from "thumb" through the next "|".

    Each occurrence of the substring "thumb" is deleted together with
    everything up to and including the first "|" that follows it.

    If a "thumb" has no "|" anywhere after it, the text is returned as it
    stands at that point. Previously the whole remainder of the text was
    deleted in that case, so a plain word such as "his thumb" truncated
    everything that followed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    search_from = 0
    while True:
        start_thumb = text.find("thumb", search_from)
        if start_thumb == -1:
            return text
        end_thumb = text.find("|", start_thumb)
        if end_thumb == -1:
            # No pipe after this "thumb" means none after any later one
            # either, so there is nothing left to remove.
            return text
        text = text[:start_thumb] + text[end_thumb + 1:]
        # Joining the two halves can form a new "thumb" that straddles the
        # seam; it can start at most 4 characters before the cut.
        search_from = max(start_thumb - 4, 0)

def remove_by_regex(text, template, subst = ""):
    """Replace every match of a regular expression in ``text``.

    Args:
        text: The string to process.
        template: Regular-expression pattern; compiled with ``re.MULTILINE``,
            so ``^`` and ``$`` match at every line boundary.
        subst: Replacement for each match (defaults to the empty string,
            i.e. matches are deleted).

    Returns:
        The string with all matches replaced.
    """
    # count/flags are passed by keyword: passing them positionally is
    # deprecated in recent Python versions and easy to mix up.
    return re.sub(template, subst, text, count=0, flags=re.MULTILINE)

# Derive 'cleaned_content' from each record's 'content'.
for record in data:
    cleaned = record["content"]

    # Drop everything from the last occurrence of "Reference" onwards.
    cut_at = cleaned.rfind("Reference")
    if cut_at != -1:
        cleaned = cleaned[:cut_at]

    cleaned = remove_thumb_substring(cleaned)
    # First pattern: a run of word characters followed by "|".
    # Second pattern: line breaks, which are deleted outright (no space is
    # inserted), so adjacent lines end up joined directly.
    for pattern in (r"(\w)+\|", r"\n\r|\n|\r"):
        cleaned = remove_by_regex(cleaned, pattern)

    record["cleaned_content"] = cleaned

In [33]:
# Preview the cleaned text of the first 10 records. Slicing (rather than
# indexing 0..9) keeps this working when fewer than 10 records are loaded.
for record in data[:10]:
    print("---")
    print(record["cleaned_content"])

---
The  is an Outer Path technique that allows a Rinnegan user to manipulate up to six bodies as though they are their own. Usage The user embeds one or more black receivers into a body, allowing them to channel their chakra into it from great distances. To make full use of the body, the user ideally transmits their chakra from the highest and closest point possible so that they can have the best possible range.Naruto chapter 428, pages 14-15 Once their chakra enters the body, they are given complete control of it, allowing them to decide all of its actions, its use of jutsu, and even speak through it. As a representation of this control, the controlled bodies' eyes take on the same appearance as the user's.Naruto chapter 544, pages 16-17 Inoichi Yamanaka compared this technique to the Yamanaka clan's Mind Body Switch Technique, albeit on a greater scale.A first-person perspective of the Rinnegan's shared vision.When multiple bodies are controlled together the user is able to see thro

In [34]:
# Instruction-tuning prompt layout with three positional `{}` placeholders,
# filled via str.format in this order: instruction, input, response.
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [35]:
import random

# Dedicated, seeded generator: every run pairs each record with the same
# instruction, so the generated dataset is reproducible. It also leaves the
# global `random` state untouched.
rng = random.Random(42)

# Candidate instructions; one is drawn at random for each record.
instructions = [
    "Describe about this entity",  
    "What is this entity?",
    "Provide additional information about the entity"
    ]

# One prompt per record: the page title is the input and the cleaned article
# text is the expected response.
prompts = [
    {
        "instruction": rng.choice(instructions),
        "input": record["title"],
        "response": record["cleaned_content"],
    }
    for record in data
]

In [37]:
# preview first 10 prompts (slicing tolerates fewer than 10 entries, where
# indexing 0..9 would raise IndexError)
for prompt in prompts[:10]:
    print(prompt_template.format(prompt["instruction"], prompt["input"], prompt["response"]))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
What is this entity?

### Input:
Six paths of pain

### Response:
The  is an Outer Path technique that allows a Rinnegan user to manipulate up to six bodies as though they are their own. Usage The user embeds one or more black receivers into a body, allowing them to channel their chakra into it from great distances. To make full use of the body, the user ideally transmits their chakra from the highest and closest point possible so that they can have the best possible range.Naruto chapter 428, pages 14-15 Once their chakra enters the body, they are given complete control of it, allowing them to decide all of its actions, its use of jutsu, and even speak through it. As a representation of this control, the controlled bodies' eyes take on the same appearance as the user's.Naruto chapter 544, pages 16-17 Inoichi Yaman

In [36]:
# Persist the prompts as JSON Lines: one JSON-encoded prompt per line.
with open("prompt_wiki.txt", "w") as out_file:
    out_file.writelines(json.dumps(prompt) + "\n" for prompt in prompts)

In [3]:
# Reload the prompt records from the JSON Lines file (one JSON object per
# line). The context manager closes the handle, which the previous
# open(...).readlines() one-liner never did.
# NOTE(review): this rebinds `data` to whatever records the file holds; cells
# that read data[i]['content'] or data[i]['id'] need records carrying those
# keys - confirm the intended execution order.
with open("prompt_wiki.txt") as f:
    data = [json.loads(line) for line in f]

In [32]:
# Split long responses into overlapping chunks. Lengths are measured in
# whitespace-separated words (see length_function), so both limits below are
# word counts, not characters.
max_length = 4096
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_length,
    chunk_overlap=chunk_overlap,
    length_function=lambda d: len(d.split()),
    # Separators are given as literal strings, in the order listed.
    is_separator_regex=False,
    separators=["\n\n", "\n", ". ", ".\n", "? ", "?\n", "! ", "!\n", "."],
)

# Every chunk becomes its own record, inheriting the instruction and input
# of the record it was cut from.
new_chunk_data = [
    {
        "instruction": record["instruction"],
        "input": record["input"],
        "response": document.page_content,
    }
    for record in data
    for document in text_splitter.create_documents([record["response"]])
]

In [38]:
# Persist the chunked prompts as JSON Lines: one JSON-encoded record per line.
with open("prompt_wiki_with_chunk.txt", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in new_chunk_data)

In [42]:
# Spot-check a single chunked record to eyeball its structure.
new_chunk_data[3]

{'instruction': 'Describe about this entity',
 'input': 'Sixth Hokage Candidate',
 'response': 'was an elder of Konohagakure. As the founder and leader of Root, Danzō gained notoriety as Naruto chapter 459, page 1 because of his frequent unsanctioned actions and his often-suspected (but rarely proven) undermining of specific Konoha personnel. Despite his decades of suspicious deeds, Danzō only ever acted in what he believed were the village\'s best interests. He was appointed the Kai no Sho page 116 after Pain\'s Assault, but died before he could be formally approved to the position. Background Born into the Shimura Clan to his shinobi father, Danzō went onto joining the Academy, where he was amongst the first few hundred of students to graduate. From his genin days, he was acquainted with Hiruzen Sarutobi,Itachi Shinden: Book of Bright Light page 139 whom he became rivals with in everything.Naruto chapter 481, page 8 Danzō once saw the First Hokage in battle, where he witnessed Hashir

In [37]:
# Number of records currently bound to `data`.
len(data)

13507

In [4]:
# Put the records into Elasticsearch
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
# es.indices.create(index='wikipedia', ignore=400)

# Send the documents through the bulk helper instead of issuing one index
# request per record. The generator is consumed lazily, so the trange
# progress bar advances as batches are sent. Document ids are the list
# positions, as before.
actions = (
    {"_index": "naruto_wiki", "_id": str(i), "_source": data[i]}
    for i in trange(len(data))
)
helpers.bulk(es, actions)


100%|██████████| 18739/18739 [01:11<00:00, 261.25it/s]


In [2]:
# Sentence-embedding model: mixedbread-ai/mxbai-embed-large-v1
import torch

model_name = "mixedbread-ai/mxbai-embed-large-v1"

# Prefer Apple's MPS backend (the original hard-coded choice) but fall back
# to CUDA or CPU so the cell also runs on machines without MPS.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(model_name, device=device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# ort_model = ORTModelForFeatureExtraction.from_pretrained(model_name, file_name="onnx/model_quantized.onnx")
# onnx_pipeline = pipeline("feature-extraction", model=ort_model, tokenizer=tokenizer)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/113k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [5]:
# Collect the plain-text 'content' of every record.
# NOTE(review): requires each record in `data` to carry a 'content' key; the
# records reloaded from prompt_wiki.txt are written with
# instruction/input/response keys only - confirm which `data` is expected here.
texts = [d['content'] for d in data]
# NOTE(review): batch_size is not used anywhere in the visible cells.
batch_size = 256
# embeddings = np.zeros((len(texts), ort_model.config))

In [37]:
# Window size, in tokens, for the slices taken below. This rebinds the
# `max_length` name that the text-splitter cell set to 4096.
max_length = 512

# Tokenize each record's 'content' and cut every tokenized row into
# consecutive windows of at most max_length tokens. Each entry of `chunks`
# is a tuple (record id, start offset of the window, dict of sliced features).
# Records with empty content are skipped.
# NOTE(review): model.tokenize may already truncate to the model's maximum
# sequence length, in which case each text yields a single window and the
# remainder is dropped - confirm against the sentence-transformers docs.
# NOTE(review): reads data[i]['content'] and data[i]['id']; confirm `data`
# holds records with those keys when this cell runs.
chunks = []
for i in trange(0, len(data)):
    text = data[i]['content']
    if not text:
        continue
    inputs = model.tokenize(text)
    length = len(inputs["input_ids"])
    for j in range(length):
        row_length = len(inputs["input_ids"][j])
        for chunk in range(0, row_length, max_length):
            window = slice(chunk, chunk + max_length)
            features = {
                key: inputs[key][j][window]
                for key in ("input_ids", "attention_mask", "token_type_ids")
            }
            chunks.append((data[i]["id"], chunk, features))

 37%|███▋      | 6960/18739 [30:08<20:28,  9.59it/s]    IOStream.flush timed out
 56%|█████▌    | 10480/18739 [1:01:32<02:39, 51.94it/s]  

: 

In [29]:
# Number of records currently bound to `data`.
len(data)

18739

In [35]:
# Inspect the tokenizer output for the first record's content.
# NOTE(review): the text is wrapped in a list here, while the chunking loop
# passes a bare string to model.tokenize - confirm which form is intended.
model.tokenize([data[0]["content"]])

{'input_ids': tensor([[  101,  1996,  2003,  2019,  6058,  4130,  6028,  2008,  4473,  1037,
          15544, 10087,  5289,  5310,  2000, 17708,  2039,  2000,  2416,  4230,
           2004,  2295,  2027,  2024,  2037,  2219,  1012,  8192,  1996,  5310,
           7861,  8270,  2015,  2028,  2030,  2062,  2304, 19278,  2046,  1037,
           2303,  1010,  4352,  2068,  2000,  3149,  2037, 15775, 22272,  2046,
           2009,  2013,  2307, 12103,  1012,  2000,  2191,  2440,  2224,  1997,
           1996,  2303,  1010,  1996,  5310, 28946, 19818,  2015,  2037, 15775,
          22272,  2013,  1996,  3284,  1998,  7541,  2391,  2825,  2061,  2008,
           2027,  2064,  2031,  1996,  2190,  2825,  2846,  1012,  6583, 22134,
           2080,  3127,  4413,  2620,  1010,  5530,  2403,  1011,  2321,  2320,
           2037, 15775, 22272,  8039,  1996,  2303,  1010,  2027,  2024,  2445,
           3143,  2491,  1997,  2009,  1010,  4352,  2068,  2000,  5630,  2035,
           1997,  2049,  45