In [1]:
from glob import glob
import json
import random
import mwparserfromhell
from tqdm import trange
from transformers import AutoModel, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
output = []
with open("../data/sportseeker/all_pages.json") as f:
    for line in f:
        output.append(json.loads(line.strip()))

In [5]:
# Two steps
# Step 1: Create a dictionary of tags
# Create entity class: jutsu, character, clan, team
# We extract tag from each item in output["tags"], create a dictionary of tags
# Step 2: Create annotation
# For each content in output, we annotate position of text that match with tags
# Example:
# text: Adamantine Sealing Chains is a form of Fuinjutsu, also known as a Sealing Jutsu. This technique belongs to the Uzumaki and is a Hiden Technique.
# tags: {"jutsu": [Adamantine Sealing Chains, Fuinjutsu, Sealing Jutsu], "clan": [Uzumaki]}
# results: {"Adamantine Sealing Chains": {start_word: 0, end_word: 3}}



In [3]:
def create_tag_dictionary(outputs):
  """
  This function creates a dictionary of tags from the output data.

  Args:
      output: A dictionary containing a "tags" key with a list of tags.

  Returns:
      A dictionary where keys are entity types ("jutsu", "character", "clan", "team")
      and values are lists of tags belonging to that type.
  """
  from collections import defaultdict
  tags_dict = defaultdict(list)
  
  for i in range(len(outputs)):
    if "tag" not in outputs[i]:
        continue
    tag = outputs[i]["tag"]
    tags_dict[tag].append(outputs[i]["title"])

  return tags_dict

In [4]:
def annotate_text(text, tags_dict):
  """
  This function annotates the positions of tags within a text.

  Args:
      text: The text to be annotated.
      tags_dict: The dictionary containing entity types and their tags.

  Returns:
      A dictionary where keys are tags and values are dictionaries with "start_word" and "end_word" keys indicating the position of the tag in the text.  
  """
  annotations = {}
  for entity_type, tags in tags_dict.items():
    for tag in tags:
      start_pos = text.find(tag)
      if start_pos != -1:
        end_pos = start_pos + len(tag)
        annotations[tag] = {"start_word": start_pos, "end_word": end_pos, "entity_type": entity_type}
  return annotations

In [5]:
# annotate_text(output[1]["text"], tags_dict)
tags_dict = create_tag_dictionary(output)
json.dump(tags_dict, open("tags_dict.json", "w"), indent=2)

In [6]:
print("sample: ", annotate_text(output[1]["text"], tags_dict))

sample:  {'Fire Style Jutsu': {'start_word': 755, 'end_word': 771, 'entity_type': 'jutsu'}, 'Itachi Uchiha': {'start_word': 1758, 'end_word': 1771, 'entity_type': 'characters'}, 'Jiraiya': {'start_word': 2966, 'end_word': 2973, 'entity_type': 'characters'}, 'Kabuto Yakushi': {'start_word': 5377, 'end_word': 5391, 'entity_type': 'characters'}, 'Madara Uchiha': {'start_word': 1497, 'end_word': 1510, 'entity_type': 'characters'}, 'Naruto Uzumaki': {'start_word': 4284, 'end_word': 4298, 'entity_type': 'characters'}, 'Ni': {'start_word': 1747, 'end_word': 1749, 'entity_type': 'characters'}, 'Sasuke Uchiha': {'start_word': 1773, 'end_word': 1786, 'entity_type': 'characters'}}


In [7]:
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [8]:
prompts = []
instructions = [
    "Describe about this {entity_type}", 
    "What is the {entity_type}?", 
    "Provide additional information about the {entity_type}",]
for i in range(len(output)):
    instruction = random.choice(instructions).format(entity_type=output[i]["tag"])
    prompt = {"instruction": instruction, "input": output[i]["title"], "response": output[i]["text"]}
    prompts.append(prompt)

In [16]:
ner_instruction = "Identify the entity type of the following text, return JSON response with the entity type and the position of the entity in the text."
ner_prompts = []
for i in range(len(output)):
    annotation = json.dumps(annotate_text(output[i]["text"], tags_dict))
    prompt = {"instruction": ner_instruction, "input": output[i]["text"], "response": annotation}
    ner_prompts.append(prompt)

In [17]:
for i in range(10):
    print(ner_prompts[i])
    print("----")

{'instruction': 'Identify the entity type of the following text, return JSON response with the entity type and the position of the entity in the text.', 'input': 'Adamantine Sealing Chains is a form of Fuinjutsu, also known as a Sealing Jutsu. This technique belongs to the Uzumaki and is a Hiden Technique..While casting this Jutsu, the opponent is chained down by the user using chains which appear out of their torso. The chain which leaves the opponent’s body is fully controlled by the user and not only that, these chains wrap the enemy and entangle them in those chains..Apart from binding the targets down, these chains also neutralise the opponent’s chakra. It can also form an incredibly strong barrier, which almost seems impregnable. Even Hiruzen Sarutobi could not pierce it..Adamantine Sealing Chains is a Hiden Jutsu, which means that it is a Jutsu that is exclusive to a clan that has been passed down orally through tradition. It is kept secret and is not allowed to be taught to peo

In [18]:
with open("prompt_sportseeker.txt", "w") as f:
    for prompt in prompts:
        f.write(json.dumps(prompt) + "\n")
    for prompt in ner_prompts:
        f.write(json.dumps(prompt) + "\n")

In [19]:
from datasets import Dataset

def formatting_prompts_func(example):
    instruction = example["instruction"]
    input       = example["input"]
    output      = example["response"]
    text = prompt_template.format(instruction, input, output)
    return { "text" : text}

data = []
with open("prompt_sportseeker.txt") as f:
    for line in f:
        data.append(json.loads(line))
        
ds = Dataset.from_list(data)
dataset = ds.map(formatting_prompts_func, batched = False,)

Map: 100%|██████████| 1782/1782 [00:00<00:00, 27010.15 examples/s]


In [26]:
from nltk.tokenize import sent_tokenize

In [27]:
sent_tokenize(prompts[0]["response"])

['Adamantine Sealing Chains is a form of Fuinjutsu, also known as a Sealing Jutsu.',
 'This technique belongs to the Uzumaki and is a Hiden Technique..While casting this Jutsu, the opponent is chained down by the user using chains which appear out of their torso.',
 'The chain which leaves the opponent’s body is fully controlled by the user and not only that, these chains wrap the enemy and entangle them in those chains..Apart from binding the targets down, these chains also neutralise the opponent’s chakra.',
 'It can also form an incredibly strong barrier, which almost seems impregnable.',
 'Even Hiruzen Sarutobi could not pierce it..Adamantine Sealing Chains is a Hiden Jutsu, which means that it is a Jutsu that is exclusive to a clan that has been passed down orally through tradition.',
 'It is kept secret and is not allowed to be taught to people outside the clan..In this case, the Adamantine Sealing Chains is a Hiden Jutsu belonging to the Uzumaki Clan.',
 'Even though this techni

In [12]:
ner_prompts[0]

{'instruction': 'Identify the entity type of the following text, return JSON response with the entity type and the position of the entity in the text.',
 'input': 'Adamantine Sealing Chains is a form of Fuinjutsu, also known as a Sealing Jutsu. This technique belongs to the Uzumaki and is a Hiden Technique..While casting this Jutsu, the opponent is chained down by the user using chains which appear out of their torso. The chain which leaves the opponent’s body is fully controlled by the user and not only that, these chains wrap the enemy and entangle them in those chains..Apart from binding the targets down, these chains also neutralise the opponent’s chakra. It can also form an incredibly strong barrier, which almost seems impregnable. Even Hiruzen Sarutobi could not pierce it..Adamantine Sealing Chains is a Hiden Jutsu, which means that it is a Jutsu that is exclusive to a clan that has been passed down orally through tradition. It is kept secret and is not allowed to be taught to pe