In [2]:
# Write functions to parse XML data formats
# Example data
# <?xml version="1.0" encoding="utf-8"?>
# <posts>
#   <row Id="1" PostTypeId="1" AcceptedAnswerId="8" CreationDate="2012-12-11T20:37:08.823" Score="83" ViewCount="98859" Body="&lt;p&gt;Assuming the world in the One Piece universe is round, then there is not really a beginning or an end of the Grand Line.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;The Straw Hats started out from the first half and are now sailing across the second half.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Wouldn't it have been quicker to set sail in the opposite direction from where they started?     &lt;/p&gt;&#xA;" OwnerUserId="21" LastEditorUserId="1398" LastEditDate="2015-04-17T19:06:38.957" LastActivityDate="2022-05-12T10:37:24.403" Title="The treasure in One Piece is at the end of the Grand Line. But isn't that the same as the beginning?" Tags="|one-piece|" AnswerCount="6" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
#   <row Id="2" PostTypeId="1" AcceptedAnswerId="33" CreationDate="2012-12-11T20:39:40.780" Score="14" ViewCount="2772" Body="&lt;p&gt;In the middle of &lt;em&gt;The Dark Tournament&lt;/em&gt;, Yusuke Urameshi gets to fully inherit Genkai's power of the &lt;em&gt;Spirit Wave&lt;/em&gt; by absorbing a ball of energy from her.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;However, this process turns into an excruciating trial for Yusuke, almost killing him, and keeping him doubled over in extreme pain for a long period of time, so much so that his Spirit Animal, Poo, is also in pain and flies to him to try to help.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;My question is, why is it such a painful procedure to learn and absorb this power?&lt;/p&gt;&#xA;" OwnerUserId="26" LastEditorUserId="247" LastEditDate="2013-02-26T17:02:31.570" LastActivityDate="2013-06-20T03:31:39.187" Title="Why does absorbing the Spirit Wave from Genkai involve such a painful process?" Tags="|yu-yu-hakusho|" AnswerCount="1" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
# </posts>
# Extract post data from XML and return a list of dictionaries

In [115]:
import json
import re
from tqdm import tqdm, tqdm_notebook
from glob import glob
from bs4 import BeautifulSoup

from langchain_text_splitters import RecursiveCharacterTextSplitter

import py7zr
import pandas as pd
import xml.etree.ElementTree as ET

def parse_xml(xml_string):
    # Parse the XML string
    root = ET.fromstring(xml_string)

    # Initialize an empty list to hold the posts
    posts = []

    # Iterate over each 'row' element in the 'posts' element
    for row in root.findall('row'):
        # Get the attributes of the 'row' element as a dictionary
        post = row.attrib

        # Append the post dictionary to the list of posts
        posts.append(post)

    return posts

def parse_xml_from_bytes(xml_bytes):
    # Convert the bytes to a string
    rows = []
    for row in xml_bytes.split(b'\n'):
        try:
            row = ET.fromstring(row.decode("utf-8"))
            rows.append(row.attrib)
        except Exception as e:
            pass

    # Parse the XML string
    return rows


In [13]:
def get_contents_by_filename(filename: str):
    df = []
    files = glob("../data/anime_stackexchange/*.7z")
    for file in tqdm(files):
        with py7zr.SevenZipFile(file, mode='r') as z:
            list_of_files = z.readall()
            content = list_of_files[filename].read()
            try:
                posts = parse_xml_from_bytes(content)
                df.append(pd.DataFrame(posts))
            except Exception as e:
                print(f"Error {file}: {e}")
    
    return pd.concat(df)

In [32]:
target_file_name = "Posts.xml"
df_posts = get_contents_by_filename("Posts.xml")
df_comments = get_contents_by_filename("Comments.xml")

100%|██████████| 16/16 [00:29<00:00,  1.84s/it]


In [20]:
# print overview about dataframe
# print(df_posts.head())
print(df_posts.info())
print(df_posts.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 483277 entries, 0 to 31290
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Id                     483277 non-null  object
 1   PostTypeId             483277 non-null  object
 2   AcceptedAnswerId       95061 non-null   object
 3   CreationDate           483277 non-null  object
 4   Score                  483277 non-null  object
 5   ViewCount              178388 non-null  object
 6   Body                   483277 non-null  object
 7   OwnerUserId            477851 non-null  object
 8   LastEditorUserId       278034 non-null  object
 9   LastEditDate           279426 non-null  object
 10  LastActivityDate       483277 non-null  object
 11  Title                  178388 non-null  object
 12  Tags                   178388 non-null  object
 13  AnswerCount            178388 non-null  object
 14  CommentCount           483277 non-null  object
 15  Conten

In [21]:
# Extract tags into list of string
df_posts['Tags'] = df_posts['Tags'].str.strip('|').str.split('|')
df_posts_explode = df_posts.explode('Tags')
df_posts_explode["Tags"].value_counts().head(30)

Tags
<naruto>                                                         17602
<one-piece>                                                       8707
<attack-on-titan>                                                 3847
<fairy-tail>                                                      2956
<death-note>                                                      2872
<bleach>                                                          2625
<my-hero-academia>                                                2554
<anime-production>                                                2469
<pokemon>                                                         2017
<hunter-x-hunter>                                                 2013
<tropes>                                                          1882
<dragon-ball-series><dragon-ball-super>                           1782
<sword-art-online>                                                1731
<one-punch-man>                                                   1659
n

In [59]:
desired_tags = set(["naruto", "<naruto>"])
df_naruto_questions = df_posts[df_posts['Tags'].apply(lambda x: any(["naruto" in tag for tag in x]) if isinstance(x, list) else False)]
df_naruto_questions = df_naruto_questions.drop_duplicates(subset=["Id"])

In [60]:
df_naruto_questions.iloc[1].to_dict()

{'Id': '12',
 'PostTypeId': '1',
 'AcceptedAnswerId': '22',
 'CreationDate': '2012-12-11T20:56:15.090',
 'Score': '14',
 'ViewCount': '25271',
 'Body': '<p>I originally thought that the only surviving members after the Uchiha massacre were Sasuke and Itachi, but more and more seem to be revealed.  Is there a canonical list of surviving members of the Uchiha clan after the massacre?</p>\n',
 'OwnerUserId': '22',
 'LastEditorUserId': '22',
 'LastEditDate': '2012-12-11T21:42:46.997',
 'LastActivityDate': '2018-03-16T18:38:49.033',
 'Title': 'List of surviving Uchiha',
 'Tags': ['<naruto>'],
 'AnswerCount': '3',
 'CommentCount': '1',
 'ContentLicense': 'CC BY-SA 3.0',
 'ClosedDate': nan,
 'ParentId': nan,
 'OwnerDisplayName': nan,
 'FavoriteCount': nan,
 'LastEditorDisplayName': nan,
 'CommunityOwnedDate': nan}

In [102]:
len(df_naruto_questions)

1598

In [62]:
df_naruto_questions.to_parquet("../data/anime_stackexchange/anime_questions.parquet", index=False)

# Use StackOverflow API to fetch answers

In [94]:
import requests
from functools import lru_cache
import time

url_template = "https://api.stackexchange.com/2.3/questions/{question_id}/answers?order=desc&sort=votes&site=anime&filter=!6WPIomp1ahFmr&key=rl_G7gpPaxtnuYzX2QLSatB3fQGq&pagesize=100&page={page}"
def get_answers_by_question(question_ids: list, page: int = 1):
  question_ids_param = ";".join([str(q) for q in question_ids])
  url = url_template.format(question_id=question_ids_param, page=page)
  response = requests.request("GET", url)
  if response.status_code != 200:
    print(f"Error: {response.text}")
    return []
  
  res = response.json()
  return res

In [130]:
post_ids = df_naruto_questions["Id"].unique()
batch_size = 5
max_reqs_per_sec = 30
batches = [post_ids[i:i + batch_size] for i in range(0, len(post_ids), batch_size)]
answers = []
for batch in tqdm(batches):
    page = 1
    while True:
        res = get_answers_by_question(batch, page=page)
        if len(res["items"]) == 0:
            print("No items found: ", batch)
        
        answers.extend(res["items"])
        time.sleep(1/max_reqs_per_sec)

        if not res["has_more"]:
            break
        
        print("Fetch more page: ", page)
        page += 1

In [124]:
len(answers)

3066

In [125]:
df_answers = pd.DataFrame(answers)
df_answers.head()

Unnamed: 0,tags,owner,is_accepted,score,last_activity_date,last_edit_date,creation_date,answer_id,question_id,content_license,body_markdown,title,community_owned_date
0,[],"{'account_id': 456935, 'reputation': 386, 'use...",True,27,1364337700,1364338000.0,1364333863,3052,76,CC BY-SA 3.0,It does not seem like any of Madara&#39;s new ...,What are Madara Uchiha&#39;s weaknesses?,
1,[],"{'account_id': 466356, 'reputation': 19244, 'u...",True,20,1421706122,1421706000.0,1355259956,22,12,CC BY-SA 3.0,The following:\r\n\r\n - **Uchiha Itachi** - W...,List of surviving Uchiha,
2,[],"{'account_id': 92100, 'reputation': 15315, 'us...",False,14,1360849535,1360850000.0,1360849216,2478,10,CC BY-SA 3.0,"When Edo Tensei ends, the summoned soul is fre...",How can Madara still stick around even after t...,
3,[],"{'account_id': 483060, 'reputation': 5502, 'us...",True,14,1545329002,1545329000.0,1387563470,6467,10,CC BY-SA 4.0,"Firstly, the most important thing to know is t...",How can Madara still stick around even after t...,
4,[],"{'account_id': 466356, 'reputation': 19244, 'u...",False,12,1360851075,1360851000.0,1355259492,15,10,CC BY-SA 3.0,"From what I understood, if you know the Edo Te...",How can Madara still stick around even after t...,


In [126]:
df_answers["question_id"] = df_answers["question_id"].astype('str')
df_joint = df_answers.merge(df_naruto_questions, left_on=["question_id"], right_on=["Id"], suffixes=('_answer', '_question'))

In [127]:
len(set(df_naruto_questions["Id"].unique()) - set(df_answers["question_id"].unique()))

142

In [128]:
df_naruto_questions[df_naruto_questions["Id"]=="10"]

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ContentLicense,ClosedDate,ParentId,OwnerDisplayName,FavoriteCount,LastEditorDisplayName,CommunityOwnedDate
7,10,1,6467,2012-12-11T20:51:17.307,18,92815,<p>Edo Tensei is a technique to revive the dea...,32,27,2013-12-20T18:09:17.177,...,[<naruto>],3,2,CC BY-SA 3.0,,,,,,


In [129]:
df_answers[df_answers["question_id"]=="10"]

Unnamed: 0,tags,owner,is_accepted,score,last_activity_date,last_edit_date,creation_date,answer_id,question_id,content_license,body_markdown,title,community_owned_date
2,[],"{'account_id': 92100, 'reputation': 15315, 'us...",False,14,1360849535,1360850000.0,1360849216,2478,10,CC BY-SA 3.0,"When Edo Tensei ends, the summoned soul is fre...",How can Madara still stick around even after t...,
3,[],"{'account_id': 483060, 'reputation': 5502, 'us...",True,14,1545329002,1545329000.0,1387563470,6467,10,CC BY-SA 4.0,"Firstly, the most important thing to know is t...",How can Madara still stick around even after t...,
4,[],"{'account_id': 466356, 'reputation': 19244, 'u...",False,12,1360851075,1360851000.0,1355259492,15,10,CC BY-SA 3.0,"From what I understood, if you know the Edo Te...",How can Madara still stick around even after t...,


In [131]:
df_answers.to_parquet("../data/anime_stackexchange/anime_answers.parquet", index=False)

In [132]:
df_joint.to_parquet("../data/anime_stackexchange/anime_question_answers.parquet", index=False)

## Preprocessing

In [72]:
# Preview data by first n rows. The format row
# Question: {Title}
# Answer: {Body}
# Tags: {Tags}
def remove_thumb_substring(text):
    while "thumb" in text:
        start_thumb = text.find("thumb")
        end_thumb = start_thumb
        while end_thumb < len(text) and end_thumb != -1 and text[end_thumb] != "|":
            end_thumb += 1
        text = text[:start_thumb] + text[end_thumb+1:]
    return text

def remove_by_regex(text, template, subst = ""):
    return re.sub(template, subst, text, 0, re.MULTILINE)



for index, row in df_naruto.head(5).iterrows():
    print(f"Question: {row['Title']}")
    soup = BeautifulSoup(row['Body'], 'html.parser')
    txt = soup.text
    txt = remove_by_regex(txt, r"(\w)+\|")
    txt = remove_by_regex(txt, r"\n\r|\n|\r")
    print(f"Answer: {txt}")
    print(f"Tags: {row['Tags']}")
    print("---")

Question: How can Madara still stick around even after the Edo Tensei had been ended?
Answer: Edo Tensei is a technique to revive the dead, and bind their souls into living bodies. However, after releasing the technique, all of the dead should get back into being dead.How can Madara still stick around even after the Edo Tensei had been ended? I am not sure if it was something that we needed to think about and figure out by ourselves or is it yet to be revealed?
Tags: ['<naruto>']
---
Question: List of surviving Uchiha
Answer: I originally thought that the only surviving members after the Uchiha massacre were Sasuke and Itachi, but more and more seem to be revealed.  Is there a canonical list of surviving members of the Uchiha clan after the massacre?
Tags: ['<naruto>']
---
Question: What are Madara Uchiha's weaknesses?
Answer: At his current state, how could Madara possibly be defeated?Any physical attack/Taijutsu would be easily deflected by either:  The Susano'o  The Gunbai (war fan,

In [74]:
new_chunk_data = []
max_length = 4096
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=max_length,
    chunk_overlap=chunk_overlap,
    length_function=lambda d: len(d.split()),
    is_separator_regex=False,
    separators=["\n\n", "\n", ". ", ".\n", "? ", "?\n", "! ", "!\n", "."],
)
data = []
for row in df_naruto.to_dict(orient="records"):
    soup = BeautifulSoup(row['Body'], 'html.parser')
    txt = soup.text
    txt = remove_by_regex(txt, r"(\w)+\|")
    txt = remove_by_regex(txt, r"\n\r|\n|\r")
    data.append({
        "instruction": f"Question: {row['Title']}",
        "input": "",
        "response": txt,
    })

for i in range(len(data)):
    texts = text_splitter.create_documents([data[i]["response"]])
    for text in texts:
        new_chunk_data.append({
            "instruction": data[i]["instruction"],
            "input": data[i]["input"],
            "response": text.page_content,
        })

In [76]:
with open("prompt_anime_stackexchange.txt", "w") as f:
    for prompt in new_chunk_data:
        f.write(json.dumps(prompt) + "\n")

In [21]:
import numpy as np
from sentence_transformers import SentenceTransformer, util
from optimum.onnxruntime import ORTModelForFeatureExtraction

In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer, util
from optimum.onnxruntime import ORTModelForFeatureExtraction
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", device="mps", trust_remote_code=True)

<All keys matched successfully>


In [5]:
import json 
data = []
with open("prompt_anime_stackexchange.txt") as f:
    for line in f:
        data.append(json.loads(line))

In [22]:
document_template = "search_document: {content}"
question_docs = np.array([document_template.format(content=doc["instruction"]) for doc in data])
response_docs = np.array([document_template.format(content=doc["response"]) for doc in data])

In [13]:
questions_embeddings = model.encode(question_docs, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)
document_embeddings = model.encode(response_docs, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True)

Batches: 100%|██████████| 688/688 [00:50<00:00, 13.62it/s]
Batches: 100%|██████████| 688/688 [04:04<00:00,  2.81it/s]


In [14]:
import faiss

In [15]:
question_index = faiss.IndexFlatIP(questions_embeddings.shape[1])
question_index.add(questions_embeddings)

document_index = faiss.IndexFlatIP(document_embeddings.shape[1])
document_index.add(document_embeddings)

In [24]:
def search_similar_question(question, top_k=5):
    question_embedding = model.encode([question], convert_to_numpy=True, normalize_embeddings=True)
    scores, question_indices = question_index.search(question_embedding, top_k)
    return question_indices[0], scores[0]

def search_similar_document(document, top_k=5):
    document_embedding = model.encode([document], convert_to_numpy=True, normalize_embeddings=True)
    scores, document_indices = document_index.search(document_embedding, top_k)
    return document_indices[0], scores[0]

In [27]:
def show_document(corpus, indices, scores):
    for i, score in zip(indices, scores):
        print(f"Score: {score}")
        print(corpus[i])
        print(score)
        print("----")

In [29]:
show_document(question_docs, *search_similar_question("How to become a Hokage in Naruto?"))

Score: 0.7615363001823425
search_document: Question: Is Naruto still a Genin after becoming Hokage?
0.7615363
----
Score: 0.7615363001823425
search_document: Question: Is Naruto still a Genin after becoming Hokage?
0.7615363
----
Score: 0.7615363001823425
search_document: Question: Is Naruto still a Genin after becoming Hokage?
0.7615363
----
Score: 0.7615363001823425
search_document: Question: Is Naruto still a Genin after becoming Hokage?
0.7615363
----
Score: 0.7615363001823425
search_document: Question: Is Naruto still a Genin after becoming Hokage?
0.7615363
----
