In [2]:
# Write functions to parse XML data formats
# Example data
# <?xml version="1.0" encoding="utf-8"?>
# <posts>
#   <row Id="1" PostTypeId="1" AcceptedAnswerId="8" CreationDate="2012-12-11T20:37:08.823" Score="83" ViewCount="98859" Body="&lt;p&gt;Assuming the world in the One Piece universe is round, then there is not really a beginning or an end of the Grand Line.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;The Straw Hats started out from the first half and are now sailing across the second half.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Wouldn't it have been quicker to set sail in the opposite direction from where they started?     &lt;/p&gt;&#xA;" OwnerUserId="21" LastEditorUserId="1398" LastEditDate="2015-04-17T19:06:38.957" LastActivityDate="2022-05-12T10:37:24.403" Title="The treasure in One Piece is at the end of the Grand Line. But isn't that the same as the beginning?" Tags="|one-piece|" AnswerCount="6" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
#   <row Id="2" PostTypeId="1" AcceptedAnswerId="33" CreationDate="2012-12-11T20:39:40.780" Score="14" ViewCount="2772" Body="&lt;p&gt;In the middle of &lt;em&gt;The Dark Tournament&lt;/em&gt;, Yusuke Urameshi gets to fully inherit Genkai's power of the &lt;em&gt;Spirit Wave&lt;/em&gt; by absorbing a ball of energy from her.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;However, this process turns into an excruciating trial for Yusuke, almost killing him, and keeping him doubled over in extreme pain for a long period of time, so much so that his Spirit Animal, Poo, is also in pain and flies to him to try to help.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;My question is, why is it such a painful procedure to learn and absorb this power?&lt;/p&gt;&#xA;" OwnerUserId="26" LastEditorUserId="247" LastEditDate="2013-02-26T17:02:31.570" LastActivityDate="2013-06-20T03:31:39.187" Title="Why does absorbing the Spirit Wave from Genkai involve such a painful process?" Tags="|yu-yu-hakusho|" AnswerCount="1" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
# </posts>
# Extract post data from XML and return a list of dictionaries

In [69]:
import json
import re
from tqdm import tqdm, tqdm_notebook
from glob import glob
from bs4 import BeautifulSoup

from langchain_text_splitters import RecursiveCharacterTextSplitter

import py7zr
import pandas as pd
import xml.etree.ElementTree as ET

def parse_xml(xml_string):
    # Parse the XML string
    root = ET.fromstring(xml_string)

    # Initialize an empty list to hold the posts
    posts = []

    # Iterate over each 'row' element in the 'posts' element
    for row in root.findall('row'):
        # Get the attributes of the 'row' element as a dictionary
        post = row.attrib

        # Append the post dictionary to the list of posts
        posts.append(post)

    return posts

def parse_xml_from_bytes(xml_bytes):
    # Convert the bytes to a string
    rows = []
    for row in xml_bytes.split(b'\n'):
        try:
            row = ET.fromstring(row.decode("utf-8"))
            rows.append(row.attrib)
        except Exception as e:
            pass

    # Parse the XML string
    return rows


In [13]:
def get_contents_by_filename(filename: str):
    df = []
    files = glob("../data/anime_stackexchange/*.7z")
    for file in tqdm(files):
        with py7zr.SevenZipFile(file, mode='r') as z:
            list_of_files = z.readall()
            content = list_of_files[filename].read()
            try:
                posts = parse_xml_from_bytes(content)
                df.append(pd.DataFrame(posts))
            except Exception as e:
                print(f"Error {file}: {e}")
    
    return pd.concat(df)

In [32]:
target_file_name = "Posts.xml"
df_posts = get_contents_by_filename("Posts.xml")
df_comments = get_contents_by_filename("Comments.xml")

100%|██████████| 16/16 [00:29<00:00,  1.84s/it]


In [20]:
# print overview about dataframe
# print(df_posts.head())
print(df_posts.info())
print(df_posts.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 483277 entries, 0 to 31290
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Id                     483277 non-null  object
 1   PostTypeId             483277 non-null  object
 2   AcceptedAnswerId       95061 non-null   object
 3   CreationDate           483277 non-null  object
 4   Score                  483277 non-null  object
 5   ViewCount              178388 non-null  object
 6   Body                   483277 non-null  object
 7   OwnerUserId            477851 non-null  object
 8   LastEditorUserId       278034 non-null  object
 9   LastEditDate           279426 non-null  object
 10  LastActivityDate       483277 non-null  object
 11  Title                  178388 non-null  object
 12  Tags                   178388 non-null  object
 13  AnswerCount            178388 non-null  object
 14  CommentCount           483277 non-null  object
 15  Conten

In [21]:
# Extract tags into list of string
df_posts['Tags'] = df_posts['Tags'].str.strip('|').str.split('|')
df_posts_explode = df_posts.explode('Tags')
df_posts_explode["Tags"].value_counts().head(30)

Tags
<naruto>                                                         17602
<one-piece>                                                       8707
<attack-on-titan>                                                 3847
<fairy-tail>                                                      2956
<death-note>                                                      2872
<bleach>                                                          2625
<my-hero-academia>                                                2554
<anime-production>                                                2469
<pokemon>                                                         2017
<hunter-x-hunter>                                                 2013
<tropes>                                                          1882
<dragon-ball-series><dragon-ball-super>                           1782
<sword-art-online>                                                1731
<one-punch-man>                                                   1659
n

In [59]:
desired_tags = set(["naruto", "<naruto>"])
df_naruto_questions = df_posts[df_posts['Tags'].apply(lambda x: any(["naruto" in tag for tag in x]) if isinstance(x, list) else False)]
df_naruto_questions = df_naruto_questions.drop_duplicates(subset=["Id"])

In [60]:
df_naruto_questions.iloc[1].to_dict()

{'Id': '12',
 'PostTypeId': '1',
 'AcceptedAnswerId': '22',
 'CreationDate': '2012-12-11T20:56:15.090',
 'Score': '14',
 'ViewCount': '25271',
 'Body': '<p>I originally thought that the only surviving members after the Uchiha massacre were Sasuke and Itachi, but more and more seem to be revealed.  Is there a canonical list of surviving members of the Uchiha clan after the massacre?</p>\n',
 'OwnerUserId': '22',
 'LastEditorUserId': '22',
 'LastEditDate': '2012-12-11T21:42:46.997',
 'LastActivityDate': '2018-03-16T18:38:49.033',
 'Title': 'List of surviving Uchiha',
 'Tags': ['<naruto>'],
 'AnswerCount': '3',
 'CommentCount': '1',
 'ContentLicense': 'CC BY-SA 3.0',
 'ClosedDate': nan,
 'ParentId': nan,
 'OwnerDisplayName': nan,
 'FavoriteCount': nan,
 'LastEditorDisplayName': nan,
 'CommunityOwnedDate': nan}

In [102]:
len(df_naruto_questions)

1598

In [62]:
df_naruto_questions.to_parquet("../data/anime_stackexchange/anime_questions.parquet", index=False)

In [116]:
# Scan more questions

import requests
from functools import lru_cache
import time

filter_template = "!6WPIomoVB24Vu"
key = "rl_G7gpPaxtnuYzX2QLSatB3fQGq"
url_template = "https://api.stackexchange.com/2.3/search/advanced?order=desc&sort=activity&tagged=naruto&site=anime&page={page}&key={key}"


def get_answers_by_question(question_ids: list, page: int = 1):
    question_ids_param = ";".join([str(q) for q in question_ids])
    url = url_template.format(question_id=question_ids_param, page=page, filter=filter_template, key=key)
    response = requests.request("GET", url)
    if response.status_code != 200:
        print(f"Error: {response.text}")
        return []

    res = response.json()
    return res

max_reqs_per_sec = 30
questions = []
page = 1
while True:
    url = url_template.format(page=page, key=key)
    res = requests.request("GET", url)
    if res.status_code != 200:
        print(f"Error: {res.text}")
        break

    res = res.json()
    if len(res["items"]) == 0:
        break
    
    questions.extend(res["items"])
    time.sleep(1/max_reqs_per_sec)

    if not res["has_more"]:
        break
    
    print("Fetch more page: ", page)
    page += 1

Fetch more page:  1
Fetch more page:  2
Fetch more page:  3
Fetch more page:  4
Fetch more page:  5
Fetch more page:  6
Fetch more page:  7
Fetch more page:  8
Fetch more page:  9
Fetch more page:  10
Fetch more page:  11
Fetch more page:  12
Fetch more page:  13
Fetch more page:  14
Fetch more page:  15
Fetch more page:  16
Fetch more page:  17
Fetch more page:  18
Fetch more page:  19
Fetch more page:  20
Fetch more page:  21
Fetch more page:  22
Fetch more page:  23
Fetch more page:  24
Fetch more page:  25
Fetch more page:  26
Fetch more page:  27
Fetch more page:  28
Fetch more page:  29
Fetch more page:  30
Fetch more page:  31
Fetch more page:  32
Fetch more page:  33
Fetch more page:  34
Fetch more page:  35
Fetch more page:  36
Fetch more page:  37
Fetch more page:  38
Fetch more page:  39
Fetch more page:  40
Fetch more page:  41
Fetch more page:  42
Fetch more page:  43
Fetch more page:  44
Fetch more page:  45
Fetch more page:  46
Fetch more page:  47
Fetch more page:  48
F

In [118]:
df_all_naruto_questions = pd.DataFrame(questions)

In [120]:
df_all_naruto_questions.to_parquet("../data/anime_stackexchange/all_naruto_questions.parquet", index=False)

In [130]:
print(set(df_all_naruto_questions["question_id"].astype("int").unique()) - set(df_naruto_questions["Id"].astype('int').unique()))
print(set(df_naruto_questions["Id"].astype('int').unique()) - set(df_all_naruto_questions["question_id"].astype("int").unique()))

{68744, 68810, 68819}
{27651, 40969, 64012, 61967, 48146, 68636, 60959, 61985, 61986, 44584, 59434, 56876, 46140, 60989, 42564, 57936, 59476, 59477, 57433, 62041, 61024, 42593, 65635, 25714, 8311, 9850, 61054, 57995, 39565, 39569, 47761, 65691, 61608, 61611, 64173, 62640, 65716, 28341, 58042, 49344, 22735, 48851, 67804, 62688, 54009, 40715, 61716, 59676, 62242, 62768, 62773, 57141, 42811, 57660, 59710, 56126, 56650, 66384, 68433, 58203, 42850, 62820, 57701, 39784, 66418, 57717, 33654, 65404, 67453, 44926, 21377, 67461, 65415, 58248, 40330, 61326, 61839, 56729, 60834, 61858, 14754, 43433, 44982, 68024, 62911, 44992, 63937, 44485, 60873, 28639, 57314, 51171, 61412, 52716, 39405, 56305, 56308, 57336, 61947, 66047}


In [124]:
df_naruto_questions.shape

(1598, 22)

# Use StackOverflow API to fetch answers

In [70]:
df_naruto_questions = pd.read_parquet("../data/anime_stackexchange/anime_questions.parquet")

In [96]:
tags = set()
for tag in df_naruto_questions["Tags"]:
    tags.update(tag)

In [100]:
df_naruto_questions[df_naruto_questions["Id"]=="50172"]

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ContentLicense,ClosedDate,ParentId,OwnerDisplayName,FavoriteCount,LastEditorDisplayName,CommunityOwnedDate
1223,50172,1,,2018-12-22T02:41:58.447,3,1086,<p>I recently got this shirt and I was wonderi...,44003,2516,2018-12-23T13:17:53.777,...,[<naruto><merchandise>],1,2,CC BY-SA 4.0,,,,,,


In [256]:
import requests
from functools import lru_cache
import time

filter_template = "!BMb4rqMjWRTYbMk8F86wvTcI_ZGWR1"
key = "rl_G7gpPaxtnuYzX2QLSatB3fQGq"
url_template = "https://api.stackexchange.com/2.3/questions/{question_id}/answers?order=desc&sort=votes&site=anime&filter={filter}&key={key}&pagesize=100&page={page}"


def get_answers_by_question(question_ids: list, page: int = 1):
    question_ids_param = ";".join([str(q) for q in question_ids])
    url = url_template.format(question_id=question_ids_param, page=page, filter=filter_template, key=key)
    response = requests.request("GET", url)
    if response.status_code != 200:
        print(f"Error: {response.text}")
        return []

    res = response.json()
    return res

In [259]:
post_ids = df_naruto_questions["Id"].unique()
batch_size = 2
max_reqs_per_sec = 30
batches = [post_ids[i:i + batch_size] for i in range(0, len(post_ids), batch_size)]
answers = []
for batch in tqdm(batches):
    page = 1
    while True:
        res = get_answers_by_question(batch, page=page)
        if len(res["items"]) == 0:
            print("No items found: ", batch)
        
        answers.extend(res["items"])
        time.sleep(1/max_reqs_per_sec)

        if not res["has_more"]:
            break
        
        print("Fetch more page: ", page)
        page += 1

 94%|█████████▎| 748/799 [06:38<00:25,  1.98it/s]

No items found:  ['68433' '68435']


 94%|█████████▍| 750/799 [06:39<00:23,  2.07it/s]

No items found:  ['39565' '42850']


 94%|█████████▍| 751/799 [06:40<00:22,  2.13it/s]

No items found:  ['46140' '47761']


 94%|█████████▍| 752/799 [06:40<00:22,  2.11it/s]

No items found:  ['49344' '51171']


 94%|█████████▍| 753/799 [06:41<00:21,  2.14it/s]

No items found:  ['56308' '62640']


 94%|█████████▍| 754/799 [06:41<00:20,  2.16it/s]

No items found:  ['62911' '64173']


 94%|█████████▍| 755/799 [06:42<00:20,  2.14it/s]

No items found:  ['65415' '65635']


 95%|█████████▍| 756/799 [06:42<00:19,  2.17it/s]

No items found:  ['66047' '66384']


 95%|█████████▍| 759/799 [06:43<00:18,  2.11it/s]

No items found:  ['68636' '67461']


 95%|█████████▌| 760/799 [06:44<00:18,  2.16it/s]

No items found:  ['68024' '8311']


 95%|█████████▌| 761/799 [06:44<00:17,  2.14it/s]

No items found:  ['9850' '14754']


 95%|█████████▌| 762/799 [06:45<00:16,  2.21it/s]

No items found:  ['22735' '25714']


 95%|█████████▌| 763/799 [06:45<00:15,  2.26it/s]

No items found:  ['27651' '28341']


 96%|█████████▌| 764/799 [06:46<00:16,  2.17it/s]

No items found:  ['28639' '39405']


 96%|█████████▌| 765/799 [06:46<00:15,  2.18it/s]

No items found:  ['39784' '40330']


 96%|█████████▌| 766/799 [06:47<00:14,  2.21it/s]

No items found:  ['40715' '40969']


 96%|█████████▌| 767/799 [06:47<00:14,  2.20it/s]

No items found:  ['42564' '42811']


 96%|█████████▌| 768/799 [06:47<00:13,  2.24it/s]

No items found:  ['43433' '44485']


 96%|█████████▌| 769/799 [06:48<00:13,  2.18it/s]

No items found:  ['44584' '44926']


 96%|█████████▋| 770/799 [06:48<00:13,  2.09it/s]

No items found:  ['44982' '44992']


 96%|█████████▋| 771/799 [06:50<00:18,  1.49it/s]

No items found:  ['67804' '52716']


 97%|█████████▋| 772/799 [06:50<00:16,  1.60it/s]

No items found:  ['56305' '56729']


 97%|█████████▋| 773/799 [06:51<00:20,  1.27it/s]

No items found:  ['56876' '57995']


 97%|█████████▋| 774/799 [06:52<00:17,  1.46it/s]

No items found:  ['58203' '58248']


 97%|█████████▋| 775/799 [06:52<00:14,  1.61it/s]

No items found:  ['59676' '59710']


 97%|█████████▋| 776/799 [06:53<00:13,  1.74it/s]

No items found:  ['60989' '61326']


 97%|█████████▋| 777/799 [06:53<00:11,  1.87it/s]

No items found:  ['61412' '61608']


 97%|█████████▋| 778/799 [06:54<00:10,  1.93it/s]

No items found:  ['61839' '61858']


 97%|█████████▋| 779/799 [06:54<00:09,  2.03it/s]

No items found:  ['61947' '62242']


 98%|█████████▊| 780/799 [06:54<00:09,  2.06it/s]

No items found:  ['62768' '62820']


 98%|█████████▊| 781/799 [06:55<00:08,  2.13it/s]

No items found:  ['63937' '64012']


 98%|█████████▊| 782/799 [06:55<00:07,  2.16it/s]

No items found:  ['65691' '65716']


 98%|█████████▊| 783/799 [06:56<00:07,  2.16it/s]

No items found:  ['60834' '60873']


 98%|█████████▊| 784/799 [06:56<00:06,  2.17it/s]

No items found:  ['60959' '61054']


 98%|█████████▊| 786/799 [06:57<00:06,  2.13it/s]

No items found:  ['48146' '48851']


 98%|█████████▊| 787/799 [06:58<00:05,  2.16it/s]

No items found:  ['54009' '56126']


 99%|█████████▊| 788/799 [06:58<00:05,  2.10it/s]

No items found:  ['57936' '58042']


 99%|█████████▊| 789/799 [06:59<00:06,  1.49it/s]

No items found:  ['59434' '59476']


 99%|█████████▉| 790/799 [07:00<00:05,  1.66it/s]

No items found:  ['59477' '61024']


 99%|█████████▉| 791/799 [07:00<00:04,  1.80it/s]

No items found:  ['61611' '61716']


 99%|█████████▉| 792/799 [07:01<00:03,  1.80it/s]

No items found:  ['62688' '62773']


 99%|█████████▉| 793/799 [07:01<00:03,  1.92it/s]

No items found:  ['66418' '67453']


 99%|█████████▉| 794/799 [07:02<00:02,  2.00it/s]

No items found:  ['56650' '57141']


 99%|█████████▉| 795/799 [07:03<00:02,  1.45it/s]

No items found:  ['57314' '57336']


100%|█████████▉| 796/799 [07:03<00:01,  1.61it/s]

No items found:  ['57433' '57660']


100%|█████████▉| 797/799 [07:04<00:01,  1.76it/s]

No items found:  ['57701' '57717']


100%|█████████▉| 798/799 [07:04<00:00,  1.86it/s]

No items found:  ['61967' '61985']


100%|██████████| 799/799 [07:05<00:00,  1.88it/s]

No items found:  ['61986' '62041']





In [262]:
df_answers = pd.DataFrame(answers)
df_answers["creation_date"] = pd.to_datetime(df_answers["creation_date"], unit="s")
df_answers.head()

Unnamed: 0,tags,down_vote_count,up_vote_count,is_accepted,score,last_activity_date,last_edit_date,creation_date,answer_id,question_id,content_license,title,body,community_owned_date
0,[],0,20,True,20,1421706122,1421706000.0,2012-12-11 21:05:56,22,12,CC BY-SA 3.0,List of surviving Uchiha,<p>The following:</p>\n\n<ul>\n<li><strong>Uch...,
1,[],0,14,False,14,1360849535,1360850000.0,2013-02-14 13:40:16,2478,10,CC BY-SA 3.0,How can Madara still stick around even after t...,"<p>When Edo Tensei ends, the summoned soul is ...",
2,[],1,15,True,14,1545329002,1545329000.0,2013-12-20 18:17:50,6467,10,CC BY-SA 4.0,How can Madara still stick around even after t...,"<p>Firstly, the most important thing to know i...",
3,[],1,13,False,12,1360851075,1360851000.0,2012-12-11 20:58:12,15,10,CC BY-SA 3.0,How can Madara still stick around even after t...,"<p>From what I understood, if you know the Edo...",
4,[],0,3,False,3,1355260808,1355261000.0,2012-12-11 21:07:39,24,12,CC BY-SA 3.0,List of surviving Uchiha,<p>Itachi Uchiha<br>\nSasuke Uchiha</p>\n\n<p>...,


In [263]:
df_answers[df_answers["question_id"]==10]

Unnamed: 0,tags,down_vote_count,up_vote_count,is_accepted,score,last_activity_date,last_edit_date,creation_date,answer_id,question_id,content_license,title,body,community_owned_date
1,[],0,14,False,14,1360849535,1360850000.0,2013-02-14 13:40:16,2478,10,CC BY-SA 3.0,How can Madara still stick around even after t...,"<p>When Edo Tensei ends, the summoned soul is ...",
2,[],1,15,True,14,1545329002,1545329000.0,2013-12-20 18:17:50,6467,10,CC BY-SA 4.0,How can Madara still stick around even after t...,"<p>Firstly, the most important thing to know i...",
3,[],1,13,False,12,1360851075,1360851000.0,2012-12-11 20:58:12,15,10,CC BY-SA 3.0,How can Madara still stick around even after t...,"<p>From what I understood, if you know the Edo...",


In [266]:
df_answers["question_id"] = df_answers["question_id"].astype('str')
df_joint = df_answers.merge(df_naruto_questions, left_on=["question_id"], right_on=["Id"], suffixes=('_answer', '_question'))

In [267]:
len(set(df_naruto_questions["Id"].unique()) - set(df_answers["question_id"].unique()))

142

In [268]:
df_naruto_questions[df_naruto_questions["Id"]=="10"]

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ContentLicense,ClosedDate,ParentId,OwnerDisplayName,FavoriteCount,LastEditorDisplayName,CommunityOwnedDate
0,10,1,6467,2012-12-11T20:51:17.307,18,92815,<p>Edo Tensei is a technique to revive the dea...,32,27,2013-12-20T18:09:17.177,...,[<naruto>],3,2,CC BY-SA 3.0,,,,,,


In [114]:
df_answers[df_answers["question_id"]=="39847"]

Unnamed: 0,owner,down_vote_count,up_vote_count,is_accepted,score,last_activity_date,last_edit_date,creation_date,answer_id,question_id,content_license,community_owned_date


In [265]:
df_answers.to_parquet("../data/anime_stackexchange/anime_answers.parquet", index=False)

In [269]:
df_joint.to_parquet("../data/anime_stackexchange/anime_question_answers.parquet", index=False)

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_parquet("../data/anime_stackexchange/anime_answers.parquet").sort_values(by=["question_id", "score", "is_accepted"], ascending=False).rename({
    "title": "question",
    "body": "answer"
}, axis=1)

In [3]:
df.sample(10)

Unnamed: 0,tags,down_vote_count,up_vote_count,is_accepted,score,last_activity_date,last_edit_date,creation_date,answer_id,question_id,content_license,question,answer,community_owned_date
469,[],1,46,True,45,1380709644,1380710000.0,2013-06-27 01:13:55,4238,4221,CC BY-SA 3.0,Why do ninjas run with their hands at the back?,<p>Based on research...\n...Ninjas frequently ...,
901,[],0,1,False,1,1392891081,,2014-02-20 10:11:21,7605,7593,CC BY-SA 3.0,Why are Kurenai&#39;s eyes the way they are?,<p>They are not related to sharingan in any wa...,
1100,[],0,0,False,0,1407404173,1407404000.0,2014-08-07 06:01:21,13293,13285,CC BY-SA 3.0,How far can the sharingan go?,"<p><a href=""http://naruto.wikia.com/wiki/Flyin...",
159,[],0,2,False,2,1416421333,1416421000.0,2014-11-19 18:13:26,15184,846,CC BY-SA 3.0,Why do some Sharingan eyes have unique Mangeky...,<p>The abilities of the eyes are determined by...,
3032,[],0,1,True,1,1645961656,,2022-02-27 11:34:16,66458,66456,CC BY-SA 4.0,Why wasn&#39;t flying Raijin passed onto other...,<p>You have to understand that not all people ...,
1418,[],0,5,True,5,1434615590,1434616000.0,2015-06-18 07:26:16,22547,22142,CC BY-SA 3.0,What happened to the tomoe in Sasuke&#39;s Rin...,"<p>The latest manga of <a href=""http://naruto....",
1049,[],1,0,False,-1,1406303795,,2014-07-25 15:56:35,13049,11172,CC BY-SA 3.0,Who was Black Zetsu created by? (Manga Spoilers),<p>Black Zetsu was created by Madara but it is...,
1808,[],0,0,False,0,1486825007,,2017-02-11 14:56:47,38908,33896,CC BY-SA 3.0,Why can&#39;t Naruto go into Bijuu mode like t...,<p>It's probably because half of the 9 tails i...,
2709,[],0,1,False,1,1552400565,1552401000.0,2019-02-15 16:43:45,50844,50841,CC BY-SA 4.0,Given how Obito had acquired Bijuu in the past...,<p>Indeed his plan would be a success if he ke...,
2425,[],0,11,True,11,1513767802,,2017-12-20 11:03:22,43859,43592,CC BY-SA 3.0,Why does Sasuke consider Naruto to be his best...,<p>Naruto is Sasuke's best friend because he i...,


In [6]:
df[df["question_id"]==4221].to_dict(orient="records")

[{'tags': array([], dtype=object),
  'down_vote_count': 1,
  'up_vote_count': 46,
  'is_accepted': True,
  'score': 45,
  'last_activity_date': 1380709644,
  'last_edit_date': 1380709644.0,
  'creation_date': Timestamp('2013-06-27 01:13:55'),
  'answer_id': 4238,
  'question_id': 4221,
  'content_license': 'CC BY-SA 3.0',
  'question': 'Why do ninjas run with their hands at the back?',
  'answer': '<p>Based on research...\n...Ninjas frequently were portrayed running with <a href="http://tvtropes.org/pmwiki/pmwiki.php/Main/AirplaneArms">The Airplane Arms</a>.</p>\n\n<blockquote>\n  <p>Ninja are also frequently portrayed running this way (with the arms swept farther back, like the wings on an F-14 Tomcat), though they now tend to prefer the Ninja Run. Samurai also run in a similar manner, usually while keeping one hand grasped onto their katana. <strong>In a more (apparently) realistic series this may have something to do with the runner having a reduced profile and thus being harder to 

In [305]:
qids = set(df["question_id"].astype("int").unique())

In [306]:
df["pm_score"] = np.log2(df["up_vote_count"]+1) + df["is_accepted"].map(lambda d: 1 if d else -1)
# df["pm_score"] = df.apply(lambda x: x["pm_score"] if x["up_vote_count"] > 0 else -1, axis=1)

In [307]:
df["pm_score"].describe()

count    3066.000000
mean        0.990992
std         1.826143
min        -1.000000
25%        -1.000000
50%         0.584963
75%         2.169925
max         6.554589
Name: pm_score, dtype: float64

In [308]:
from bs4 import BeautifulSoup
def binary_comparison(answers):
    """Returns tuples of answers, first always best"""
    pairs = []
    
    for i in range(len(answers)-1):
        for j in range(i+1, len(answers)):
            if answers[i]["pm_score"]>answers[j]["pm_score"]:
                pairs.append((answers[i]["answer"], answers[j]["answer"]))
            elif answers[i]["pm_score"]<answers[j]["pm_score"]:
                pairs.append((answers[j]["answer"], answers[i]["answer"]))
    return pairs

def preprocess(examples, MAX_PAIRS_PER_QUESTION=-1):
    """Cleans HTML and returns paired answers (j is better than k). Note that this returns more examples (one for each pair per question)."""
    n_samples = len(examples["question_id"])
    
    # initialize empty lists for new samples
    new_examples = {"question": [], "response_j": [], "response_k": []}
    for key in examples:
        new_examples[key] = []
    
    for sample_id in range(n_samples):
        # get pairs where first is always the better one
        pairs = binary_comparison(examples["answers"][sample_id])
        n_answers = len(examples["answers"][sample_id])
        
        # sample if we get more pairs than maximum
        if MAX_PAIRS_PER_QUESTION > 0 and len(pairs) > MAX_PAIRS_PER_QUESTION:
            indices = np.random.choice(list(range(len(pairs))), MAX_PAIRS_PER_QUESTION, replace=False)
            pairs = [pairs[i] for i in indices]
        
        # construct the samples
        for pair in pairs:
            for key in examples:
                if key=="question":
                    new_examples[key].append(examples[key][sample_id])
                else:
                    new_examples[key].append(examples[key][sample_id])
            new_examples["response_j"].append(pair[0])
            new_examples["response_k"].append(pair[1])
    
    return new_examples

In [309]:
# Aggregated by question id, each group contains a list of answers with answer id, text, score, and is_accepted. Column group is answer
df["answer"] = df["answer"].map(lambda x: BeautifulSoup(x, "html.parser").get_text())
df_answers_agg = df.groupby(["question_id", "question"]).apply(lambda x: x[["question_id", "answer_id", "pm_score", "answer", "is_accepted"]].to_dict(orient="records")).reset_index(name="answers")

In [310]:
ds = datasets.Dataset.from_pandas(df_answers_agg)
ds_result = ds.map(lambda d: preprocess(d, -1), batch_size=1000, batched=True).remove_columns(["answers"])

Map: 100%|██████████| 1456/1456 [00:00<00:00, 8617.16 examples/s]


In [311]:
len(ds_result)

2704

In [312]:
ds_result.save_to_disk("../data/anime_stackexchange/anime_answers_pair")

Saving the dataset (1/1 shards): 100%|██████████| 2704/2704 [00:00<00:00, 248960.55 examples/s]


In [1]:
import cv2
import numpy as np

def segment_panels(image_path):
    # Read the image
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Apply thresholding to get binary image
    _, binary = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY_INV)
    
    # Apply morphological operations to clean up the image
    kernel = np.ones((3,3), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    
    # Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # Filter contours based on area
    min_area = 1000  # Adjust this value based on your image size
    panels = []
    for contour in contours:
        area = cv2.contourArea(contour)
        if area > min_area:
            x, y, w, h = cv2.boundingRect(contour)
            panels.append((x, y, w, h))
    
    # Draw rectangles around detected panels
    for (x, y, w, h) in panels:
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
    
    # Display the result
    cv2.imshow('Segmented Panels', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# Usage
image_path = '/Users/nhuantran/Documents/Mangas/Naruto/Vol.01 Ch.0004 - Какаші Хатаке! (uk)/10.png'
segment_panels(image_path)