In [2]:
# Write functions to parse XML data formats
# Example data
# <?xml version="1.0" encoding="utf-8"?>
# <posts>
#   <row Id="1" PostTypeId="1" AcceptedAnswerId="8" CreationDate="2012-12-11T20:37:08.823" Score="83" ViewCount="98859" Body="&lt;p&gt;Assuming the world in the One Piece universe is round, then there is not really a beginning or an end of the Grand Line.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;The Straw Hats started out from the first half and are now sailing across the second half.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;Wouldn't it have been quicker to set sail in the opposite direction from where they started?     &lt;/p&gt;&#xA;" OwnerUserId="21" LastEditorUserId="1398" LastEditDate="2015-04-17T19:06:38.957" LastActivityDate="2022-05-12T10:37:24.403" Title="The treasure in One Piece is at the end of the Grand Line. But isn't that the same as the beginning?" Tags="|one-piece|" AnswerCount="6" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
#   <row Id="2" PostTypeId="1" AcceptedAnswerId="33" CreationDate="2012-12-11T20:39:40.780" Score="14" ViewCount="2772" Body="&lt;p&gt;In the middle of &lt;em&gt;The Dark Tournament&lt;/em&gt;, Yusuke Urameshi gets to fully inherit Genkai's power of the &lt;em&gt;Spirit Wave&lt;/em&gt; by absorbing a ball of energy from her.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;However, this process turns into an excruciating trial for Yusuke, almost killing him, and keeping him doubled over in extreme pain for a long period of time, so much so that his Spirit Animal, Poo, is also in pain and flies to him to try to help.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;My question is, why is it such a painful procedure to learn and absorb this power?&lt;/p&gt;&#xA;" OwnerUserId="26" LastEditorUserId="247" LastEditDate="2013-02-26T17:02:31.570" LastActivityDate="2013-06-20T03:31:39.187" Title="Why does absorbing the Spirit Wave from Genkai involve such a painful process?" Tags="|yu-yu-hakusho|" AnswerCount="1" CommentCount="0" ContentLicense="CC BY-SA 3.0" />
# </posts>
# Extract post data from XML and return a list of dictionaries

In [17]:
import json
from tqdm import tqdm
from glob import glob

import py7zr
import pandas as pd
import xml.etree.ElementTree as ET

def parse_xml(xml_string):
    # Parse the XML string
    root = ET.fromstring(xml_string)

    # Initialize an empty list to hold the posts
    posts = []

    # Iterate over each 'row' element in the 'posts' element
    for row in root.findall('row'):
        # Get the attributes of the 'row' element as a dictionary
        post = row.attrib

        # Append the post dictionary to the list of posts
        posts.append(post)

    return posts


In [6]:
files = glob("../data/anime_stackexchange/*.7z")

In [18]:
df = []
target_file_name = "Posts.xml"
for file in tqdm(files):
    with py7zr.SevenZipFile(file, mode='r') as z:
        list_of_files = z.readall()
        content = list_of_files[target_file_name].read()
        try:
            posts = parse_xml(content)
            df.append(pd.DataFrame(posts))
        except Exception as e:
            print(f"Error {file}: {e}")


 53%|█████▎    | 9/17 [00:20<00:18,  2.25s/it]


ParseError: encoding specified in XML declaration is incorrect: line 1, column 31 (<string>)

In [8]:
data_path = "../data/anime_stackexchange/Anime Stack Exchange/Posts.xml"
posts = parse_xml(open(data_path).read())

In [9]:
df_posts = pd.DataFrame(posts)

In [22]:
df_posts.head()

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ContentLicense,ParentId,ClosedDate,OwnerDisplayName,FavoriteCount,LastEditorDisplayName,CommunityOwnedDate
0,1,1,8.0,2012-12-11T20:37:08.823,83,98859,<p>Assuming the world in the One Piece univers...,21,1398.0,2015-04-17T19:06:38.957,...,|one-piece|,6,0,CC BY-SA 3.0,,,,,,
1,2,1,33.0,2012-12-11T20:39:40.780,14,2772,<p>In the middle of <em>The Dark Tournament</e...,26,247.0,2013-02-26T17:02:31.570,...,|yu-yu-hakusho|,1,0,CC BY-SA 3.0,,,,,,
2,3,1,148.0,2012-12-11T20:42:47.447,12,4984,"<p>In Sora no Otoshimono, Ikaros carries aroun...",29,,,...,|sora-no-otoshimono|,3,1,CC BY-SA 3.0,,,,,,
3,4,1,,2012-12-11T20:44:46.870,12,20792,<p>Is there any particular software or softwar...,18,2516.0,2024-02-08T03:53:05.290,...,|anime-production|,1,7,CC BY-SA 4.0,,,,,,
4,6,1,11.0,2012-12-11T20:47:21.890,25,9212,"<p>In several episodes of DB:Z and DB:GT, usin...",26,3028.0,2016-08-22T03:44:55.933,...,|dragon-ball-z|dragon-ball-series|dragon-ball-gt|,6,2,CC BY-SA 3.0,,,,,,


In [10]:
# Extract tags into list of string
df_posts['Tags'] = df_posts['Tags'].str.strip('|').str.split('|')

In [11]:
df_naruto = df_posts[df_posts['Tags'].apply(lambda x: 'naruto' in x if isinstance(x, list) else False)]

In [12]:
df_naruto.shape

(1497, 22)

In [13]:
df_posts_explode = df_posts.explode('Tags')

In [16]:
df_posts.iloc[1].to_dict()

{'Id': '2',
 'PostTypeId': '1',
 'AcceptedAnswerId': '33',
 'CreationDate': '2012-12-11T20:39:40.780',
 'Score': '14',
 'ViewCount': '2772',
 'Body': "<p>In the middle of <em>The Dark Tournament</em>, Yusuke Urameshi gets to fully inherit Genkai's power of the <em>Spirit Wave</em> by absorbing a ball of energy from her.</p>\n\n<p>However, this process turns into an excruciating trial for Yusuke, almost killing him, and keeping him doubled over in extreme pain for a long period of time, so much so that his Spirit Animal, Poo, is also in pain and flies to him to try to help.</p>\n\n<p>My question is, why is it such a painful procedure to learn and absorb this power?</p>\n",
 'OwnerUserId': '26',
 'LastEditorUserId': '247',
 'LastEditDate': '2013-02-26T17:02:31.570',
 'LastActivityDate': '2013-06-20T03:31:39.187',
 'Title': 'Why does absorbing the Spirit Wave from Genkai involve such a painful process?',
 'Tags': ['yu-yu-hakusho'],
 'AnswerCount': '1',
 'CommentCount': '0',
 'ContentLicen