In [23]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json

In [3]:
# Input table of anime entries; its 'link' column is what the scraping loop iterates over.
df = pd.read_csv(filepath_or_buffer="../data/raw/top_150_fantasy_anime.csv")

In [50]:
def extract_review_data(url, headers=None, delay=1, timeout=30):
    """Download one page and extract every review found in its HTML.

    The function sleeps for ``delay`` seconds, requests ``url`` and converts
    each element matching the ``.review-element`` CSS selector into a flat
    dictionary. Fields that cannot be located are filled with ``"N/A"``
    (``""`` for the review body and ``{}`` for the reactions).

    Parameters
    ----------
    url : str
        Address of the page to scrape. It is copied into the ``"URL"`` field
        of every returned record.
    headers : dict, optional
        HTTP headers sent with the request. When ``None`` a default
        ``User-Agent`` header is used.
    delay : int or float, default 1
        Seconds to sleep before the request is issued.
    timeout : int, float or tuple, default 30
        Forwarded to ``requests.get`` so a stalled connection cannot block
        the caller forever.

    Returns
    -------
    list of dict
        One dictionary per review that parsed successfully, with the keys
        ``URL``, ``User``, ``Profile URL``, ``Avatar``, ``Date``, ``Rating``,
        ``Recommendation``, ``Preliminary``, ``Episodes Watched``, ``Review``,
        ``Permalink``, ``Review ID``, ``Gift URL``, ``More Reviews URL``,
        ``User Review Count`` and ``Reactions``. The list is empty when the
        server answers with an error status or the page has no reviews.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged when the request itself fails (connection
        error, timeout, ...).
    """
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
        }

    time.sleep(delay)
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        # An error page carries no reviews; report it so an empty result is
        # not mistaken for "this page simply has no reviews".
        print(f"Request failed with status {response.status_code} at url: {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    reviews = []
    for review in soup.select('.review-element'):
        try:
            reviews.append(_parse_review_element(review, url))
        except Exception as e:
            # Best effort: one malformed review must not discard the rest of the page.
            print(f"Error parsing review: {e} at url: {url}")

    return reviews


def _text_or_na(tag):
    """Return the stripped text of ``tag``, or ``"N/A"`` when the tag is missing."""
    return tag.text.strip() if tag is not None else "N/A"


def _attr_or_na(tag, attr):
    """Return attribute ``attr`` of ``tag``, or ``"N/A"`` when tag or attribute is missing."""
    return tag.get(attr, "N/A") if tag is not None else "N/A"


def _parse_review_element(review, url):
    """Convert a single ``.review-element`` node into the flat record dictionary."""
    username_tag = review.select_one('.username a')

    avatar_tag = review.select_one('.thumb img')
    if avatar_tag is not None:
        # Prefer the lazy-load attribute and fall back to the plain source.
        avatar_url = avatar_tag.get('data-src') or avatar_tag.get('src') or "N/A"
    else:
        avatar_url = "N/A"

    prelim_span = review.select_one('.tag.preliminary span')
    episodes_watched = prelim_span.text.strip('() \n') if prelim_span is not None else "N/A"

    text_block = review.select_one('.text')
    visible_text = text_block.text.strip() if text_block is not None else ""
    hidden_tag = review.select_one('.text .js-hidden')
    hidden_text = hidden_tag.text.strip() if hidden_tag is not None else ""
    # The container's text already includes the text of nested nodes, so the
    # hidden part is appended only when it is not already present; otherwise
    # it would appear twice in the stored review.
    if hidden_text and hidden_text not in visible_text:
        full_review = (visible_text + "\n" + hidden_text).strip()
    else:
        full_review = visible_text

    reactions_json = review.get("data-reactions")
    try:
        reactions_data = json.loads(reactions_json) if reactions_json else {}
    except json.JSONDecodeError:
        reactions_data = {}

    permalink_url = _attr_or_na(review.select_one('.bottom-navi .open a'), 'href')
    # The review id is whatever follows the last '=' of the permalink.
    review_id = permalink_url.split('=')[-1] if permalink_url != "N/A" else "N/A"

    more_reviews_tag = review.select_one('.more_reviews a')
    # The counter node may be absent even when the link exists.
    count_tag = more_reviews_tag.select_one('.num') if more_reviews_tag is not None else None

    return {
        "URL": url,
        "User": _text_or_na(username_tag),
        "Profile URL": _attr_or_na(username_tag, 'href'),
        "Avatar": avatar_url,
        "Date": _text_or_na(review.select_one('.update_at')),
        "Rating": _text_or_na(review.select_one('.rating .num')),
        "Recommendation": _text_or_na(review.select_one('.tag')),
        "Preliminary": prelim_span is not None,
        "Episodes Watched": episodes_watched,
        "Review": full_review,
        "Permalink": permalink_url,
        "Review ID": review_id,
        "Gift URL": _attr_or_na(review.select_one('.bottom-navi .gift a'), 'href'),
        "More Reviews URL": _attr_or_na(more_reviews_tag, 'href'),
        "User Review Count": _text_or_na(count_tag),
        "Reactions": reactions_data
    }


In [51]:
# Visit every anime link from the input table (2 s pause before each request)
# and pool the parsed reviews of all pages into one flat list.
all_reviews = []
for link in df['link']:
    page_reviews = extract_review_data(link, delay=2)
    all_reviews += page_reviews

# One row per review; the trailing expression renders the table in the notebook.
df_reviews = pd.DataFrame(data=all_reviews)
df_reviews

Unnamed: 0,URL,User,Profile URL,Avatar,Date,Rating,Recommendation,Preliminary,Episodes Watched,Review,Permalink,Review ID,Gift URL,More Reviews URL,User Review Count,Reactions
0,https://myanimelist.net/anime/52991/Sousou_no_...,Czekaj,https://myanimelist.net/profile/Czekaj,https://cdn.myanimelist.net/s/common/userimage...,"Oct 13, 2023",10,Recommended,True,5/28 eps,"With lives so short, why do we even bother? To...",https://myanimelist.net/reviews.php?id=503754,503754,https://myanimelist.net/membership,/profile/Czekaj/reviews,5,"{'icon': ['2', '1', '6'], 'num': 1339, 'count'..."
1,https://myanimelist.net/anime/52991/Sousou_no_...,chekkit,https://myanimelist.net/profile/chekkit,https://cdn.myanimelist.net/s/common/userimage...,"Mar 22, 2024",10,Recommended,False,,I feel so catered to.\n\r\nIt feels like an et...,https://myanimelist.net/reviews.php?id=519189,519189,https://myanimelist.net/membership,/profile/chekkit/reviews,25,"{'icon': ['2', '1', '6'], 'num': 1180, 'count'..."
2,https://myanimelist.net/anime/52991/Sousou_no_...,Trikkiez,https://myanimelist.net/profile/Trikkiez,https://cdn.myanimelist.net/s/common/userimage...,"Mar 24, 2024",4,Not Recommended,False,,Style-\r\nFrieren doesn't have its own unique ...,https://myanimelist.net/reviews.php?id=519472,519472,https://myanimelist.net/membership,/profile/Trikkiez/reviews,3,"{'icon': ['3', '4', '1'], 'num': 4068, 'count'..."
3,https://myanimelist.net/anime/52991/Sousou_no_...,ShabbaRico,https://myanimelist.net/profile/ShabbaRico,https://cdn.myanimelist.net/s/common/userimage...,"Jan 12, 2024",5,Not Recommended,True,18/28 eps,"TL;DR: 5/10, I don't recommend this for anyone...",https://myanimelist.net/reviews.php?id=512466,512466,https://myanimelist.net/membership,/profile/ShabbaRico/reviews,12,"{'icon': ['3', '4', '1'], 'num': 902, 'count':..."
4,https://myanimelist.net/anime/52991/Sousou_no_...,TheRealist68,https://myanimelist.net/profile/TheRealist68,https://cdn.myanimelist.net/s/common/userimage...,"Oct 13, 2023",9,Mixed Feelings,True,6/28 eps,"Through 3 episodes, Frieren appears to be a un...",https://myanimelist.net/reviews.php?id=503760,503760,https://myanimelist.net/membership,/profile/TheRealist68/reviews,16,"{'icon': ['1', '4', '6'], 'num': 947, 'count':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2363,https://myanimelist.net/anime/14513/Magi__The_...,ichiban_alex,https://myanimelist.net/profile/ichiban_alex,https://cdn.myanimelist.net/s/common/userimage...,"May 12, 2015",7,Recommended,False,,Magi is certainly not the best shonen anime ev...,https://myanimelist.net/reviews.php?id=187703,187703,https://myanimelist.net/membership,/profile/ichiban_alex/reviews,6,"{'icon': ['1', '2', '3'], 'num': 13, 'count': ..."
2364,https://myanimelist.net/anime/14513/Magi__The_...,aikojazz,https://myanimelist.net/profile/aikojazz,https://cdn.myanimelist.net/s/common/userimage...,"Aug 18, 2014",9,Recommended,False,,I just finished this anime and I'm breathless....,https://myanimelist.net/reviews.php?id=157023,157023,https://myanimelist.net/membership,/profile/aikojazz/reviews,16,"{'icon': ['1', '2'], 'num': 14, 'count': ['13'..."
2365,https://myanimelist.net/anime/14513/Magi__The_...,MajorJett,https://myanimelist.net/profile/MajorJett,https://cdn.myanimelist.net/s/common/userimage...,"Oct 11, 2014",9,Recommended,False,,"When i think Magi, i think action and comedy p...",https://myanimelist.net/reviews.php?id=164485,164485,https://myanimelist.net/membership,/profile/MajorJett/reviews,1,"{'icon': ['1', '3'], 'num': 15, 'count': ['14'..."
2366,https://myanimelist.net/anime/14513/Magi__The_...,Anim3Punk,https://myanimelist.net/profile/Anim3Punk,https://cdn.myanimelist.net/s/common/userimage...,"Aug 22, 2015",8,Recommended,False,,Initial Review for Newcomers: Magi: The Labyri...,https://myanimelist.net/reviews.php?id=196769,196769,https://myanimelist.net/membership,/profile/Anim3Punk/reviews,12,"{'icon': ['1'], 'num': 14, 'count': ['14', '0'..."


In [52]:
# Persist the scraped reviews; the DataFrame index is written as the first column.
df_reviews.to_csv(path_or_buf="../data/top_150_fantasy_reviews.csv", index=True)