"""
Author: Ra Cohen (ra.q.cohen@gmail.com)
Date: May 4, 2023
Based on: Graphs and Tropes Experiments by Aleksei Dorkin (@slowwavesleep)
Original URL: https://github.com/slowwavesleep/GraphsAndTropesExperiments
"""

In [129]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import html2text

In [None]:
# Root URL of the TV Tropes site; get_page_html uses it as the default base
# that request paths are appended to.
BASE_URL = 'https://tvtropes.org/'

In [None]:
# Default HTTP headers for get_page_html: a single User-agent entry carrying a
# desktop Chrome identification string.
USER_AGENT = {'User-agent':
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

In [None]:
# Page kinds (the URL segment right after `php/`) that type_from_kind maps to
# the generic 'Work' page type.
MEDIA = ('Anime', 'ComicBook', 'Fanfic', 'Literature',
         'Myth', 'TabletopGame', 'Toys', 'Franchise', 'VideoGame',
         'Webcomic', 'AudioPlay', 'WesternAnimation', 'Wrestling',
         'Podcast', 'Music', 'Blog', 'ComicStrip', 'Theatre')

In [None]:
# Page kinds that type_from_kind keeps as their own page type instead of
# folding them into 'Work'.
# NOTE(review): the name suggests these kinds can be matched against IMDb
# records -- confirm against the code that consumes them.
IMDB_MATCHABLE = ('Film', 'Series')

In [None]:
def get_page_html(path, url=BASE_URL, user_agent=USER_AGENT, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        path: Path of the page relative to ``url``; a leading slash is
            optional.
        url: Base URL the path is appended to.
        user_agent: Headers dict sent with the request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the caller.

    Returns:
        The BeautifulSoup object built from the response body.
    """
    # The default base ends with '/' and callers pass paths that start with
    # '/', so join on exactly one slash instead of producing '...org//pmwiki'.
    full_url = url.rstrip('/') + '/' + path.lstrip('/')
    html = requests.get(full_url, headers=user_agent, timeout=timeout).text
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed; pin one (e.g. 'html.parser') if results must be identical
    # across machines.
    return bs(html)

In [None]:
def get_current_url(page, base_url=BASE_URL):
    """Return the page's own URL with the domain stripped.

    The URL is read from the text of the ``<p id="current_url">`` element of
    the parsed page and passed through ``strip_domain``.

    Args:
        page: Parsed page (BeautifulSoup object) to inspect.
        base_url: Unused; retained so existing callers keep working.

    Returns:
        The domain-stripped URL string.

    Raises:
        AttributeError: If the page has no ``current_url`` element.
    """
    current = page.find('p', {'id': 'current_url'})
    return strip_domain(current.text)

In [None]:
def strip_domain(url):
    """Remove a leading ``http(s)://<host>.org/`` prefix from ``url``.

    Only the scheme and host at the very start of the string are removed, so
    path segments that happen to end in "org/" are left intact. Strings that
    do not start with such a prefix (e.g. already-relative paths) are
    returned unchanged.

    Args:
        url: Absolute or relative URL string.

    Returns:
        The URL without its scheme/host prefix; note the result carries no
        leading slash when a prefix was removed.
    """
    # Anchored, with the host limited to non-slash characters and the dot
    # escaped: the match can never extend into the path.
    return re.sub(r'^https?://[^/]*\.org/', '', url)

In [None]:
def get_info_from_url(url):
    """Split a wiki URL into its page kind and raw page name.

    The kind is the path segment directly after ``php/``; the name is the
    final path segment of the URL.

    Returns:
        A ``(kind, name)`` tuple of strings.

    Raises:
        IndexError: If the URL does not contain the expected pattern.
    """
    matcher = re.compile('php/([^/]+).*/([^/]+)$')
    first_hit = matcher.findall(url)[0]
    return first_hit[0], first_hit[1]

In [None]:
def get_name(name):
    """Turn a CamelCase page name into space-separated words.

    Each word is either a capital followed by lowercase letters, or a run of
    capitals that stops before the next capitalised word (or at the end of
    the string). Characters that belong to neither form are dropped.
    """
    word_pattern = r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))'
    return ' '.join(hit.group(0) for hit in re.finditer(word_pattern, name))

In [None]:
def type_from_kind(kind, media=MEDIA, imdb_matchable=IMDB_MATCHABLE):
    """Map a page kind to its page type.

    'Main' becomes 'Trope' and 'Creator' stays 'Creator'. A kind listed in
    ``imdb_matchable`` is returned unchanged, a kind listed in ``media``
    becomes 'Work', and everything else is 'Other'. The checks run in that
    order, so the first match wins.
    """
    if kind == 'Main':
        return 'Trope'
    if kind == 'Creator':
        return 'Creator'
    if kind in imdb_matchable:
        return kind
    if kind in media:
        return 'Work'
    return 'Other'

In [None]:
class Page:
    """A wiki page described by its URL.

    Attributes set on construction:
        url: The given URL after ``strip_domain``.
        kind: Page kind extracted from the URL by ``get_info_from_url``.
        name: Page name from the URL, split into words by ``get_name``.
        ptype: Page type derived from ``kind`` by ``type_from_kind``.
    """

    def __init__(self, url):
        self.url = strip_domain(url)
        page_kind, raw_name = get_info_from_url(url)
        self.ptype = type_from_kind(page_kind)
        self.kind = page_kind
        self.name = get_name(raw_name)

    def __repr__(self):
        # Rendered as "<ptype> : <name>", e.g. "Trope : Aardvark Trunks".
        return '{} : {}'.format(self.ptype, self.name)
    

In [None]:
def get_references(page):
    """Collect the wiki pages linked from a parsed page's list items.

    List items are taken from every ``div.folder`` on the page; when the page
    has no folders, from the first ``<ul>`` following the first ``<h2>``.
    Within those items every ``a.twikilink`` is considered.

    Links are de-duplicated by href (first occurrence wins, so the result
    order follows the page), links back to the page itself are dropped, and
    pages whose type is 'Other' are filtered out.

    Args:
        page: Parsed page (BeautifulSoup object).

    Returns:
        A list of ``Page`` objects; empty when no suitable list is found.

    Raises:
        IndexError: If a link's href does not look like a wiki page URL
            (propagated from ``Page``).
    """
    # get_current_url returns a domain-stripped URL, while hrefs are usually
    # site-relative with a leading slash; compare both in the same
    # slash-less relative form so self-links are actually recognised.
    own_url = get_current_url(page).lstrip('/')

    folders = page.find_all('div', {'class': 'folder'})
    if folders:
        items = [li for folder in folders for li in folder.find_all('li')]
    else:
        heading = page.find('h2')
        first_list = heading.find_next('ul') if heading is not None else None
        items = first_list.find_all('li') if first_list is not None else []

    # A dict keeps insertion order and gives one entry per distinct target.
    hrefs = {}
    for item in items:
        for link in item.find_all('a', {'class': 'twikilink'}):
            href = link.get('href')
            if href and strip_domain(href).lstrip('/') != own_url:
                hrefs.setdefault(href, None)

    linked_pages = [Page(href) for href in hrefs]
    return [linked for linked in linked_pages if linked.ptype != 'Other']
        

In [111]:
def get_related_tropes(url):
    """Return the tropes linked from the article text of the page at ``url``.

    The page is downloaded once. Links (``a.twikilink``) are read from the
    ``<p>`` elements inside ``div#main-article``; when that container is
    missing, from the items of the first ``<ul>`` after the first ``<h2>``.

    Links are de-duplicated by href (first occurrence wins), links pointing
    back at ``url`` are dropped, and only pages whose type is 'Trope' are
    kept.

    Args:
        url: Site-relative path of the page, e.g.
            '/pmwiki/pmwiki.php/Main/AardvarkTrunks'.

    Returns:
        A list of ``Page`` objects in page order.

    Raises:
        IndexError: If a link's href does not look like a wiki page URL
            (propagated from ``Page``).
    """
    # Keep the parsed page in a variable: the fallback branch needs it too.
    page = get_page_html(url)
    articles = page.find_all('div', {'id': 'main-article'})
    if articles:
        containers = [p for article in articles for p in article.find_all('p')]
    else:
        heading = page.find('h2')
        first_list = heading.find_next('ul') if heading is not None else None
        containers = first_list.find_all('li') if first_list is not None else []

    # Compare in slash-less relative form so absolute and relative spellings
    # of the page's own URL are both recognised as self-links.
    own_url = strip_domain(url).lstrip('/')
    hrefs = {}
    for container in containers:
        for link in container.find_all('a', {'class': 'twikilink'}):
            href = link.get('href')
            if href and strip_domain(href).lstrip('/') != own_url:
                hrefs.setdefault(href, None)

    linked_pages = [Page(href) for href in hrefs]
    return [linked for linked in linked_pages if linked.ptype == 'Trope']

In [112]:
# Example run: list the tropes linked from the AardvarkTrunks article (this
# downloads the page).
get_related_tropes('/pmwiki/pmwiki.php/Main/AardvarkTrunks')

[Trope : Sub Trope,
 Trope : Clamshells As Mouths,
 Trope : Overly Long Tongue,
 Trope : Vacuum Mouth,
 Trope : Funny Animal Anatomy,
 Trope : Toothy Bird,
 Trope : Somewhere A Mammalogist Is Crying,
 Trope : Mouthy Bird]

In [148]:
def get_sub_super_tropes(url):
    """Find the sub-tropes and super-tropes named in a trope's article text.

    The page is downloaded once and the ``<p>`` elements of
    ``div#main-article`` are scanned (falling back to the items of the first
    ``<ul>`` after the first ``<h2>``). Each paragraph that mentions
    "Sub-Trope" or "Super-Trope" is converted to Markdown with ``html2text``
    and split into sentences on '. '. Every double-quoted string in a
    sentence mentioning "Sub-Trope" is collected as a sub-trope, and likewise
    for "Super-Trope"; the link to the SubTrope / SuperTrope definition page
    itself is excluded.

    NOTE(review): the quoted strings are presumably the link title
    attributes, which hold the target path on this site -- confirm against a
    live page.

    Args:
        url: Site-relative path of the trope page.

    Returns:
        A ``(sub_tropes, super_tropes)`` tuple; both elements are lists of
        ``Page`` objects.

    Raises:
        IndexError: If a collected string does not look like a wiki page URL
            (propagated from ``Page``).
    """
    sub_meta = '/pmwiki/pmwiki.php/Main/SubTrope'
    super_meta = '/pmwiki/pmwiki.php/Main/SuperTrope'

    # Keep the parsed page in a variable: the fallback branch needs it too.
    page = get_page_html(url)
    articles = page.find_all('div', {'id': 'main-article'})
    if articles:
        paragraphs = [p for article in articles for p in article.find_all('p')]
    else:
        heading = page.find('h2')
        first_list = heading.find_next('ul') if heading is not None else None
        paragraphs = first_list.find_all('li') if first_list is not None else []

    sub_tropes = []
    super_tropes = []
    for paragraph in paragraphs:
        plain_text = paragraph.get_text()
        # Both membership tests must be spelled out; `"a" or "b" in text` is
        # always true because the non-empty literal "a" is truthy.
        if "Sub-Trope" not in plain_text and "Super-Trope" not in plain_text:
            continue
        sentences = html2text.html2text(str(paragraph)).split('. ')
        for sentence in sentences:
            quoted = re.findall('"([^"]*)"', sentence)
            # Filter instead of list.remove(): the definition-page link may
            # be absent from the sentence, and remove() would raise then.
            if "Sub-Trope" in sentence:
                sub_tropes.extend(q for q in quoted if q != sub_meta)
            if "Super-Trope" in sentence:
                super_tropes.extend(q for q in quoted if q != super_meta)

    sub_pages = [Page(target) for target in sub_tropes]
    super_pages = [Page(target) for target in super_tropes]
    # Return Page objects for both halves (the raw super-trope strings used
    # to be returned by mistake).
    return sub_pages, super_pages

In [149]:
# Example run: extract the sub- and super-tropes mentioned in the
# AardvarkTrunks article (this downloads the page).
get_sub_super_tropes('/pmwiki/pmwiki.php/Main/AardvarkTrunks')

([Trope : Somewhere A Mammalogist Is Crying, Trope : Funny Animal Anatomy], [])

In [91]:
def all_trope_parser(i):
    """Return the pages listed on page ``i`` of the site's trope index.

    Downloads the Main-namespace trope listing for the given page number,
    reads the ``href`` of the first child of every ``<td>`` cell, strips the
    domain from it, and wraps each resulting URL in a ``Page``.

    Args:
        i: Page number of the listing to download.

    Returns:
        A list of ``Page`` objects, one per table cell, in listing order.
    """
    listing_path = f"/pmwiki/pagelist_having_pagetype_in_namespace.php?n=Main&t=trope&page={i!s}"
    cells = get_page_html(listing_path).findAll('td')
    # Extract every href first, then build the Page objects.
    trope_urls = [strip_domain(cell.contents[0]['href']) for cell in cells]
    return [Page(trope_url) for trope_url in trope_urls]



In [93]:
# Gather the Page objects from listing pages 1 through 59 into one flat list
# (one download per listing page).
all_tropes = []
for i in range(1, 60):
    all_tropes.extend(all_trope_parser(i))
# Bare expression so the notebook cell displays the collected list.
all_tropes

[Trope : Aardvark Trunks,
 Trope : Abandoned Area,
 Trope : Abandoned Camp Ruins,
 Trope : Abandoned Catchphrase,
 Trope : Abandoned Hospital,
 Trope : Abandoned Hospital Awakening,
 Trope : Abandoned Info Page,
 Trope : Abandoned Laboratory,
 Trope : Abandoned Mascot,
 Trope : Abandoned Mine,
 Trope : Abandoned Pet In A Box,
 Trope : Abandoned Playground,
 Trope : Abandoned War Child,
 Trope : Abandoned Warehouse,
 Trope : Abandon Ship,
 Trope : Abandon Shipping,
 Trope : Abandon The Disabled,
 Trope : Abandonware,
 Trope : Abbey Road Crossing,
 Trope : Abdicate The Throne,
 Trope : Abduction Is Love,
 Trope : A Beast In Name And Nature,
 Trope : Abhorrent Admirer,
 Trope : Abilene Paradox,
 Trope : Ability Depletion Penalty,
 Trope : Ability Mixing,
 Trope : Ability Over Appearance,
 Trope : Ability Required To Proceed,
 Trope : A Birthday Not A Break,
 Trope : Abled In The Adaptation,
 Trope : A Bloody Mess,
 Trope : AB Negative,
 Trope : Abnormal Allergy,
 Trope : Abnormal Ammo,
 T