In [None]:
"""
Author: Ra Cohen (ra.q.cohen@gmail.com)
Date: May 4, 2023
Based on: Graphs and Tropes Experiments by Aleksei Dorkin (@slowwavesleep)
Original URL: https://github.com/slowwavesleep/GraphsAndTropesExperiments
"""

In [2]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd

In [3]:
# Site root; get_page_html appends a page path to this to build the request URL.
BASE_URL = 'https://tvtropes.org/'

In [4]:
# Default HTTP headers for get_page_html: a browser-like User-agent string.
USER_AGENT = {'User-agent':
              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

In [5]:
# Namespaces ("kinds") that type_from_kind maps to the generic 'Work' page type.
MEDIA = ('Anime', 'ComicBook', 'Fanfic', 'Literature',
         'Myth', 'TabletopGame', 'Toys', 'Franchise', 'VideoGame',
         'Webcomic', 'AudioPlay', 'WesternAnimation', 'Wrestling',
         'Podcast', 'Music', 'Blog', 'ComicStrip', 'Theatre')

In [6]:
# Namespaces that type_from_kind keeps as their own page type instead of
# folding them into 'Work'.
# NOTE(review): presumably these are the kinds that can be matched against
# IMDb records -- confirm.
IMDB_MATCHABLE = ('Film', 'Series')

In [7]:
def get_page_html(path, url=BASE_URL, user_agent=USER_AGENT, timeout=30, parser=None):
    """
    Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    path : str
        Page path (optionally with a query string) relative to ``url``.
    url : str
        Site root the path is appended to.
    user_agent : dict
        HTTP headers sent with the request.
    timeout : float
        Seconds to wait for the server before ``requests`` raises a timeout
        error; without it a stalled connection would block forever.
    parser : str or None
        Parser name handed to BeautifulSoup (e.g. ``'html.parser'``).
        ``None`` keeps BeautifulSoup's automatic parser selection.

    Returns
    -------
    BeautifulSoup
        The parsed document.
    """
    # The site root ends with '/' and page paths start with '/', so plain
    # concatenation produced 'https://host//path'. Join with exactly one slash.
    full_url = url.rstrip('/') + '/' + path.lstrip('/')
    html = requests.get(full_url, headers=user_agent, timeout=timeout).text
    return bs(html, features=parser)

In [8]:
def get_current_url(page, base_url=BASE_URL):
    """
    Read the page's own address from its ``<p id="current_url">`` element.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed page.
    base_url : str
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    str or None
        The element's text with the domain prefix removed, or ``None`` when
        the page has no such element (previously an AttributeError).
    """
    element = page.find('p', {'id': 'current_url'})
    if element is None:
        return None
    return strip_domain(element.text)

In [9]:
def strip_domain(url):
    """
    Remove a leading ``http(s)://<host>.org/`` prefix from ``url``.

    The host part may not contain '/', so only the scheme and host are
    removed. The previous greedy pattern ran up to the *last* ``?org/`` in the
    string and could swallow part of the path. Strings without such a prefix
    (for example site-relative paths) are returned unchanged.

    Parameters
    ----------
    url : str
        Absolute or relative page address.

    Returns
    -------
    str
        The address without the scheme/host prefix. Note that the slash that
        follows the host is removed as well.
    """
    return re.sub(r'https?://[^/\s]*\.org/', '', url, count=1)

In [10]:
def get_info_from_url(url):
    """
    Split a wiki page address into its namespace and page identifier.

    The namespace is the path segment right after ``php/``; the identifier is
    the last path segment.

    Returns
    -------
    tuple of (str, str)
        ``(kind, name)``.

    Raises
    ------
    IndexError
        If the address does not have the expected shape.
    """
    matches = re.findall('php/([^/]+).*/([^/]+)$', url)
    first_match = matches[0]
    return first_match[0], first_match[1]

In [11]:
def get_name(name):
    """
    Turn a CamelCase page identifier into space-separated words.

    A word is a capital followed by lowercase letters, or a run of capitals
    that stops before the next capitalised word. Characters that fit neither
    shape (digits, for instance) are dropped.
    """
    word_pattern = re.compile(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))')
    words = (match.group(0) for match in word_pattern.finditer(name))
    return " ".join(words)

In [12]:
def type_from_kind(kind, media=MEDIA, imdb_matchable=IMDB_MATCHABLE):
    """
    Map a wiki namespace to the page type used as a graph label.

    'Main' pages are tropes, 'Creator' pages are creators, namespaces listed
    in ``imdb_matchable`` keep their own name, namespaces listed in ``media``
    become 'Work', and anything else is 'Other'.
    """
    if kind == 'Main':
        return 'Trope'
    if kind == 'Creator':
        return 'Creator'
    if kind in imdb_matchable:
        return kind
    if kind in media:
        return 'Work'
    return 'Other'

In [13]:
class Page:
    """
    A wiki page identified by its address.

    Attributes
    ----------
    url : str
        The address with the domain prefix removed.
    kind : str
        Namespace taken from the address.
    name : str
        Human-readable name derived from the CamelCase identifier.
    ptype : str
        Page type derived from ``kind`` (see ``type_from_kind``).
    """

    def __init__(self, url):
        namespace, identifier = get_info_from_url(url)
        self.url = strip_domain(url)
        self.kind = namespace
        self.name = get_name(identifier)
        self.ptype = type_from_kind(namespace)

    def __repr__(self):
        return '{} : {}'.format(self.ptype, self.name)
    

In [14]:
def get_references(page):
    """
    Collect the wiki pages linked from a page's example list.

    Links are taken from the ``<li>`` items inside ``div.folder`` blocks or,
    when the page has no folders, from the first ``<ul>`` after the first
    ``<h2>``. Only ``a.twikilink`` anchors are considered.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed page.

    Returns
    -------
    list of Page
        One entry per distinct target address, in document order, excluding
        links back to the page itself and pages whose type is 'Other'.
        Empty when the page has neither folders nor a list after an ``<h2>``.
    """
    def _key(address):
        # Hrefs may be site-relative while the page's own address is
        # domain-stripped; compare both without domain and leading slash.
        return strip_domain(address).lstrip('/')

    current_url = get_current_url(page)
    current_key = _key(current_url) if current_url else None

    folders = page.findAll('div', {'class': 'folder'})
    if folders:
        items = [li for folder in folders for li in folder.findAll('li')]
    else:
        heading = page.find('h2')
        listing = heading.findNext('ul') if heading is not None else None
        items = listing.findAll('li') if listing is not None else []

    # Deduplicate on the target address: two anchors with different link text
    # but the same href describe the same page.
    hrefs = {}
    for li in items:
        for link in li.findAll('a', {'class': 'twikilink'}):
            href = link.get('href')
            if not href:
                continue
            key = _key(href)
            if key != current_key:
                hrefs.setdefault(key, href)

    references = [Page(href) for href in hrefs.values()]
    return [reference for reference in references if reference.ptype != 'Other']
        

In [2]:
from neo4j import GraphDatabase, basic_auth

In [3]:
import os

# Connection settings are read from the environment so that no credential is
# stored in the notebook. NEO4J_PASSWORD is required (KeyError when unset);
# NEO4J_URI and NEO4J_USER fall back to the values used previously.
# NOTE(review): a password was committed here before -- rotate it.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "neo4j+s://ae079f80.databases.neo4j.io"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ["NEO4J_PASSWORD"]),
)

In [4]:
# Shared session; it is the default ``session`` argument of the query helpers
# defined below.
session = driver.session()

In [18]:
def add_node(page_obj, session=session):
    """
    MERGE a node for ``page_obj`` into the graph.

    The node carries the labels ``Page`` and the page's ``ptype`` and the
    properties ``name``, ``kind`` and ``url``. The label is spliced into the
    query text; the property values are passed as query parameters.
    """
    cypher = f'MERGE (p:Page:{page_obj.ptype}{{name:$name, kind: $kind, url: $url}})'
    properties = {'name': page_obj.name, 'kind': page_obj.kind, 'url': page_obj.url}
    session.run(cypher, **properties)

In [19]:
# Smoke test: insert a single page node.
add_node(Page('/pmwiki/pmwiki.php/VideoGame/PlagueInc'))

In [20]:
def list_node_properties(url, session=session):
    """Return the list of ``Page`` nodes whose ``url`` property equals ``url``."""
    cypher = 'MATCH (n:Page{url: $url}) RETURN n'
    result = session.run(cypher, url=url)
    return result.value()

In [21]:
# Read the node back to check that it was stored.
list_node_properties('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

[<Node element_id='4:0ee1c312-2185-45df-8494-f142ea9115d5:2' labels=frozenset({'Work', 'Page'}) properties={'kind': 'VideoGame', 'name': 'Plague Inc', 'url': '/pmwiki/pmwiki.php/VideoGame/PlagueInc'}>]

In [22]:
def update_name(url, name, session=session):
    """Set the ``name`` property of every ``Page`` node stored under ``url``."""
    cypher = 'MATCH (n:Page{url: $url}) SET n.name = $name'
    arguments = {'url': url, 'name': name}
    session.run(cypher, **arguments)

In [23]:
# Smoke test: overwrite the stored name of the demo node.
update_name('/pmwiki/pmwiki.php/VideoGame/PlagueInc', 'test')

In [24]:
# The node should now show the updated name.
list_node_properties('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

[<Node element_id='4:0ee1c312-2185-45df-8494-f142ea9115d5:2' labels=frozenset({'Work', 'Page'}) properties={'kind': 'VideoGame', 'name': 'test', 'url': '/pmwiki/pmwiki.php/VideoGame/PlagueInc'}>]

In [25]:
def delete_node(url, session=session):
    """Delete every ``Page`` node stored under ``url`` together with its relationships."""
    cypher = 'MATCH (p:Page{url: $url}) DETACH DELETE (p)'
    session.run(cypher, url=url)

In [26]:
# Smoke test: remove the demo node again.
delete_node('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

In [27]:
# After the delete this lookup should return an empty list.
list_node_properties('/pmwiki/pmwiki.php/VideoGame/PlagueInc')

[]

In [28]:
def delete_all(session=session):
    """Delete every node in the database together with all relationships."""
    wipe_query = 'MATCH(n) DETACH DELETE n'
    session.run(wipe_query)

In [29]:
# WARNING: wipes every node and relationship in the connected database.
# This runs on "Run All" -- make sure the target database is disposable.
delete_all()

In [30]:
def add_reference(page_from, page_to, session=session):
    """
    MERGE a ``REFERENCE`` relationship from ``page_from`` to ``page_to``.

    Both endpoints must already exist as ``Page`` nodes; nothing is created
    when either is missing. Nodes are looked up by ``url`` rather than by
    display name: names are derived from the last path segment only, so pages
    in different namespaces can share a name and a name lookup would link
    every one of them.

    Parameters
    ----------
    page_from, page_to : Page
        Source and target of the reference.
    session
        Neo4j session used to run the query.
    """
    query = (
        'MATCH (a:Page {url: $from_url})\n'
        'MATCH (b:Page {url: $to_url})\n'
        "MERGE (a)-[r:REFERENCE{visual:'-'}]->(b)"
    )
    session.run(query, from_url=page_from.url, to_url=page_to.url)

In [31]:
def fill_references(url, session=session):
    """
    Create a node for the page at ``url`` and link it to every page it references.

    Parameters
    ----------
    url : str
        Page path as accepted by ``get_page_html``.
    session
        Neo4j session used for all writes. It is forwarded to the helpers;
        previously the argument was accepted but the helpers always fell back
        to the module-level session.
    """
    page = get_page_html(url)
    current = Page(url)
    add_node(current, session=session)
    for reference in get_references(page):
        # The target node has to exist before the relationship can be merged.
        add_node(reference, session=session)
        add_reference(current, reference, session=session)

In [32]:
def get_nodes_n(session=session):
    """Return the total number of nodes in the database."""
    cypher = 'MATCH (n) RETURN count(n) as count'
    record = session.run(cypher).single()
    return record['count']

In [33]:
def get_edges_n(session=session):
    """
    Return the total number of relationships in the database.

    The pattern is directed on purpose: an undirected ``()-[r]-()`` pattern
    matches each relationship once per direction and so reports twice the
    real count.
    """
    query = 'MATCH ()-[r]->() RETURN count(r) as count'
    return session.run(query).single()['count']

In [34]:
def get_node_edges(url, session=session):
    """Count the relationships, in either direction, that touch the node stored under ``url``."""
    cypher = 'MATCH ()-[r]-({url: $url}) RETURN count(r)'
    counts = session.run(cypher, url=url).value()
    return counts[0]

In [35]:
def get_urls(session=session):
    """List the ``url`` property of each ``Page`` node in the database."""
    cypher = 'MATCH (n:Page) RETURN n.url'
    urls = []
    for row in session.run(cypher).values():
        urls.append(row[0])
    return urls

In [36]:
from collections import Counter

def list_mutual_neigbors(n=10, session=session):
    """
    Return the ``n`` tropes most often shared between pairs of works.

    Parameters
    ----------
    n : int
        Number of entries to return.
    session
        Neo4j session used to run the query.

    Returns
    -------
    list of (str, int)
        ``(trope url, count)`` pairs, most frequent first. The query yields
        one row per ordered pair of works referencing the trope, so a trope
        shared by k works is counted k * (k - 1) times.
    """
    query = 'MATCH (a:Work)-[:REFERENCE]->(t:Trope)<-[:REFERENCE]-(b:Work) RETURN t.url'
    # Result.value() returns the first column of every record; the former
    # Result.records() accessor is not part of the current driver API.
    trope_urls = session.run(query).value()
    return Counter(trope_urls).most_common(n)

[The Common Neighbors algorithm](https://neo4j.com/docs/graph-algorithms/current/labs-algorithms/common-neighbors/)

In [37]:
def common_neighbors(url1, url2, session=session):
    """
    Return the common-neighbours score of the two ``Page`` nodes stored under
    ``url1`` and ``url2``.

    NOTE(review): the ``algo.*`` function presumably comes from the Graph
    Algorithms plugin linked in the notebook text -- confirm it is installed
    on the target database.
    """
    cypher = ''.join([
        'MATCH (p1:Page{url:$url1})',
        'MATCH (p2:Page{url:$url2})',
        'RETURN algo.linkprediction.commonNeighbors(p1, p2) AS score',
    ])
    scores = session.run(cypher, url1=url1, url2=url2).value()
    return scores[0]

Random Walk

In [38]:
def random_walk(url, steps=3, walks=1, session=session):
    """
    Run the random-walk procedure starting at the ``Page`` node stored under ``url``.

    Returns the driver's result object unconsumed; each record carries the
    ``url`` of one visited node in the ``page`` column.
    """
    cypher = '''MATCH (home:Page {url: $url})
               CALL algo.randomWalk.stream(id(home), $steps, $walks)
               YIELD nodeIds

               UNWIND nodeIds AS nodeId

               RETURN algo.asNode(nodeId).url AS page'''
    arguments = {'url': url, 'steps': steps, 'walks': walks}
    return session.run(cypher, **arguments)

Similarity algorithms utilize node properties which aren't used at the moment.

Graph statistics

In [39]:
def graph_stats(session=session):
    """
    Return degree statistics of the graph in the current database.

    Returns
    -------
    dict
        Keys ``average_refs``, ``stdev_refs``, ``max_refs`` and ``min_refs``:
        aggregates of the ``REFERENCE`` degree over all ``Page`` nodes.
    """
    query = '''MATCH (p:Page)
               RETURN avg(apoc.node.degree(p,'REFERENCE')) as average_refs,
               stdev(apoc.node.degree(p,'REFERENCE')) as stdev_refs,
               max(apoc.node.degree(p,'REFERENCE')) as max_refs,
               min(apoc.node.degree(p,'REFERENCE')) as min_refs'''
    # The query returns only aggregates, i.e. a single row. Result.single()
    # fetches it directly; the former Result.records() accessor is not part
    # of the current driver API.
    record = session.run(query).single()
    keys = ('average_refs', 'stdev_refs', 'max_refs', 'min_refs')
    return {key: record[key] for key in keys}

In [40]:
def all_trope_parser(i):
    """
    Parse one page of the trope index into ``Page`` objects.

    Parameters
    ----------
    i : int
        Number of the index page to fetch.

    Returns
    -------
    list of Page
        One entry per table cell that contains a link. Cells without a link
        (empty cells, or cells whose first child is plain text) are skipped
        instead of raising.
    """
    url = "/pmwiki/pagelist_having_pagetype_in_namespace.php?n=Main&t=trope&page=" + str(i)
    cells = get_page_html(url).findAll('td')

    tropes = []
    for cell in cells:
        link = cell.find('a', href=True)
        if link is None:
            continue
        tropes.append(Page(strip_domain(link['href'])))
    return tropes

Get all our trope nodes

In [49]:
def get_all_trope_nodes(first_page=1, last_page=59):
    """
    Collect the tropes listed on a range of trope-index pages.

    Parameters
    ----------
    first_page, last_page : int
        Inclusive range of index pages to fetch. The defaults reproduce the
        range that used to be hard-coded; pass a smaller range for a quick
        trial run.

    Returns
    -------
    list of Page
        Tropes from all requested index pages, in page order.
    """
    all_tropes = []
    for page_number in range(first_page, last_page + 1):
        all_tropes.extend(all_trope_parser(page_number))
    return all_tropes

In [50]:
def add_all_nodes(page_list):
    """
    Add a graph node for every page in ``page_list``.

    The loop iterates the argument itself; it used to read the global
    ``all_tropes`` and ignore whatever list was passed in.
    """
    for page_obj in page_list:
        add_node(page_obj)

In [53]:
# Scrape the trope index (one HTTP request per index page) and show how many
# tropes were found.
all_tropes = get_all_trope_nodes()
len(all_tropes)

29027

In [None]:
# Write one node per scraped trope; one query is issued for each trope.
add_all_nodes(all_tropes)

For each trope, find its related subtropes

In [56]:
def add_related_trope(page_from, page_to, session=session):
    """
    MERGE a ``RELATED`` relationship from ``page_from`` to ``page_to``.

    Both endpoints are looked up by their ``name`` property; nothing is
    created when either lookup finds no node.
    """
    cypher = '\n'.join([
        'MATCH (a:Page) WHERE a.name = $from_node',
        'MATCH (b) WHERE b.name = $to_node',
        "MERGE (a)-[r:RELATED{visual:'-'}]->(b)",
    ])
    session.run(cypher, from_node=page_from.name, to_node=page_to.name)

In [47]:
def get_related_tropes(url):
    """
    Collect the tropes linked from the body text of the page at ``url``.

    Links are taken from the ``<p>`` elements inside ``div#main-article`` or,
    when the page has no such block, from the first ``<ul>`` after the first
    ``<h2>``. Only ``a.twikilink`` anchors are considered.

    Parameters
    ----------
    url : str
        Page path as accepted by ``get_page_html``.

    Returns
    -------
    list of Page
        One entry per distinct target address, in document order, restricted
        to pages of type 'Trope' and excluding links back to ``url`` itself.
    """
    def _key(address):
        # Compare addresses without domain prefix and leading slash so that
        # relative and absolute forms of the same page are treated as equal.
        return strip_domain(address).lstrip('/')

    # Fetch once and keep the tree: the fallback branch needs it too (it used
    # to reference an undefined name ``page``).
    page = get_page_html(url)
    articles = page.findAll('div', {'id': 'main-article'})
    if articles:
        containers = [p for article in articles for p in article.findAll('p')]
    else:
        heading = page.find('h2')
        listing = heading.findNext('ul') if heading is not None else None
        containers = listing.findAll('li') if listing is not None else []

    own_key = _key(url)
    hrefs = {}
    for container in containers:
        for link in container.findAll('a', {'class': 'twikilink'}):
            href = link.get('href')
            if not href:
                continue
            key = _key(href)
            if key != own_key:
                # Deduplicate on the address, not on the anchor element.
                hrefs.setdefault(key, href)

    related_tropes = [Page(href) for href in hrefs.values()]
    return [related_trope for related_trope in related_tropes if related_trope.ptype == 'Trope']

In [57]:
# Crawl every trope page and link the trope to the tropes mentioned on it.
# Each iteration fetches one page, so the loop is long-running for the full
# trope list. The loop variables remain defined at module level afterwards.
for trope in all_tropes:
    related_tropes = get_related_tropes(trope.url)
    for related_trope in related_tropes:
        add_related_trope(trope, related_trope)