##### TopWebFiction Scraper

This webscraper collects highest-rated story information from TopWebFiction.com, the most popular website on the internet for ranking english-language serialized web fiction (stories that are published one chapter at a time). It loads the data and joins it into two clean pandas dataframes, one of which contains tags for each story and the other contains top 10 related stories for that story.

In [2]:
#IMPORT RELEVANT LIBRARIES
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import matplotlib.pyplot as plt
import csv

#### Function to collect all the relevant data from a single story

In [3]:
def getPageData(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    #FIND THE STORY TITLES ON ONE PAGE
    story_titles = soup.findAll('span', attrs={"class":"title"})
    titles = []
    for row in story_titles:
        titles.append(row.text.strip())

    #FIND THE STORY AUTHORS ON ONE PAGE
    story_authors = soup.findAll('span', attrs={"class":"byline"})
    authors = []
    for row in story_authors:
        authors.append(row.text.strip())
    for x in range(len(authors)):
        authors[x] = re.sub('by ','',authors[x])

    #GET STORY VOTES FOR ONE PAGE
    story_votes = soup.findAll('td', attrs={"class":"info"})
    votes = []
    for x in story_votes:
        votes.append(x.text)
    votes2 = votes
    for x in range(len(votes)):
        votes2[x] = re.sub(r'\n', '', str(votes[x]))
        votes2[x] = re.sub(r'\xa0boosters', '', str(votes2[x]))

    #GET STORY TAGS FOR ONE PAGE
    story_tags = soup.findAll('p', attrs={"class":"tags"})
    tags = []
    for x in story_tags:
        tags.append(x.text)
    for x in range(len(tags)):
        tags[x] = re.sub(' ', '-', str(tags[x]))
        tags[x] = re.sub(r'\n', ' ', str(tags[x]))
        tags[x] = re.split(' ', str(tags[x]))
    
    #GET TITLES FOR STORIES WITH MOST SHARED VOTERS
    links = []
    for link in soup.find_all('a', attrs={'href': re.compile("^https://topwebfiction.com/listings/")}):
        links.append(link.get('href'))

    def get_related_links_soup(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        related_stories_soup = soup.findAll('span', attrs={"class":"title"})
        return(related_stories_soup)

    def get_related_links(soup):
        link_list = []
        for x in range(len(soup)):
            link_list.append(soup[x].text)
        return(link_list[10:])

    def relatedLinkFetch(url): 
        linksoup = get_related_links_soup(url)
        return(get_related_links(linksoup))

    rel_stories = []
    for x in links:
        rel_stories.append(relatedLinkFetch(x))
    
    #REMOVE STORIES WITH NO TAGS.
    if ('Google' in titles) == True:
        ind = titles.index('Google')
        del(titles[ind])
        del(authors[ind])
        del(votes[ind])
        del(rel_stories[ind])
    
    if ('Out of New York City' in titles) == True:
        ind = titles.index('Out of New York City')
        del(titles[ind])
        del(authors[ind])
        del(votes[ind])
        del(rel_stories[ind])
    
    if ('Project Shelf Life' in titles) == True:
        ind = titles.index('Project Shelf Life')
        del(titles[ind])
        del(authors[ind])
        del(votes[ind])
        del(rel_stories[ind])  
    
    #PUT ALL THE COLUMNS TOGETHER INTO A DATAFRAME
    page_df = pd.DataFrame(
        {'Title': titles,
         'Author': authors,
         'Votes': votes,
         'Tags': tags,
         'Related Stories': rel_stories
        })
    
    return(page_df)

#### Run the scraper on the first 16 pages of highest-rated stories, which includes all stories with at least 2 unique votes and combine them

In [4]:
dfs = []
base_url = 'https://topwebfiction.com/?ranking=at&page='
for i in range(1,16):
    df = getPageData(base_url+str(i))
    dfs.append(df)
    
df = pd.concat(dfs)
df.index = range(len(df))

In [9]:
df.head()

Unnamed: 0,Title,Author,Votes,Tags,Related Stories
0,A Practical Guide to Evil,ErraticErrata,48114,"[adventure, anti-hero, coming-of-age, fantasy,...","[The Gods are Bastards, The Wandering Inn, Wor..."
1,Worm,Wildbow,39206,"[action, adventure, complete, crime, dark, hig...","[Pact, Twig, Ward, A Practical Guide to Evil, ..."
2,Ward,wildbow,24154,"[action, adventure, alternate-universe, crime,...","[Worm, Twig, Pact, A Practical Guide to Evil, ..."
3,The Wandering Inn,pirateaba,22464,"[fantasy, magic, ]","[A Practical Guide to Evil, The Gods are Basta..."
4,Metaworld Chronicles,Wutosama,20112,"[fantasy, magic, relationships, young-adult, ]","[A Practical Guide to Evil, The Wandering Inn,..."


In [None]:
#### Split the 

In [5]:
#CREATE DUMMY VARIABLES FOR EACH TAG AND EACH RELATED STORY SO THEY CAN BE ANALYZED, AND SPLIT THEM INTO TWO SEPERATE DATAFRAMES
tags_dum = pd.get_dummies(df['Tags'].apply(pd.Series).stack()).sum(level=0)
related_stories_dum = pd.get_dummies(df['Related Stories'].apply(pd.Series).stack()).sum(level=0)

tags_df = pd.concat([df, tags_dum], axis=1)
rs_df = pd.concat([df, related_stories_dum], axis=1)

tags_df = tags_df.drop(['Tags', 'Related Stories'], axis=1)
rs_df = rs_df.drop(['Tags', 'Related Stories'], axis=1)

In [6]:
#TURN THE DATAFRAMES INTO CSV'S
tags_df.to_csv('tags_df.csv')
rs_df.to_csv('rs_df.csv')

In [7]:
tags_df.head()

Unnamed: 0,Title,Author,Votes,Unnamed: 4,action,addiction,adventure,ai,airships,aliens,...,western,witches,wizards,wolves,wuxia,xianxia,youkai,young-adult,young-love,zombies
0,A Practical Guide to Evil,ErraticErrata,48114,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Worm,Wildbow,39206,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ward,wildbow,24154,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,The Wandering Inn,pirateaba,22464,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Metaworld Chronicles,Wutosama,20112,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
rs_df.head()

Unnamed: 0,Title,Author,Votes,100 Luck and the Dragon Tamer Skill!,23 Pangbourne Place,A Bad Idea,A Devil in Gods Country,A Grey World,A Journey of Black and Red,A Lament of Gods and Monsters,...,Wonder City Stories,World Domination in Retrospect,World-ruling Dungeon,Worm,Worth the Candle,"Yes, Your Highness",Your Typical Isekai LitRPG,aka,asa kraiya,blacklight
0,A Practical Guide to Evil,ErraticErrata,48114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Worm,Wildbow,39206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ward,wildbow,24154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Wandering Inn,pirateaba,22464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Metaworld Chronicles,Wutosama,20112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


####Summary

In this notebook I scraped the data from topwebfiction.com