##### TopWebFiction Scraper

this scraper collects highest-rated story information from TopWebFiction.com, the most popular website on the internet for ranking english-language serialized web fiction (stories that are published one chapter at a time). It loads the data and joins it into two clean pandas dataframes, one of which contains tags for each story and the other contains related stories.

In [2]:
#IMPORT RELEVANT LIBRARIES
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import matplotlib.pyplot as plt
import csv

#### Function to collect all the relevant data from a single story

In [5]:
def getPageData(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    #FIND THE STORY TITLES ON ONE PAGE
    story_titles = soup.findAll('span', attrs={"class":"title"})
    titles = []
    for row in story_titles:
        titles.append(row.text.strip())

    #FIND THE STORY AUTHORS ON ONE PAGE
    story_authors = soup.findAll('span', attrs={"class":"byline"})
    authors = []
    for row in story_authors:
        authors.append(row.text.strip())
    for x in range(len(authors)):
        authors[x] = re.sub('by ','',authors[x])

    #GET STORY VOTES FOR ONE PAGE
    story_votes = soup.findAll('td', attrs={"class":"info"})
    votes = []
    for x in story_votes:
        votes.append(x.text)
    votes2 = votes
    for x in range(len(votes)):
        votes2[x] = re.sub(r'\n', '', str(votes[x]))
        votes2[x] = re.sub(r'\xa0boosters', '', str(votes2[x]))

    #GET STORY TAGS FOR ONE PAGE
    story_tags = soup.findAll('p', attrs={"class":"tags"})
    tags = []
    for x in story_tags:
        tags.append(x.text)
    for x in range(len(tags)):
        tags[x] = re.sub(' ', '-', str(tags[x]))
        tags[x] = re.sub(r'\n', ' ', str(tags[x]))
        tags[x] = re.split(' ', str(tags[x]))
    
    #GET TITLES FOR STORIES WITH MOST SHARED VOTERS
    links = []
    for link in soup.find_all('a', attrs={'href': re.compile("^https://topwebfiction.com/listings/")}):
        links.append(link.get('href'))

    def get_related_links_soup(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        related_stories_soup = soup.findAll('span', attrs={"class":"title"})
        return(related_stories_soup)

    def get_related_links(soup):
        link_list = []
        for x in range(len(soup)):
            link_list.append(soup[x].text)
        return(link_list[10:])

    def relatedLinkFetch(url): 
        linksoup = get_related_links_soup(url)
        return(get_related_links(linksoup))

    rel_stories = []
    for x in links:
        rel_stories.append(relatedLinkFetch(x))
    
    #REMOVE STORIES WITH NO TAGS.
    if ('Google' in titles) == True:
        ind = titles.index('Google')
        del(titles[ind])
        del(authors[ind])
        del(votes[ind])
        del(rel_stories[ind])
    
    if ('Out of New York City' in titles) == True:
        ind = titles.index('Out of New York City')
        del(titles[ind])
        del(authors[ind])
        del(votes[ind])
        del(rel_stories[ind])
    
    if ('Project Shelf Life' in titles) == True:
        ind = titles.index('Project Shelf Life')
        del(titles[ind])
        del(authors[ind])
        del(votes[ind])
        del(rel_stories[ind])  
    
    #PUT ALL THE COLUMNS TOGETHER INTO A DATAFRAME
    page_df = pd.DataFrame(
        {'Title': titles,
         'Author': authors,
         'Votes': votes,
         'Tags': tags,
         'Related Stories': rel_stories
        })
    
    return(page_df)

In [4]:
#RUN THE SCRAPER ON THE FIRST 16 PAGES OF HIGHEST-RATED STORIES, WHICH INCLUDE ALL STORIES WITH AT LEAST 2 UNIQUE VOTES
dfs = []
base_url = 'https://topwebfiction.com/?ranking=at&page='
for i in range(1,16):
    df = getPageData(base_url+str(i))
    dfs.append(df)
    
df = pd.concat(dfs)
df.index = range(len(df))

In [6]:
#CREATE DUMMY VARIABLES FOR EACH TAG AND EACH RELATED STORY SO THEY CAN BE ANALYZED, AND SPLIT THEM INTO TWO SEPERATE DATAFRAMES
tags_dum = pd.get_dummies(df['Tags'].apply(pd.Series).stack()).sum(level=0)
related_stories_dum = pd.get_dummies(df['Related Stories'].apply(pd.Series).stack()).sum(level=0)

tags_df = pd.concat([df, tags_dum], axis=1)
rs_df = pd.concat([df, related_stories_dum], axis=1)

tags_df = tags_df.drop(['Tags', 'Related Stories'], axis=1)
rs_df = rs_df.drop(['Tags', 'Related Stories'], axis=1)

In [22]:
#TURN THE DATAFRAMES INTO CSV'S
tags_df.to_csv('tags_df.csv')
rs_df.to_csv('rs_df.csv')

EDA Questions:

1. What are the most popular tags?
   What are the most popular tags for the most popular (top 25%) stories?
   What are the most popular tags for the least popular (bottom 25%) stories?
2. What's the distribution of story votes?
3. Which authors have multiple stories?
4. Is the list of most popular related stories the same as the list of most popular stories?

In [21]:
sums = []
for x in tags_df.columns:
        try:
            sums.append(tags_df[x].sum())
        except:
            sums.append(0)

In [48]:
sums[0] = 'NA'
sums[1] = 'NA'
sums[2] = 'NA'
sums[3] = 'NA'

In [None]:
d = {'Title':list(tags_df.columns), 'Sums':sums}
t = pd.DataFrame(d).transpose()
t = t.rename(columns = t.iloc[0])
t = t.drop(['Title'])
t = t.drop(columns=['Title', 'Author','Votes', ''])
t = t.sort_values(by=['Sums'], ascending=False ,axis=1)

In [153]:
t

Unnamed: 0,fantasy,adventure,action,magic,science-fiction,romance,complete,superhero,science-fantasy,series,...,secret-agents,christmas,self-esteem,clones,mercenaries,space-horror,spanish,spies,dolphins,epistolary
Sums,471,325,295,176,109,99,97,94,93,91,...,1,1,1,1,1,1,1,1,1,1


In [138]:
sums = []
for x in rs_df.columns:
        try:
            sums.append(rs_df[x].sum())
        except:
            sums.append(0)

In [140]:
sums[0] = 'NA'
sums[1] = 'NA'
sums[2] = 'NA'

In [142]:
d = {'Title':list(rs_df.columns), 'Sums':sums}
r = pd.DataFrame(d).transpose()
r = r.rename(columns = r.iloc[0])
r = r.drop(['Title'])
r = r.drop(columns=['Title', 'Author','Votes'])
r = r.sort_values(by=['Sums'], ascending=False ,axis=1)

In [179]:
rankcompare = r.transpose() 

In [182]:
rankcompare['relindex'] = range(len(rankcompare))
rankcompare['Title'] = rankcompare.index
#rankcompare['absindex'] = 

In [189]:
pd.set_option('display.max_rows', None)

In [190]:
pd.merge(rankcompare, rs_df, on='Title').iloc[:,1:20]

Unnamed: 0,relindex,reltitle,Title,Author_x,Votes_x,100 Luck and the Dragon Tamer Skill!_x,23 Pangbourne Place_x,A Bad Idea_x,A Devil in Gods Country_x,A Grey World_x,A Journey of Black and Red_x,A Lament of Gods and Monsters_x,A Practical Guide to Evil_x,A Rosary of Stones and Thorns_x,ATL: Stories from the Retrofuture_x,Above Ground_x,Aconitum_x,Addergoole_x,Advent_x
0,0,A Practical Guide to Evil,A Practical Guide to Evil,ErraticErrata,47838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Worm,Worm,Wildbow,39169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,The Wandering Inn,The Wandering Inn,pirateaba,22388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,The Gods are Bastards,The Gods are Bastards,D. D. Webb,14916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,The Legion of Nothing,The Legion of Nothing,Jim Zoetewey,9858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,The Zombie Knight Saga,The Zombie Knight Saga,George M. Frost,14940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,Metaworld Chronicles,Metaworld Chronicles,Wutosama,20080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,The Iron Teeth,The Iron Teeth,ClearMadness,11040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,Twig,Twig,Wildbow,18919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,9,The Good Student,The Good Student,mooderino,10454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
tags_df.head()

Unnamed: 0,Title,Author,Votes,Unnamed: 4,action,addiction,adventure,ai,airships,aliens,...,western,witches,wizards,wolves,wuxia,xianxia,youkai,young-adult,young-love,zombies
0,A Practical Guide to Evil,ErraticErrata,47838,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Worm,Wildbow,39169,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ward,wildbow,24135,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,The Wandering Inn,pirateaba,22388,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Metaworld Chronicles,Wutosama,20080,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
#TITLES OF RELATED STORIES
rs_df.head()

Unnamed: 0,Title,Author,Votes,100 Luck and the Dragon Tamer Skill!,23 Pangbourne Place,A Bad Idea,A Devil in Gods Country,A Grey World,A Journey of Black and Red,A Lament of Gods and Monsters,...,World Domination in Retrospect,World-ruling Dungeon,Worm,Worth the Candle,"Yes, Your Highness",Your Typical Isekai LitRPG,Zeppelins are What Dreams are Made of,aka,asa kraiya,blacklight
0,A Practical Guide to Evil,ErraticErrata,47838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Worm,Wildbow,39169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ward,wildbow,24135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,The Wandering Inn,pirateaba,22388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Metaworld Chronicles,Wutosama,20080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
