The goal of this program is to take a list of search results from Archive of Our Own (AO3) and return a more customized set of search results. In particular, this program returns each story together with a fic quality rating, calculated simply as the number of kudos plus the number of bookmarks, divided by the number of hits (reads) and expressed as a percentage. Eventually, this calculation may become more nuanced, for example by accounting for series.

Inputs:
    AO3 URL

In [1]:
from urllib import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import requests
import string
from lxml import html

# Site root; prepended to the href of each story's title link to build a full URL.
base_url = "http://archiveofourown.org"
# Works listing to scrape. It ends in "page=" so that a page number can be
# appended directly to form the URL of each results page.
# NOTE(review): "*s*" is presumably the site's encoding of "/" inside a tag
# name -- confirm before editing the tag by hand.
search_url = "http://archiveofourown.org/tags/Sherlock%20Holmes*s*Sarah%20Sawyer*s*John%20Watson/works?page="
# Alternative search target, kept for quick switching:
#search_url = "http://archiveofourown.org/tags/Clint%20Barton*s*Phil%20Coulson/works?page="


In [2]:
def process_url(link_url):
    """Fetch one search-results page and return the work blurbs it lists.

    Args:
        link_url: Full URL of a single results page.

    Returns:
        The BeautifulSoup result set of every ``<li>`` matched by the class
        string ``"work blurb group"``; it is empty when nothing matches.
    """
    page = urlopen(link_url)
    try:
        soup = BeautifulSoup(page, "lxml")
    finally:
        # Release the connection even if parsing raises. try/finally is used
        # instead of ``with`` because the Python 2 urlopen result is not a
        # context manager.
        page.close()
    return soup.findAll("li", {"class": "work blurb group"})

In [3]:
def grab_max_page(link_url):
    """Return the number of the last results page of a search.

    Args:
        link_url: URL of one page of the search results.

    Returns:
        The text of the link inside the third-from-last child of the
        ``pagination actions`` list (string-like; callers convert it with
        ``int()``), or ``"1"`` when the page contains no such list.
    """
    page = urlopen(link_url)
    try:
        soup = BeautifulSoup(page, "lxml")
    finally:
        # Release the connection even if parsing raises.
        page.close()

    pages = soup.find("ol", {"class": "pagination actions"})
    if pages is None:
        # find() returns None when nothing matches. Presumably the results fit
        # on a single page in that case, so report one page instead of
        # crashing on ``None.contents``.
        return "1"

    # NOTE(review): the third-from-last child is taken to be the last numbered
    # page entry (presumably followed by a whitespace node and the "Next"
    # entry) -- confirm against the live markup.
    last_page_item = pages.contents[len(pages.contents) - 3]
    return last_page_item.a.contents[0]

In [4]:
def grab_story(story1):
    """Extract one work blurb and append its record to the global ``story_list``.

    The score is computed first, as ``(kudos + bookmarks) / hits * 100``. The
    blurb is skipped (nothing is appended) when any of the three statistics is
    missing or when the hit count is zero.

    Args:
        story1: BeautifulSoup tag for a single work blurb, as returned by
            ``process_url``.

    Returns:
        None. On success a dict with the keys Title, Author, Characters,
        Relationship, Score, Summary, Hits, Kudos, URL, Date and Bookmarks is
        appended to ``story_list``.
    """
    # Attempt to calculate the score first; a blurb that cannot be scored is
    # of no use to the ranking, so bail out before doing any other work.
    try:
        hits = float(story1.findAll("dd", {"class": "hits"})[0].contents[0])
        kudos = float(story1.findAll("dd", {"class": "kudos"})[0].contents[0].contents[0])
        bookmarks = float(story1.findAll("dd", {"class": "bookmarks"})[0].contents[0].contents[0])
        score = (kudos + bookmarks) / hits * 100
    except (IndexError, ZeroDivisionError):
        # IndexError: one of the statistics is absent from the blurb.
        # ZeroDivisionError: the work has zero hits, so the ratio is undefined.
        return

    header = story1.find("h4")
    story_url = base_url + header.a['href']
    title = header.a.contents[0]

    # The author is read from the rel="author" link. find() returns None when
    # the blurb has no such link; fall back to an empty name instead of
    # crashing on ``None.contents``.
    author_link = story1.find("a", {"rel": "author"})
    author = author_link.contents[0] if author_link is not None else ""

    # A blurb without the expected summary structure would otherwise abort the
    # whole scrape, so record an empty summary for it instead.
    try:
        summary = story1.findAll("blockquote", {"class": "userstuff summary"})[0].contents[1].contents[0]
    except (IndexError, AttributeError):
        summary = ""

    date = story1.findAll("p", {"class": "datetime"})[0].contents[0]

    # TODO: character extraction is not implemented yet; the column is kept
    # (always empty) so the output layout stays the same.
    characters = []
    relationship = [
        item.contents[0].contents[0]
        for item in story1.findAll("li", {"class": "relationships"})
    ]

    story_list.append({
        'Title': title,
        'Author': author,
        'Characters': characters,
        'Relationship': relationship,
        'Score': score,
        'Summary': summary,
        'Hits': hits,
        'Kudos': kudos,
        'URL': story_url,
        'Date': date,
        'Bookmarks': bookmarks,
    })


In [15]:
# Scrape every results page and collect one record per scorable story.
story_list = []
page_number = 1
link_url = search_url + str(page_number)
# Named max_page rather than max so the builtin max() stays usable in later
# cells of the session.
max_page = int(grab_max_page(link_url))
for page_number in range(1, max_page + 1):
    link_url = search_url + str(page_number)
    stories = process_url(link_url)
    for story in stories:
        # grab_story appends to story_list itself and skips unscorable blurbs.
        grab_story(story)

In [13]:
# Rank the collected stories from highest to lowest score and display them.
story_list_df = pd.DataFrame(story_list)
if story_list_df.empty:
    # With no collected stories there is no "Score" column to sort by, and
    # sort_values would raise KeyError.
    sorted_stories = story_list_df
else:
    sorted_stories = story_list_df.sort_values(by="Score", ascending=False)
# The parenthesised form prints identically on Python 2 and is valid Python 3.
print(sorted_stories)

             Author  Bookmarks Characters         Date     Hits  Kudos  \
21          Sadbhyl        8.0         []  10 Oct 2010    568.0   40.0   
9           Sadbhyl       19.0         []   1 Jul 2011   1525.0   83.0   
5      alizarin_nyc       11.0         []   4 Jan 2012   1893.0   92.0   
2       Mazarin221b       28.0         []  23 Aug 2012   3110.0  139.0   
11       Saathi1013       10.0         []   1 Apr 2011   2766.0  134.0   
10      Pyjamapants      150.0         []  29 May 2011  13900.0  461.0   
7    marysutherland        9.0         []   5 Nov 2011   1838.0   70.0   
1         lie_to_me        1.0         []   7 Sep 2013    877.0   29.0   
20       Saathi1013       24.0         []  13 Dec 2010   5319.0  151.0   
17       Saathi1013        2.0         []  28 Dec 2010   2629.0   81.0   
14       Saathi1013        2.0         []   1 Feb 2011   2797.0   85.0   
16       Saathi1013        2.0         []   4 Jan 2011   2523.0   74.0   
15       Saathi1013        2.0        