In [1]:
#Maria Williams  - Nov, 2021
#LibraryThing scraper: finds the book each movie was adapted from, then scrapes features from each book page

In [10]:
#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options 
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
import numpy as np
import requests
import re

#this script uses selenium and chromedriver to crawl the page
!pip install selenium
!pip install chromedriver-py



# First Step:
Get links to the book pages that correspond to a list of movies

In [13]:
def MovieFirst(baseurl, title):
    """Search LibraryThing and return the first result's title and link.

    Parameters
    ----------
    baseurl : str
        Search URL template containing one ``{}`` placeholder for the term.
    title : str
        Search term, e.g. a movie title optionally followed by
        ``[<year> film]`` (the standard format used by LibraryThing).

    Returns
    -------
    tuple
        ``(found_title, link)`` taken from the first ``item`` on the results
        page. The found title may differ from the search term. When no result
        shows up before the timeout (or the first result carries no link),
        the original search term and ``None`` are returned.
    """
    timeout = 5  #seconds to wait for the first search result to appear

    #initiate driver
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome('chromedriver', options=options)

    #wait for page to load item, then get contents
    try:
        driver.get(baseurl.format(title))
        WebDriverWait(driver, timeout).until(ec.presence_of_element_located((By.CLASS_NAME, 'item')))
        stuff = driver.page_source
    except TimeoutException:
        return title, None
    finally:
        #always shut the browser down; quitting only on success left a
        #headless Chrome process behind for every search that timed out
        driver.quit()

    #get the first item title and link
    soup = BeautifulSoup(stuff, features="html.parser")
    first_item = soup.find(attrs={'class':'item'})
    first_link = first_item.find('a') if first_item is not None else None
    if first_link is None:
        #treat a result without an anchor like "nothing found" instead of crashing the batch
        return title, None

    return first_link.string, first_link.get('href')

def BookSecond(link1):
    """Read the 'Is an adaptation of' relationship from a LibraryThing work page.

    Parameters
    ----------
    link1 : str
        Site-relative link to a work page (supposedly a movie page); it is
        appended to ``https://www.librarything.com``.

    Returns
    -------
    tuple
        ``(book, link2)``: the title and site-relative link of the work this
        page is an adaptation of. ``(None, None)`` when the relationships
        section does not load before the timeout or no adaptation entry
        is present.
    """
    timeout = 5  #seconds to wait for the relationships section to appear

    #initiate driver
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver2 = webdriver.Chrome('chromedriver', options=options)

    try:
        driver2.get('https://www.librarything.com'+link1)
        WebDriverWait(driver2, timeout).until(ec.presence_of_element_located((By.ID, 'relationships_container')))
        stuff = driver2.page_source
    except TimeoutException:
        return None, None
    finally:
        #always shut the browser down; quitting only on success left a
        #headless Chrome process behind for every page that timed out
        driver2.quit()

    #get the book title and link
    soup = BeautifulSoup(stuff, features="html.parser")
    label = soup.find(string="Is an adaptation of")
    if label is None:
        return None, None

    #the source work is the 'popup_registered' anchor inside the element following the label
    following = label.find_next()
    source = following.find(attrs={'class':'popup_registered'}) if following is not None else None
    if source is None:
        return None, None

    return source.string, source.get('href')

def ScrapeSearch(title, moviedate):
    """Find a movie's LibraryThing entry and the work it was adapted from.

    Uses ``MovieFirst`` to search and ``BookSecond`` to read adaptation info.

    Search order:
      1. ``"<title> [<moviedate> film]"``. If a movie link is found, read its
         adaptation info; if there is none, search the bare title and take
         the first result as the book.
      2. If step 1 finds no link, search the bare title and, if a link is
         found, read its adaptation info.

    Returns ``[newtitle, link1, book, link2]``: the found movie title, the link
    to the movie entry, the book title, and the link to the book entry.
    ``link1``, ``book`` and ``link2`` are ``None`` when nothing was found.
    """
    #url puzzle pieces
    baseurl = 'https://www.librarything.com/search.php?search={}&searchtype=newwork_titles&searchtype=newwork_titles&sortchoice=0'
    dated_search = ''.join([title, ' [', str(moviedate), ' film]'])

    #search by movie and date
    newtitle, link1 = MovieFirst(baseurl, dated_search)

    if link1 is not None:
        #movie entry found: go into it and get the book title and link
        book, link2 = BookSecond(link1)
        if link2 is None:
            #no adaptation info: search again by just the movie title and
            #assume the first result is the book
            #(results are not filtered for a missing ']', so this returned many bad results)
            book, link2 = MovieFirst(baseurl, title)
        return [newtitle, link1, book, link2]

    #no results with the date, so search by movie title only
    newtitle, link1 = MovieFirst(baseurl, title)
    if link1 is None:
        #still nothing, oh well
        return [newtitle, link1, None, None]

    #otherwise, check that one for adaptation info
    book, link2 = BookSecond(link1)
    return [newtitle, link1, book, link2]
    

Use functions defined above to create database of movies and corresponding source material

In [14]:
#pull in the IMDb list (1342 lines)
moviesdb = pd.read_csv('CleanMovieData.csv')

#batch maker: only rows from BATCH_START onward are scraped in this run
BATCH_START = 1242
moviesdb = moviesdb.iloc[BATCH_START:]

#scrape: ScrapeSearch returns one [SearchTerm, Link1, AdaptationOf, Link2] record per movie
results = [ScrapeSearch(movie_title, movie_date)
           for movie_title, movie_date in zip(moviesdb['Title'], moviesdb['Date'])]

#build the dataframe once from all records
#(DataFrame.append grew the frame row by row and no longer exists in pandas 2.0)
ohgood = pd.DataFrame(results, columns = ['SearchTerm', 'Link1', 'AdaptationOf', 'Link2'])

#info() prints the summary itself and returns None, hence the trailing 'None' in the output
print(ohgood.info())
#ohgood.to_csv('LTlinks5.csv', index = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   SearchTerm    99 non-null     object
 1   Link1         99 non-null     object
 2   AdaptationOf  89 non-null     object
 3   Link2         89 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB
None


In [None]:
#combine batch files
batch_files = ['LTlinks1.csv', 'LTlinks2.csv', 'LTlinks3.csv', 'LTlinks4.csv', 'LTlinks5.csv']
one, two, three, four, five = (pd.read_csv(batch_file) for batch_file in batch_files)

#stack the batches, then discard repeated rows and renumber the index from 0
together = pd.concat([one, two, three, four, five], ignore_index=True)
together = together.drop_duplicates(ignore_index=True)

print(together.info())

#together.to_csv('LTlinksFull.csv',index=False)

# Second Step:
Use list generated to scrape each novel page for features

In [15]:
def _count_wiki_items(groups, fieldname):
    """Count the entries listed for `fieldname` among the page's 'fwikiGroup' blocks.

    Uses the last group containing an element with that `fieldname` attribute.
    If the group shows an 'itemnumberoverflow' element, the first integer in
    its text is the count; otherwise the 'fwikiAtomicValue' elements in the
    group are counted. Returns 0 when no group carries the field.
    """
    matching = [group for group in groups
                if group.find(attrs={'fieldname': fieldname}) is not None]
    if not matching:
        return 0
    group = matching[-1]

    overflow = group.find(attrs={'class':'itemnumberoverflow'})
    if overflow is not None:
        digits = re.findall(r'\d+', overflow.text)
        if digits:
            return int(digits[0])
    return len(group.find_all(attrs={'class':'fwikiAtomicValue'}))


#function that scrapes an individual page
def GetBookPlease(page):
    """Scrape one LibraryThing work page for book features.

    Parameters
    ----------
    page : str
        Site-relative link such as '/work/4725'; it is appended to
        ``https://www.librarything.com``.

    Returns
    -------
    list
        ``[page, published, author, series, rating, charnum, awardnum]``:
        ``published`` and ``author`` are text from the 'headsummary' block (or
        None when absent), ``series`` is 1 if any h3 in that block contains
        'Series:' and 0 otherwise, ``rating`` is the text of the first
        'dark_hint' element (or None), and ``charnum`` / ``awardnum`` are
        integer counts of the 'characternames' / 'awards' entries (0 when
        the field is not on the page).

    Notes
    -----
    Each count is looked up independently. Previously a page without an
    'awards' field reused the character group and reported the character
    count as the award count, and a page without 'characternames' raised
    UnboundLocalError. Counts are now always int (the overflow branch used to
    return a digit string).
    """
    #print(page)

    #get response
    response = requests.get(url='https://www.librarything.com'+ page).text
    soup = BeautifulSoup(response, features="html.parser")

    #get top of page - author, publish date, if part of series
    published = None
    author = None
    series = 0
    top = soup.find(attrs={'class':'headsummary'})
    if top is not None:
        date_tag = top.find(attrs={'class':'date'})
        if date_tag is not None:
            published = date_tag.string
        author_tag = top.find('h2')
        if author_tag is not None:
            author = author_tag.text
        if any('Series:' in heading.text for heading in top.find_all('h3')):
            series = 1

    #get rating
    rating_tag = soup.find(attrs={'class':'dark_hint'})
    rating = rating_tag.text if rating_tag is not None else None

    #get number of characters and number of awards
    groups = soup.find_all(attrs={'class':'fwikiGroup'})
    charnum = _count_wiki_items(groups, 'characternames')
    awardnum = _count_wiki_items(groups, 'awards')

    #NOTE: this will still return values if the link is for a movie
    return [page, published, author, series, rating, charnum, awardnum]


In [16]:
#pull in the link list
#(moviesdb is reused here: it now holds the LibraryThing link table, not the IMDb list)
moviesdb = pd.read_csv('LTlinksFull.csv')

#book links, one per row of the link table (NaN where no book link was found)
links = moviesdb.loc[:, 'Link2']

In [None]:
#batch maker: only rows from BATCH_START onward are scraped in this run
#slicing moviesdb (not links) keeps this cell safe to re-run and keeps
#Link1 / Link2 / SearchTerm aligned on the same rows
BATCH_START = 576
batch = moviesdb.iloc[BATCH_START:]

#if there is no Link2 AND the search term did not have ] in it, try Link1
#this has to happen before scraping, otherwise the substituted links are never fetched
no_book_link = batch['Link2'].isna()
plain_title = ~batch['SearchTerm'].astype(str).str.contains(']', regex=False)
links = batch['Link2'].mask(no_book_link & plain_title, batch['Link1']).reset_index(drop=True)

#run through every usable link (links look like '/work/4041453'); rows with no link are skipped
records = [GetBookPlease(link) for link in links.dropna()]

#build the dataframe once from all records
#(DataFrame.append grew the frame row by row and no longer exists in pandas 2.0)
library = pd.DataFrame(records, columns = ['link','published', 'author', 'series', 'rating', 'charnum', 'awardnum'])

#info() prints the summary itself; wrapping it in print() only added a stray 'None'
library.info()


In [None]:
#library.to_csv('LTScrape3.csv', index = False)

In [None]:
#this cell is just to slap everything together since I had trouble running the full scrape
scrape_files = ['LTScrape1.csv', 'LTScrape2.csv', 'LTScrape3.csv']
one, two, three = (pd.read_csv(scrape_file) for scrape_file in scrape_files)

#stack the batches, then discard repeated rows and renumber the index from 0
together = pd.concat([one, two, three])
#together = together.applymap(str)
together = together.drop_duplicates(ignore_index=True)

print(together.info())

together.to_csv('LTScrapeFull.csv',index=False)

# Step 3:
Combine and clean 

In [None]:
#merge the list and final sheets together
one = pd.read_csv('LTlinksFull.csv')      #movie -> book link table
two = pd.read_csv('LTScrapeFull.csv')     #scraped features, keyed by 'link'

#attach scraped features to every row whose Link2 matches a scraped link
LT = pd.merge(one, two, how='left', left_on='Link2', right_on='link')
#print(LT.info())

#only use rows where Link1 and Link2 are different
#if they match, it was a return from where there was no adaptation info
LT = LT.loc[LT['Link1'].ne(LT['Link2'])]

In [None]:
#fill in any missing with Link1
#LT already carries the scraped columns from the Link2 merge, so this second
#merge yields '<col>_x' (from Link2) and '<col>_y' (from Link1) pairs
LT = LT.merge(two, how='left', left_on = 'Link1', right_on = 'link')

#where the Link2 value is missing, take the Link1 value
#assign the result back: an inplace fillna on a selected column is a chained
#operation that does not update LT under pandas Copy-on-Write
feature_cols = ['published', 'author', 'series', 'rating', 'charnum', 'awardnum']
for col in feature_cols:
    LT[col + '_x'] = LT[col + '_x'].fillna(LT[col + '_y'])

#drop both merge keys and the now-redundant Link1-side columns
LT = LT.drop(columns=['link_x', 'link_y'] + [col + '_y' for col in feature_cols])

print(LT.head())

In [None]:
#LT.to_csv('LibraryThingFull.csv',index=False)
#info() prints the summary itself and returns None; wrapping it in print() only added a stray 'None'
LT.info()