In [145]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests
from langdetect import detect
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys

In [2]:
## Build a table of the books on the Goodreads "Time Magazine's All-Time 100 Novels" list

GOODREADS_ROOT = 'https://www.goodreads.com'
LIST_URL = "https://www.goodreads.com/list/show/2681.Time_Magazine_s_All_Time_100_Novels"

TimesTop100 = requests.get(LIST_URL).content

## Each book on the list page is selected as an <a> tag with the class 'bookTitle'
soup = BeautifulSoup(TimesTop100, "lxml")
cont = soup.select("a.bookTitle")

## Titles (bookT): the stripped text of every anchor
bookT = []
for anchor in cont:
    bookT.append(anchor.text.strip())

## Links (bookLink): the site root prepended to every anchor's relative href
bookLink = []
for anchor in cont:
    bookLink.append(GOODREADS_ROOT + anchor.get('href'))

## Place both lists side by side and wrap them in a two-column DataFrame
con = np.column_stack((bookT, bookLink))
Top100Books = pd.DataFrame(con, columns=['Book Title', 'Book Link'])

In [10]:
# Retrieve the link to the reviews page of every book.
# The fetched HTML does not always contain the reviews button (the original note blames
# requests.get being flaky), so each book page is re-requested until the button is found.
# Load time for the full list: ~11.5 min.

MAX_ATTEMPTS = 25   # stop retrying a single book after this many fetches instead of looping forever
RETRY_DELAY = 2     # seconds to wait between attempts so the site is not flooded with requests

ReviewPageLink = []
for i in bookLink:
    cont = None
    for attempt in range(MAX_ATTEMPTS):
        r = requests.get(i).content
        soup = BeautifulSoup(r, "lxml")
        cont = soup.find("a", attrs = {'class' : 'Button Button--transparent Button--small'})
        if cont is not None:
            break
        time.sleep(RETRY_DELAY)
    else:
        # The loop ran out of attempts without ever finding the button
        raise RuntimeError(
            'No reviews link found on {url} after {n} attempts'.format(url=i, n=MAX_ATTEMPTS)
        )

    ReviewPageLink.append('https://www.goodreads.com' + cont['href'])

# The unique ID Goodreads assigns to a book is the first run of digits in its main-page link
# (raw string: '\d' is an invalid escape sequence in a normal string literal)
BookID = [re.search(r'\d+', i)[0] for i in Top100Books['Book Link']]

# Add the new information to the dataframe
Top100Books['Review Page Link'] = ReviewPageLink
Top100Books['Unique ID'] = BookID

# Saving the dataframe to a csv file is left disabled here; a later cell reads
# 'Top100BooksData.csv', so uncomment the line below to (re)create that file.
# Top100Books.to_csv('Top100BooksData.csv', index=False)

In [None]:
## Helper that keeps re-requesting a page until its HTML can be processed,
## for use when iterating through multiple links while scraping

# The function currently returns the result of one search; returning just the soup might be more useful,
# because the soup could then be reused for multiple searches if required.
# Caveat: by design the search target must be an element that really exists on the page,
# otherwise the retries can never succeed.

def spamRequestsFind(link, tag, attr, attr_id, max_attempts=10, delay=1.0):
    """Request `link` repeatedly until the wanted element appears in the parsed HTML.

    The previous loop tested ``type(soup.find(...))`` against ``None``; a type
    object is never ``None``, so the page was only ever fetched once. The loop
    now checks the search result itself.

    Parameters
    ----------
    link : str
        URL to request.
    tag : str
        Name of the tag to search for.
    attr : str
        Attribute name the tag must carry.
    attr_id : str
        Required value of that attribute (searched as ``attrs={attr: attr_id}``).
    max_attempts : int, optional
        Maximum number of requests before giving up (default 10).
    delay : float, optional
        Seconds to sleep between attempts (default 1.0); 0 disables the pause.

    Returns
    -------
    The first matching element returned by ``soup.find``, or ``None`` when no
    attempt produced a match.
    """
    for attempt in range(max_attempts):
        # Pause between retries only, never before the first request
        if attempt > 0 and delay > 0:
            time.sleep(delay)

        r = requests.get(link).content
        soup = BeautifulSoup(r, "lxml")

        # Search once and reuse the result instead of running the same find twice
        found = soup.find(tag, attrs = {attr : attr_id})
        if found is not None:
            return found

    return None

    

In [3]:
# Load the book table from 'Top100BooksData.csv' (presumably the file produced by the
# commented-out to_csv call in the scraping cell - confirm it exists before running).
Top100Books = pd.read_csv('Top100BooksData.csv')
# Show the first rows as a quick check of the loaded columns.
Top100Books.head()

Unnamed: 0,Book Title,Book Link,Review Page Link,Unique ID
0,To Kill a Mockingbird,https://www.goodreads.com/book/show/2657.To_Ki...,https://www.goodreads.com/book/show/2657/revie...,2657
1,1984,https://www.goodreads.com/book/show/5470.1984,https://www.goodreads.com/book/show/5470/revie...,5470
2,The Lord of the Rings,https://www.goodreads.com/book/show/33.The_Lor...,https://www.goodreads.com/book/show/33/reviews...,33
3,The Catcher in the Rye,https://www.goodreads.com/book/show/5107.The_C...,https://www.goodreads.com/book/show/5107/revie...,5107
4,The Great Gatsby,https://www.goodreads.com/book/show/4671.The_G...,https://www.goodreads.com/book/show/4671/revie...,4671


In [5]:
def _review_rating(shelf_status):
    """Return the rating label of one 'div.ShelfStatus' element, or 'No Rating'."""
    # A rating is only read when the element has at least one direct <span> child
    if shelf_status.findChildren('span', recursive=False):
        return shelf_status.find('span')['aria-label']
    return 'No Rating'


def _review_likes(footer):
    """Return the like text of one 'footer.SocialFooter' element, or '0'."""
    if footer.find('div', attrs={'class': 'SocialFooter__statsContainer'}) is None:
        return '0'
    # NOTE(review): this takes the first label in the footer; if a review can have
    # comments but no likes, that label may be the comment count - confirm against the page.
    return footer.find('span', attrs={'class': 'Button__labelItem'}).text


def _review_comments(footer):
    """Return the comment text of one 'footer.SocialFooter' element, or '0'."""
    first_button = footer.find('div', attrs={'class': 'Button__container'})
    # A footer without any button container is treated like one without comments
    if first_button is None or first_button.next_sibling is None:
        return '0'
    return first_button.next_sibling.text


def MultBookReviews(page_source):
    """Parse the reviews contained in a Goodreads review page into a DataFrame.

    Parameters
    ----------
    page_source : str
        HTML of a review page (e.g. ``driver.page_source``).

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns 'User Href', 'Title', 'Rating',
        'Date', 'Likes', 'Comments' and 'Review'. 'Title' is the page's book
        title repeated on every row.

    Raises
    ------
    ValueError
        If the per-review fields found on the page do not all have the same
        number of entries, since rows could then not be lined up.
    AttributeError
        If the title heading is missing from the page.

    Notes
    -----
    Each field is selected across the whole page and the resulting lists are
    combined by position; this assumes every review contributes exactly one
    element per selector (TODO confirm against the current page layout).
    """
    soup = BeautifulSoup(page_source, 'lxml')

    ## Book title shown on the page
    title = soup.find('h1', attrs = {'class' : 'Text H1Title'}).text

    ## List of all user account hrefs for account page
    hrefsUsers = [profile.find('a')['href'] for profile in soup.select('div.ReviewerProfile__name')]

    ## Text data of user review
    Reviews = [section.text.strip() for section in soup.select("section.ReviewText")]

    ## Individual user rating for each review
    userRatings = [_review_rating(status) for status in soup.select("div.ShelfStatus")]

    ## Date the review was written by user
    datesOfReviews = [row.find('span', attrs = {'class': 'Text Text__body3'}).text
                      for row in soup.select('section.ReviewCard__row')]

    ## Amount of likes and comments for review
    commentLikeCont = soup.select('footer.SocialFooter')
    likes = [_review_likes(footer) for footer in commentLikeCont]
    comments = [_review_comments(footer) for footer in commentLikeCont]

    ## Fail with the per-field counts when the lists cannot be aligned row by row
    fieldCounts = {'User Href' : len(hrefsUsers),
                   'Rating' : len(userRatings),
                   'Date' : len(datesOfReviews),
                   'Likes' : len(likes),
                   'Comments' : len(comments),
                   'Review' : len(Reviews)}
    if len(set(fieldCounts.values())) > 1:
        raise ValueError('Review fields have different lengths: {counts}'.format(counts=fieldCounts))

    ### Creating DataFrame of all the review data
    reviewData = pd.DataFrame({ 'User Href' : hrefsUsers,
                                'Title' : title,
                                'Rating' : userRatings,
                                'Date' : datesOfReviews,
                                'Likes' : likes,
                                'Comments' : comments,
                                'Review' : Reviews})

    return(reviewData)

In [19]:
from selenium.common.exceptions import NoSuchElementException
def check_exists_by_xpath(xpath, driver=None):
    """Report whether an element matching `xpath` exists on the driver's current page.

    The earlier version called ``find_element_by_xpath`` on the ``webdriver``
    module itself, which is not a browser session and has no such function, so
    every call failed. The lookup now runs on an explicit driver instance via
    ``find_element(By.XPATH, ...)``.

    Parameters
    ----------
    xpath : str
        XPath expression to look for.
    driver : selenium WebDriver, optional in the signature but required in practice
        The live browser session to search in.

    Returns
    -------
    bool
        True when the element is found, False when the driver raises
        ``NoSuchElementException``.

    Raises
    ------
    ValueError
        If no driver is supplied.
    """
    if driver is None:
        raise ValueError('check_exists_by_xpath needs a WebDriver instance: pass driver=<your driver>')

    try:
        driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True

In [64]:
def getManyReviews(url, cap=36):
    """Open a review page in Edge, expand it with "show more results" clicks, and parse it.

    Parameters
    ----------
    url : str
        Address of the review page to load.
    cap : int, optional
        Upper bound on the number of "show more results" clicks (default 36,
        the value that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        Whatever ``MultBookReviews`` returns for the final state of the page.

    Notes
    -----
    The number of clicks is ``round(nreviews / 30) - 1`` limited to `cap`, where
    `nreviews` is built from the digits of the first
    'Text Text__body3 Text__subdued' span. This presumably assumes 30 reviews
    per loaded batch - TODO confirm.
    The browser is always closed, including when a step raises.
    """
    show_more_xpath = "//div[@class = 'Divider Divider--contents Divider--largeMargin']/div[@class = 'Button__container']/button"

    # Initializing driver and webpage
    driver = webdriver.Edge()
    try:
        driver.get(url)
        time.sleep(3)   # allow time for the first reviews to load

        soup = BeautifulSoup(driver.page_source, 'lxml')
        count_text = soup.find('span', attrs = {'class' : 'Text Text__body3 Text__subdued'}).text
        # raw string: '\D' is an invalid escape sequence in a normal string literal
        nreviews = int(re.sub(r'\D', '', count_text))

        # One loop replaces the two identical branches: click the estimated number of
        # times, but never more than `cap`
        clicks_needed = min(np.round(nreviews/30)-1, cap)

        clicks = 0
        while clicks < clicks_needed:

            # scrolling down page to ensure click will work on "show more results" button
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Clicking "show more results" button
            SMResults = driver.find_element(By.XPATH, show_more_xpath)
            driver.execute_script("arguments[0].click();", SMResults)
            time.sleep(1)

            clicks += 1

        # Parse the final state of the page after the "show more results" clicks
        reviews = MultBookReviews(driver.page_source)
    finally:
        # Close the browser even if loading, clicking or parsing failed
        driver.quit()

    return(reviews)

In [27]:
# Look up the link to a user's list of reviews, starting from the user's profile page

PROFILE_URL = 'https://www.goodreads.com/user/show/45198798-leynes'

r = requests.get(PROFILE_URL).content
soup = BeautifulSoup(r, "lxml")

# The href of the first link inside the profile's stats block is used as the review-list path
stats_block = soup.find("div", attrs={'class': 'profilePageUserStatsInfo'})
cont = stats_block.find('a')['href']

userReviewPage = 'https://www.goodreads.com' + cont
userReviewPage

'https://www.goodreads.com/review/list/45198798?sort=rating&view=reviews'

In [142]:
# Scrape a handful of books per run so each CSV can be checked before moving on
for i in range(71, 76):
    review_url = Top100Books['Review Page Link'][i]
    bookdata = getManyReviews(review_url)

    # Files are numbered from 1, so row i is written to Book<i+1>.csv
    csv_name = 'Book{num}.csv'.format(num=i+1)
    bookdata.to_csv(csv_name, index=False)

In [144]:
# Read back 'Book76.csv' (the file the batch loop writes for i = 75) to inspect the saved reviews.
pd.read_csv('Book76.csv')

Unnamed: 0,User Href,Title,Rating,Date,Likes,Comments,Review
0,https://www.goodreads.com/user/show/19283284-v...,Appointment in Samarra,Rating 5 out of 5,"May 5, 2021",146 likes,0,Appointment in Samarra is all about life of a ...
1,https://www.goodreads.com/user/show/10490224-o...,Appointment in Samarra,Rating 4 out of 5,"October 3, 2022",99 likes,2 comments,UN PUGNO DI POLVEREViva il proibizionismo.Mi v...
2,https://www.goodreads.com/user/show/6743601-ji...,Appointment in Samarra,Rating 5 out of 5,"December 29, 2019",91 likes,4 comments,I’ll start with two paragraphs that I think il...
3,https://www.goodreads.com/user/show/2922102-bl...,Appointment in Samarra,Rating 4 out of 5,"February 17, 2023",78 likes,5 comments,"Appointment in Samarra is set in 1930, just on..."
4,https://www.goodreads.com/user/show/57810023-g...,Appointment in Samarra,Rating 4 out of 5,"May 4, 2022",69 likes,2 comments,Seguro que conocen la leyenda:\n “Había en Bag...
...,...,...,...,...,...,...,...
923,https://www.goodreads.com/user/show/1878745-laura,Appointment in Samarra,No Rating,"April 11, 2018",0,0,Didn't get caught up enough in it to finish. I...
924,https://www.goodreads.com/user/show/2445431-kr...,Appointment in Samarra,No Rating,"January 4, 2019",0,0,I really liked this. O’Hara grew up in Pottsvi...
925,https://www.goodreads.com/user/show/88049620-m...,Appointment in Samarra,No Rating,"February 16, 2019",0,0,I have the english and the german edition.
926,https://www.goodreads.com/user/show/91987561-a...,Appointment in Samarra,No Rating,"February 16, 2019",0,0,Read for 2009
