In [1]:
import numpy as np
import pandas as pd
from random import choice

from bs4 import BeautifulSoup
import requests
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from IPython.display import clear_output

In [2]:
## Loading review data: rows with any missing value are dropped and the index is renumbered
br = (
    pd.read_csv('../BookReviews.csv')
    .dropna()
    .reset_index(drop=True)
)

## numpy array holding each distinct user href found in the review data
br_u = br['User Href'].unique()

In [3]:
## Defining a function that keeps re-sending a request until the page html can be processed;
## useful when iterating through multiple links while scraping

# returns the parsed soup and the response object

# The caveat is that, by definition, the function must find some element
# that exists on the page in order to work

def spamRequestsFind(link, tag, attr, attr_id, max_attempts=10, delay=1.0):
    """Request ``link`` until the parsed page contains a marker element.

    A fetch counts as successful once
    ``soup.find(tag, attrs={attr: attr_id})`` returns an element.

    Parameters
    ----------
    link : str
        URL to request.
    tag, attr, attr_id : str
        Tag name, attribute name and attribute value identifying the marker
        element that must be present on the page.
    max_attempts : int, optional
        Upper bound on the number of requests; at least one is always made.
    delay : float, optional
        Seconds to sleep between two failed attempts.

    Returns
    -------
    tuple
        ``(soup, r)``: the BeautifulSoup document and the ``requests``
        response of the last attempt. When the marker never showed up the
        last attempt is still returned, so callers keep receiving a pair.
    """
    attempts = max(1, int(max_attempts))
    for attempt in range(attempts):
        r = requests.get(link)
        soup = BeautifulSoup(r.text, "lxml")
        # Test the element itself. The previous code compared type(element)
        # with None, which is never equal, so the loop never retried.
        if soup.find(tag, attrs={attr: attr_id}) is not None:
            break
        if attempt + 1 < attempts:
            time.sleep(delay)
    return (soup, r)

In [4]:
## This function is necessary to retrieve the html at the updated url for users whose page url has changed;
## this occurs when an existing user becomes a goodreads verified author

def updated_url_html(r, driver_path=r"C:\Users\marty\OneDrive - The George Washington University\Documents\CSCI 4443\Project\chromedriver_win32\chromedriver.exe"):
    """Load ``r.url`` in headless Chrome and return the rendered page as soup.

    Parameters
    ----------
    r : object
        Response-like object; only its ``url`` attribute is read.
    driver_path : str, optional
        Location of the ChromeDriver executable. The default is the
        machine-specific path this notebook was written with; pass your own
        path when running elsewhere.

    Returns
    -------
    BeautifulSoup
        ``driver.page_source`` parsed with ``'html.parser'``.
    """
    # Set up the Selenium driver with the path to the ChromeDriver executable
    options = Options()
    options.add_argument('--headless')
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Load the page and grab its source
        driver.get(r.url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even when navigation fails, so no
        # Chrome process is left running
        driver.quit()

    return(BeautifulSoup(html, 'html.parser'))

In [5]:
## grabs user information for each link, provided the link works properly and the page matches one of the acceptable conditions for users

# Column order of the frame returned by getUserInformation. appendLoop appends
# these rows to a CSV without a header, so the order must stay fixed.
_USER_COLUMNS = ('User ID', 'Display Name', 'Average Rating', 'Total Ratings',
                 'Total Reviews', 'Favorite Books', 'Author')


def _numeric_text(text):
    """Keep only digits and dots from ``text``."""
    return re.sub(r'[^\d.]', '', text)


def _extract_user_id(link):
    """Return the numeric id that follows ``/user/show/`` in ``link``.

    Stripping every non-digit from the whole link (the previous approach)
    also swallowed digits from the display-name slug, e.g.
    ``.../user/show/12345-john99`` became ``1234599``. That approach is kept
    only as a fallback for links that do not contain ``/user/show/<digits>``.
    """
    match = re.search(r'/user/show/(\d+)', link)
    if match:
        return match.group(1)
    return re.sub(r'\D', '', link)


def _parse_author_page(soup):
    """Read name and stats from a goodreads author page."""
    name = soup.find('h1', attrs={'class': 'authorName'}).text.strip()
    stat_links = soup.find('div', attrs={'class': 'smallText'}).find_all('a')
    return {
        'Display Name': name,
        'Average Rating': _numeric_text(stat_links[2].text),
        'Total Ratings': _numeric_text(stat_links[0].text),
        'Total Reviews': _numeric_text(stat_links[1].text),
        'Favorite Books': None,
        'Author': 'yes',
    }


def _parse_public_profile(soup):
    """Read name, stats and favorite-book titles from a normal user profile."""
    name = soup.find('h1', attrs={'class': 'userProfileName'}).text.strip()
    stat_links = soup.find('div', attrs={'class': 'profilePageUserStatsInfo'}).find_all('a')

    # Titles of the books shown in the profile's image grid, when present
    grid = soup.find('div', attrs={'class': 'imgGrid'})
    books = None
    if grid is not None:
        books = [img['title'] for img in grid.find_all('img')]

    return {
        'Display Name': name,
        'Average Rating': _numeric_text(stat_links[1].text),
        'Total Ratings': _numeric_text(stat_links[0].text),
        'Total Reviews': _numeric_text(stat_links[2].text.strip()),
        'Favorite Books': books,
        'Author': 'no',
    }


def _parse_private_profile(soup):
    """Read name and stats from a private user profile."""
    name = soup.find('div', attrs={'class': 'mainContentFloat'}).find('h1').text.strip()
    stats = soup.find('div', attrs={'class': 'smallText'})
    avg_rating = _numeric_text(stats.find_all('a')[0].text)

    # text order is 'total_ratings|total_reviews:avg_rating'
    cleaner_info = re.sub(r'[^0-9:|]', '', stats.text.strip().replace('\n', '').replace('\t', ''))

    return {
        'Display Name': name,
        'Average Rating': avg_rating,
        'Total Ratings': cleaner_info.split('|', 1)[0],
        'Total Reviews': re.findall(r'\|(.*?)\:', cleaner_info)[0],
        # Cannot see favorited books due to privacy settings
        'Favorite Books': None,
        'Author': 'no',
    }


def _scrape_user(link):
    """Fetch one user link and return its row as a dict keyed by column name."""
    # The siteHeader div is used as the marker that the html was retrieved
    soup, r = spamRequestsFind(link, 'div', 'class', 'siteHeader')

    if soup.find('h3', attrs={'class': 'right goodreadsAuthor'}) is not None:
        # goodreads author page. The original code had a second author branch
        # guarded by ``r.url != x``; it could never run because this check
        # already matched, so redirected author pages are parsed here too.
        fields = _parse_author_page(soup)
    elif soup.find('h1', attrs={'class': 'userProfileName'}) is not None:
        # normal user account
        fields = _parse_public_profile(soup)
    elif r.url != link and soup.find('div', attrs={'class': 'mainContentFloat'}) is not None:
        # private user whose url changed: re-load the final url in the browser
        fields = _parse_private_profile(updated_url_html(r))
    elif soup.find('h4', attrs={'class': 'gr-h4'}) is not None:
        # page that has been deleted
        fields = {
            'Display Name': 'DELETED',
            'Average Rating': None,
            'Total Ratings': None,
            'Total Reviews': None,
            'Favorite Books': None,
            'Author': None,
        }
    else:
        # private user account
        fields = _parse_private_profile(soup)

    record = {'User ID': _extract_user_id(link)}
    record.update(fields)
    return record


def getUserInformation(linklst, max_retries=None, retry_delay=0.0):
    """Scrape one row of profile information per link in ``linklst``.

    A link whose page cannot be parsed (an ``AttributeError`` is raised while
    reading it) is fetched again. Only that link is retried; rows already
    collected for earlier links are kept instead of re-scraping the batch.

    Parameters
    ----------
    linklst : iterable of str
        User page links.
    max_retries : int or None, optional
        Maximum number of retries per link. ``None`` (default) retries
        indefinitely, which matches the previous behaviour.
    retry_delay : float, optional
        Seconds to sleep before retrying a link (default: no pause).

    Returns
    -------
    pandas.DataFrame
        Columns ``'User ID'``, ``'Display Name'``, ``'Average Rating'``,
        ``'Total Ratings'``, ``'Total Reviews'``, ``'Favorite Books'`` and
        ``'Author'``, one row per link, in input order.

    Raises
    ------
    AttributeError
        When ``max_retries`` is set and a link still fails after that many
        retries.
    """
    records = []
    for link in linklst:
        failures = 0
        while True:
            try:
                records.append(_scrape_user(link))
            except AttributeError:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                if retry_delay:
                    time.sleep(retry_delay)
            else:
                break

    reviewData = pd.DataFrame({column: [record[column] for record in records]
                               for column in _USER_COLUMNS})

    return(reviewData)

In [141]:
# # Appending current dataset to overall dataset
# fulldata = pd.read_excel("UserInfoBank.xlsx")
# agg = pd.read_excel("userInfo_9.xlsx")
# comb_data = pd.concat([fulldata, agg], axis = 0)

# comb_data.to_csv('UserInfoBank.csv', index=False, header=True)
# comb_data.to_excel('UserInfoBank.xlsx', index=False, header=True)

# Load the accumulated user bank and record how many rows it already holds
userIB = pd.read_csv('UserInfoBank.csv')
len_so_far = userIB.shape[0]
len_so_far

59881

In [131]:
# The appendLoop function grabs the info of the next n batches of users, based on the current length of the dataset being appended to and of the overall dataset collected
# Using multiple files provides redundancy in the event a file is lost mid-iteration due to an OS ERROR
# Therefore, it reduces this risk and allows a backup file to exist if that happens

def appendLoop(niters, len_so_far, batch_size=20, out_path="userInfo_9.csv", links=None):
    """Scrape up to ``niters`` batches of users and append them to a CSV.

    Each iteration re-reads ``out_path`` to learn how many rows it already
    holds, scrapes the next ``batch_size`` links after
    ``len_so_far + rows_in_file`` and appends the result without a header.

    Parameters
    ----------
    niters : int
        Maximum number of batches to scrape.
    len_so_far : int
        Number of links already covered by previously saved datasets.
    batch_size : int, optional
        Links scraped per iteration (default 20).
    out_path : str, optional
        CSV file that is read for its length and appended to. It must
        already exist, since it is read before anything is written.
    links : sequence of str, optional
        Links to draw from; defaults to the notebook-level ``br_u`` array.
    """
    if links is None:
        links = br_u

    for _ in range(niters):
        rows_in_file = len(pd.read_csv(out_path))
        start = len_so_far + rows_in_file
        linklst = links[start:start + batch_size]

        # Nothing left to scrape: stop instead of spinning through the
        # remaining iterations on empty batches
        if len(linklst) == 0:
            break

        revData = getUserInformation(linklst)
        revData.to_csv(out_path, index=False, mode="a", header=False)

In [138]:
# Run 10 iterations of appendLoop, offset by the number of rows already in the user bank
appendLoop(10, len_so_far)

In [139]:
# confirming that total dataset (from combining current set to overall set) contains only unique users and no duplicates
# then saving a .xlsx backup file for the current dataset csv being appended to 

def sanityCheck(comb_data):
    """Report whether ``comb_data`` plus ``userInfo_9.csv`` has duplicate users.

    Reads ``userInfo_9.csv``, stacks it under ``comb_data``, prints the total
    row count, the number of distinct ``'User ID'`` values and the row count
    of the CSV, then prints a verdict. Finally the CSV contents are written
    to ``userInfo_9.xlsx``.
    """
    current = pd.read_csv("userInfo_9.csv")
    merged = pd.concat([comb_data, current], axis = 0)

    total_rows = len(merged)
    distinct_users = len(merged['User ID'].unique())

    report = (
        'Length of Dataset : {}'.format(total_rows),
        'Unique Users in Dataset : {}'.format(distinct_users),
        'Length of current Dataset : {}'.format(len(current)),
        '\t',
    )
    for line in report:
        print(line)

    # Every row must belong to a different user
    if total_rows != distinct_users:
        print("Hey, I think we should talk...")
    else:
        print("You're doing great bud")

    # Excel backup of the CSV currently being appended to
    current.to_excel('userInfo_9.xlsx', index=False)

# Check the loaded user bank against the rows currently in userInfo_9.csv
sanityCheck(userIB)

Length of Dataset : 59881
Unique Users in Dataset : 59881
Length of current Dataset : 7986
	
You're doing great bud


In [147]:
# # Converting the UserInfo dataframe to parquet
# pip install pyarrow

# Additionally, creating a compressed version of the file using lz4 in the parquet file format.
# Author's note: for a small dataset this actually turned out larger than a simple .xlsx.

userIB.to_parquet('UserInfo.parquet', compression='lz4')