## GoodReads Scraping using Selenium

The script fetches details from GoodReads using the titles of books already present in the books dataset obtained from Kaggle. The details fetched are: Primary Genre, Secondary Genre, Awards, Series, the author's number of followers on GoodReads, the author's number of books, and the author's primary genre. Selenium is a testing tool that interacts with the browser. The driver essentially does the following actions:
1. Launch Chrome Browser
2. Open GoodReads Link
3. Sign in with user creds
4. Enter the book title from the dataset
5. Search for the book
6. Scrape data from the book page

In [None]:
#Import Libraries
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
import time

In [None]:
#Read the data from the csv file into the Books frame. Change the path here
#error_bad_lines=False makes pandas skip malformed rows instead of raising an error
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where the
# equivalent is `on_bad_lines='skip'` - confirm which pandas version this notebook runs on.
Books=pd.read_csv('C:/Users/Tina/Desktop/STT/books.csv',error_bad_lines=False)
#Alternative input file, kept commented out:
#Books=pd.read_csv('C:/Users/Tina/Desktop/STT/Project/Data/Clean_books_with_missing_authors.csv',error_bad_lines=False)

In [None]:
#Change the rows to the ones you plan to fetch
#iloc selects by position, but the rows keep their original index labels (7001..8000 here)
#.copy() gives the batch its own data: new columns are added to it and its cells are
#written one by one later, which on a bare slice triggers SettingWithCopyWarning and
#is not guaranteed to store the values.
Books=Books.iloc[7001:8001,:].copy()
Books.shape

In [None]:
#Columns the scraper fills in, each mapped to the value a row keeps until it is fetched
new_columns = {
    'Primary Genre': None,
    'Secondary Genre': None,
    'Awards': 0,
    'Series': 'No',
    'Auth_followers': 0,
    'Auth_books': 0,
    'Auth_Genre': None,
}
for column_name, placeholder in new_columns.items():
    Books[column_name] = placeholder

#Authors whose page was already visited; used to avoid fetching the same author twice
Author = []
Books.head()

In [None]:
#Initiate the Chrome driver from the chromedriver binary at the path below. Change the path here
driver = webdriver.Chrome(executable_path='C:/Users/Tina/Desktop/STT/chromedriver.exe')
#Launch the required link
driver.get("https://www.goodreads.com/")
# Make the driver implicitly wait (up to 15 seconds) until it finds an element. That way, page loading time is covered. If your network is too slow, you may increase this.
driver.implicitly_wait(15)

#After this, log in manually or run the cell below with your credentials
#If a later cell raises NoSuchWindowException or WebDriverException, re-run this cell and the one below

In [None]:
#LOGIN
#Type the account e-mail into the sign-in form. Replace 'id' with your id
email_field = driver.find_element_by_id('userSignInFormEmail')
email_field.send_keys('id')
#Type the password. Replace 'password' with your pwd
password_field = driver.find_element_by_id('user_password')
password_field.send_keys('password')
#Submit the form through the first submit-type input on the page
submit_button = driver.find_element_by_xpath("(//input[contains(@type,'submit')])[1]")
submit_button.click()

In [None]:

def _xpath_literal(text):
    """Return *text* quoted as an XPath 1.0 string literal.

    XPath 1.0 has no escape character, so a value that contains an
    apostrophe cannot simply be wrapped in single quotes. Values holding
    both quote kinds are assembled with concat().
    """
    if "'" not in text:
        return "'" + text + "'"
    if '"' not in text:
        return '"' + text + '"'
    return "concat('" + "', \"'\", '".join(text.split("'")) + "')"


#Row labels to scrape in this run; they must lie inside the batch held in Books
for i in range(7887, 8001):
    print(i)
    title = Books.at[i, 'title']
    # navigate to the book page
    search_bar = driver.find_element_by_xpath("//input[contains(@class,'searchBox__input searchBox__input--navbar')]")
    search_bar.clear()
    search_bar.send_keys(title, Keys.ENTER)
    time.sleep(3)
    #First look for a search result whose text contains the title
    try:
        driver.find_element_by_xpath("(//span[contains(text()," + _xpath_literal(title) + ")])[1]").click()
    except NoSuchElementException:
        #In case title is different in csv, click on the first link
        try:
            driver.find_element_by_xpath("(//span[contains(@itemprop,'name')])[1]").click()
        except NoSuchElementException:
            #No search result at all: skip this row rather than scrape the wrong page
            continue

    #Now get data from the Book Page
    genre_links = "//a[contains(@class,'actionLinkLite bookPageGenreLink')]"
    try:
        #Primary genre is looked up first; if it is missing the secondary one is skipped too
        Books.at[i, 'Primary Genre'] = driver.find_element_by_xpath("(" + genre_links + ")[1]").text
        Books.at[i, 'Secondary Genre'] = driver.find_element_by_xpath("(" + genre_links + ")[2]").text
    except NoSuchElementException:
        pass

    # Get the number of awards (find_elements returns an empty list when there are none)
    Books.at[i, 'Awards'] = len(driver.find_elements_by_xpath("//a[contains(@class,'award')]"))

    try:
        #If this Xpath exists, then the book is part of a series
        driver.find_element_by_xpath("//div[@class='infoBoxRowTitle' and contains(text(),'Series')]")
    except NoSuchElementException:
        pass
    else:
        Books.at[i, 'Series'] = 'Yes'

    #Now let us move to the author page
    try:
        author_link = driver.find_element_by_xpath("//a[@class='authorName']")
    except NoSuchElementException:
        #No author link on this page: keep what was collected and go on with the next row
        continue
    author = author_link.text

    #Do not search info about the author if you already have it
    if author in Author:
        continue
    author_link.click()

    # for now the number of followers is in String format
    try:
        Books.at[i, 'Auth_followers'] = driver.find_element_by_xpath("(//a[contains(@href,'/author_followings')])[1]").text
    except NoSuchElementException:
        pass

    try:
        Books.at[i, 'Auth_books'] = driver.find_element_by_xpath("//a[contains(text(),'distinct works')]").text
    except NoSuchElementException:
        pass

    #Text of every genre link on the author page, in page order
    Books.at[i, 'Auth_Genre'] = [link.text for link in driver.find_elements_by_xpath("//a[contains(@href,'/genres/')]")]

    #Append author to the Author List. Will fill values for all the rows for author in data cleaning and prep phase
    Author.append(author)

In [None]:
# Save this batch as a CSV file in the current working directory.
# The row index is written as well (to_csv default), as an unnamed first column.
# NOTE(review): presumably this is the 'Unnamed: 0' column dropped during cleaning - verify.
Books.to_csv("Books_7000_8000.csv")

## Cleaning the data

This section does the following
1. Handle missing values for primary and secondary genre from author's genre
2. Data type transformation from string to number for certain columns
3. Handle missing author information

In [None]:
#Load the scraped data to be cleaned (presumably the per-batch files merged into one - verify). Change the path here
#error_bad_lines=False makes pandas skip malformed rows instead of raising an error
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where the
# equivalent is `on_bad_lines='skip'` - confirm which pandas version this notebook runs on.
Books=pd.read_csv('C:/Users/Tina/Desktop/STT/Project/Data/book_total.csv',error_bad_lines=False)

In [None]:
#Remove the unnecessary 'Unnamed: 0' column
Books = Books.drop(columns=['Unnamed: 0'])

In [None]:
#Display the column labels of the loaded frame
Books.columns

In [None]:
#Display the first rows of the frame
Books.head()

In [None]:
#Display the (rows, columns) dimensions of the frame
Books.shape

In [None]:
import math


def _is_recorded(value):
    """Return True when a followers / book-count cell holds scraped data.

    Rows that were never scraped carry a zero placeholder. After a CSV
    round trip that placeholder can be the integer 0, the text "0" (when
    the column also holds text such as "1,234 followers") or NaN, so a
    plain ``!= 0`` comparison would treat unscraped rows as scraped.
    """
    if value is None or value != value:  # None, or NaN (the only value unequal to itself)
        return False
    return str(value).strip() not in ("", "0", "0.0")


#Pass 1: for every author remember the details of the last row that has any of the
#3 author related columns recorded (one scan instead of filtering the frame per row)
Author = {}
for name, followers, works, genres in zip(Books['authors'], Books['Auth_followers'],
                                          Books['Auth_books'], Books['Auth_Genre']):
    if pd.isna(name):
        continue
    if _is_recorded(followers) or _is_recorded(works) or isinstance(genres, str):
        Author[name] = [followers, works, genres]

#Pass 2: copy the remembered details onto the rows of the same author.
#The first row of an author is left alone when it already has a genre string of its
#own; every other row of a known author is overwritten.
#Rows are addressed by label, so Books must still have the default 0..n-1 index.
seen_names = set()
for i in range(Books.shape[0]):
    name = Books.at[i, 'authors']
    first_row_of_author = name not in seen_names
    seen_names.add(name)
    if name not in Author:
        continue
    if first_row_of_author and isinstance(Books.at[i, 'Auth_Genre'], str):
        continue
    followers, works, genres = Author[name]
    Books.at[i, 'Auth_followers'] = followers
    Books.at[i, 'Auth_books'] = works
    Books.at[i, 'Auth_Genre'] = genres
    

In [None]:
#If Primary genre was missing, use the first genre listed for the author
import ast


def _author_genres(raw):
    """Return the genre names held in an ``Auth_Genre`` cell as a list of strings.

    The cell is expected to hold the text form of a Python list, e.g.
    "['Science Fiction', 'Fantasy']". Anything that does not parse as a
    list or tuple is split on commas instead. Empty names are dropped.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if not isinstance(parsed, (list, tuple)):
        parsed = raw.strip().strip('[]').split(',')
    names = [str(item).strip().strip('\'"').strip() for item in parsed]
    return [genre_name for genre_name in names if genre_name]


for i in range(Books.shape[0]):
    if isinstance(Books.at[i, 'Primary.Genre'], str):
        continue
    raw_genres = Books.at[i, 'Auth_Genre']
    if not isinstance(raw_genres, str):
        continue
    genres = _author_genres(raw_genres)
    #Keep the whole genre name (e.g. "Science Fiction"); leave the cell missing if the list is empty
    if genres:
        Books.at[i, 'Primary.Genre'] = genres[0]

In [None]:
#If secondary was missing, use the second genre listed for the author
import ast


def _author_genres(raw):
    """Return the genre names held in an ``Auth_Genre`` cell as a list of strings.

    The cell is expected to hold the text form of a Python list, e.g.
    "['Science Fiction', 'Fantasy']". Anything that does not parse as a
    list or tuple is split on commas instead. Empty names are dropped.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if not isinstance(parsed, (list, tuple)):
        parsed = raw.strip().strip('[]').split(',')
    names = [str(item).strip().strip('\'"').strip() for item in parsed]
    return [genre_name for genre_name in names if genre_name]


for i in range(Books.shape[0]):
    if isinstance(Books.at[i, 'Secondary.Genre'], str):
        continue
    raw_genres = Books.at[i, 'Auth_Genre']
    if not isinstance(raw_genres, str):
        continue
    genres = _author_genres(raw_genres)
    #The list text separates items with ", ", so items are stripped before use;
    #splitting the raw text on a space would always yield an empty name here
    if len(genres) > 1:
        Books.at[i, 'Secondary.Genre'] = genres[1]

In [None]:
import re

# Convert followers and auth_books to numbers
#Scraped cells hold text such as "1,234 followers"; everything but the digits is dropped.
#Cells that are not text are left as they are.
non_digits = re.compile(r"[^0-9]")
for i in range(Books.shape[0]):
    for count_column in ('Auth_followers', 'Auth_books'):
        cell = Books.at[i, count_column]
        if not isinstance(cell, str):
            continue
        digits = non_digits.sub("", cell)
        #Text without any digit becomes a missing value (written to CSV as an empty field)
        Books.at[i, count_column] = int(digits) if digits else None

In [None]:
#Write the cleaned frame to Clean_books.csv in the current working directory
#The row index is written as well (to_csv default), as an unnamed first column
Books.to_csv("Clean_books.csv")