# Imports

In [18]:
import requests
import bs4
import pandas as pd
import re

from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import chromedriver_autoinstaller

from dotenv import dotenv_values

import time

# Set up the ChromeDriver used by Selenium

In [2]:
# Create the Selenium Chrome driver used by every scraping cell below.
# The chromedriver executable is looked up at a path relative to the working directory.
# NOTE(review): chromedriver_autoinstaller is imported above but is not used in the visible cells.
path = '../requirements/chromedriver'
service = Service(executable_path=path)
driver = webdriver.Chrome(service=service)

# Column display options

In [3]:
# Lift both pandas display limits (number of columns shown and column width)
# so that wide frames and long URLs are rendered without truncation.
for option_name in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Configure dotenv to keep the password out of the notebook

In [4]:
# Read the variables defined in ../.env (path relative to the working directory)
# so that secrets do not have to be written in the notebook itself.
config = dotenv_values("../.env")

In [5]:
# Password taken from the PASSWORD entry of the .env file; it is typed into the
# sign-in form further down. The lookup fails if the entry is missing.
mar_password = config["PASSWORD"]

# Log into Goodreads with Selenium

In [6]:
# Open the Goodreads sign-in page in the Selenium-controlled browser and maximize the window.
# NOTE(review): the siteState query parameter looks session-specific — confirm it stays valid over time.
# NOTE(review): the name `url` is reassigned later by the book-scraping loop.
url = 'https://www.goodreads.com/ap/signin?language=en_US&openid.assoc_handle=amzn_goodreads_web_na&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.goodreads.com%2Fap-handler%2Fsign-in&siteState=5343dd7ffb96ad0340e3a83b80aba5dc'
driver.get(url)
driver.maximize_window()

In [7]:
# Type the account e-mail into the sign-in form.
# The address can be supplied through an optional EMAIL entry in the .env file
# (the same file that already holds PASSWORD). The previously hard-coded address
# is kept as a fallback so existing setups behave exactly as before.
account_email = config.get("EMAIL") or "margamontv@gmail.com"
username = driver.find_element(By.XPATH, "//input[@type='email']")
username.send_keys(account_email)

In [8]:
# Locate the password input of the sign-in form and type the password read from .env
password = driver.find_element(By.XPATH, "//input[@type='password']")
password.send_keys(mar_password)

In [9]:
# Submit the sign-in form by clicking its submit button.
# NOTE(review): nothing here waits for the login to finish before the next cell navigates away — confirm this is reliable under Run All.
sign_in = driver.find_element(By.XPATH, "//input[@id='signInSubmit']")
sign_in.click()

# Get the genres urls containing 50 books per page per genre

In [10]:
# Genre shelves to scrape, written exactly as they appear in the shelf URL
genre_list = [
    'adult-fiction', 'biography', 'business', 'chick-lit',
    'childrens', 'comics', 'contemporary', 'crime',
    'fantasy', 'fiction', 'graphic-novels', 'historical-fiction',
    'history', 'horror', 'humor', 'lgbt',
    'literary-fiction', 'memoir', 'mystery', 'new-adult',
    'non-fiction', 'paranormal', 'poetry', 'romance',
    'science', 'science-fiction', 'self-help', 'suspense',
    'thriller', 'travel', 'young-adult',
]

# Page numbers 1..20 of each shelf, kept as strings because they are pasted into the URL
genre_pages = [str(page_number) for page_number in range(1, 21)]

In [11]:
# Build one {'genre', 'url'} record for every (genre, page) combination.
# Each URL points at one page of a genre shelf (50 books per page, per the original note).
shelf_root = 'https://www.goodreads.com/shelf/show/'
list_url_genres = [
    {'genre': genre_name, 'url': f'{shelf_root}{genre_name}?page={page_number}'}
    for genre_name in genre_list
    for page_number in genre_pages
]

In [12]:
# Turn the list of {'genre', 'url'} dictionaries into a DataFrame (one row per shelf page)
genres_urls = pd.DataFrame(list_url_genres)
# Bare last expression so the notebook displays the frame
genres_urls

Unnamed: 0,genre,url
0,adult-fiction,https://www.goodreads.com/shelf/show/adult-fiction?page=1
1,adult-fiction,https://www.goodreads.com/shelf/show/adult-fiction?page=2
2,adult-fiction,https://www.goodreads.com/shelf/show/adult-fiction?page=3
3,adult-fiction,https://www.goodreads.com/shelf/show/adult-fiction?page=4
4,adult-fiction,https://www.goodreads.com/shelf/show/adult-fiction?page=5
...,...,...
615,young-adult,https://www.goodreads.com/shelf/show/young-adult?page=16
616,young-adult,https://www.goodreads.com/shelf/show/young-adult?page=17
617,young-adult,https://www.goodreads.com/shelf/show/young-adult?page=18
618,young-adult,https://www.goodreads.com/shelf/show/young-adult?page=19


# Obtain the book urls from the genre urls

In [13]:
# Collect the link of every book listed on each genre shelf page.
# Each record stores the book URL together with the genre of the shelf it was found on.
list_url_books = []

# Walk the genre and url columns in lockstep (one iteration per shelf page)
for genre, shelf_url in zip(genres_urls['genre'], genres_urls['url']):
    try:
        driver.get(shelf_url)
        # find_elements returns an empty list (it does not raise) when nothing matches,
        # so the page load is the step that actually needs guarding
        books_elements = driver.find_elements(By.XPATH, "//a[@class='bookTitle']")
    except Exception as exc:
        # Best effort: skip a shelf page that cannot be loaded, but report it so the
        # gap is visible. `Exception` (not a bare except) lets KeyboardInterrupt
        # through, so the loop can still be stopped from the notebook.
        print(f'Skipping {shelf_url}: {exc.__class__.__name__}')
        continue

    for element in books_elements:
        try:
            books_url = element.get_attribute("href")
        except Exception:
            # e.g. the element is no longer attached to the page; skip this link only
            continue
        if not books_url:
            # An anchor without an href gives nothing to visit later
            continue
        list_url_books.append({'book_url': books_url, 'genre': genre})

In [14]:
# Turn the collected {'book_url', 'genre'} dictionaries into a DataFrame (one row per book link)
books_urls = pd.DataFrame(list_url_books)
# Bare last expression so the notebook displays the frame
books_urls

Unnamed: 0,book_url,genre
0,https://www.goodreads.com/book/show/19288043-gone-girl,adult-fiction
1,https://www.goodreads.com/book/show/4667024-the-help,adult-fiction
2,https://www.goodreads.com/book/show/22557272-the-girl-on-the-train,adult-fiction
3,https://www.goodreads.com/book/show/38447.The_Handmaid_s_Tale,adult-fiction
4,https://www.goodreads.com/book/show/32620332-the-seven-husbands-of-evelyn-hugo,adult-fiction
...,...,...
30824,https://www.goodreads.com/book/show/61215351-the-fellowship-of-the-ring,young-adult
30825,https://www.goodreads.com/book/show/29772863-wires-and-nerve,young-adult
30826,https://www.goodreads.com/book/show/2218252.Rumors,young-adult
30827,https://www.goodreads.com/book/show/45872054-the-kingdom-of-back,young-adult


In [16]:
# Print the row count, column dtypes and non-null counts of the scraped links
books_urls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30829 entries, 0 to 30828
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   book_url  30829 non-null  object
 1   genre     30829 non-null  object
dtypes: object(2)
memory usage: 481.8+ KB


In [15]:
# Save the book links to CSV (without the index column) so the shelf scrape can be reloaded later
books_urls.to_csv('../data/books_urls.csv', index=False)

# Extract the relevant info from my books_urls and append to dictionary

In [11]:
# Reload the book links from the CSV file written by the previous section
books_urls_2 = pd.read_csv('../data/books_urls.csv')
# Bare last expression so the notebook displays the frame
books_urls_2

Unnamed: 0,book_url,genre
0,https://www.goodreads.com/book/show/19288043-gone-girl,adult-fiction
1,https://www.goodreads.com/book/show/4667024-the-help,adult-fiction
2,https://www.goodreads.com/book/show/22557272-the-girl-on-the-train,adult-fiction
3,https://www.goodreads.com/book/show/38447.The_Handmaid_s_Tale,adult-fiction
4,https://www.goodreads.com/book/show/32620332-the-seven-husbands-of-evelyn-hugo,adult-fiction
...,...,...
30824,https://www.goodreads.com/book/show/61215351-the-fellowship-of-the-ring,young-adult
30825,https://www.goodreads.com/book/show/29772863-wires-and-nerve,young-adult
30826,https://www.goodreads.com/book/show/2218252.Rumors,young-adult
30827,https://www.goodreads.com/book/show/45872054-the-kingdom-of-back,young-adult


In [13]:
# Visit every book page and collect one record (dict) per book.
goodreads_books_list = []

# Value stored when a field cannot be read. Kept as the literal 'error' so the
# resulting records look exactly like those produced by the earlier version.
MISSING = 'error'

# Output column -> XPath of the element whose visible text holds the value.
# The insertion order of this dict defines the column order of each record.
TEXT_FIELD_XPATHS = {
    'Title': "//h1[@class='Text Text__title1']",
    'Author': "//span[@class='ContributorLink__name']",
    'Rating': "//div[@class='RatingStatistics__rating']",
    'Ratings count': "//span[@data-testid='ratingsCount']",
    'Reviews count': "//span[@data-testid='reviewsCount']",
    'Pages': "//p[@data-testid='pagesFormat']",
    'First published': "//p[@data-testid='publicationInfo']",
}

# The cover is read from the image's src attribute rather than from element text
COVER_IMAGE_XPATH = "//img[@class='ResponsiveImage']"


def _read_text(xpath):
    """Return the text of the first element matching ``xpath`` on the current page, or MISSING."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        # Best effort per field. `Exception` instead of a bare except lets
        # KeyboardInterrupt through, so the long-running loop stays interruptible.
        return MISSING


def _read_attribute(xpath, attribute):
    """Return ``attribute`` of the first element matching ``xpath`` on the current page, or MISSING."""
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute(attribute)
    except Exception:
        return MISSING


def _scrape_book(url, genre):
    """Load one book page and return its record.

    Parameters
    ----------
    url : str
        Address of the book page to load.
    genre : str
        Genre of the shelf the link was collected from; copied into the record.

    Returns
    -------
    dict
        Keys, in order: the TEXT_FIELD_XPATHS columns, 'Cover image', 'Genre', 'Book url'.
        Fields that cannot be read hold MISSING.
    """
    try:
        driver.get(url)
        page_loaded = True
    except Exception as exc:
        # One page that fails to load should not abort the whole run. The browser may
        # still show the previous book, so nothing is scraped for this row: every field
        # is marked MISSING and the failure is reported.
        print(f'Could not load {url}: {exc.__class__.__name__}')
        page_loaded = False

    record = {}
    for column, xpath in TEXT_FIELD_XPATHS.items():
        record[column] = _read_text(xpath) if page_loaded else MISSING
    record['Cover image'] = _read_attribute(COVER_IMAGE_XPATH, "src") if page_loaded else MISSING
    record['Genre'] = genre
    record['Book url'] = url
    return record


# One record per input row, in input order, so the result stays aligned with books_urls_2
for genre, url in zip(books_urls_2['genre'], books_urls_2['book_url']):
    goodreads_books_list.append(_scrape_book(url, genre))

In [14]:
# Sanity check: number of scraped records (to compare with the number of book links)
print(len(goodreads_books_list))

30829


In [15]:
# Turn the list of scraped book records (dictionaries) into a DataFrame, one row per book
goodreads_books_df = pd.DataFrame(goodreads_books_list)
# Bare last expression so the notebook displays the frame
goodreads_books_df

Unnamed: 0,Title,Author,Rating,Ratings count,Reviews count,Pages,First published,Cover image,Genre,Book url
0,Gone Girl,Gillian Flynn,4.12,"2,911,327 ratings","148,502 reviews","415 pages, Paperback","First published May 14, 2012",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1554086139i/19288043.jpg,adult-fiction,https://www.goodreads.com/book/show/19288043-gone-girl
1,The Help,Kathryn Stockett,4.47,"2,627,626 ratings","88,917 reviews","464 pages, Hardcover","First published February 10, 2009",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1622355533i/4667024.jpg,adult-fiction,https://www.goodreads.com/book/show/4667024-the-help
2,The Girl on the Train,Paula Hawkins,3.96,"2,767,383 ratings","122,845 reviews","336 pages, Hardcover","First published January 6, 2015",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1574805682i/22557272.jpg,adult-fiction,https://www.goodreads.com/book/show/22557272-the-girl-on-the-train
3,The Handmaid’s Tale,Margaret Atwood,4.13,"1,918,823 ratings","90,689 reviews","314 pages, Paperback","First published January 1, 1985",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1578028274i/38447.jpg,adult-fiction,https://www.goodreads.com/book/show/38447.The_Handmaid_s_Tale
4,The Seven Husbands of Evelyn Hugo,Taylor Jenkins Reid,4.44,"2,326,642 ratings","223,037 reviews","389 pages, Hardcover","First published June 13, 2017",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1664458703i/32620332.jpg,adult-fiction,https://www.goodreads.com/book/show/32620332-the-seven-husbands-of-evelyn-hugo
...,...,...,...,...,...,...,...,...,...,...
30824,The Fellowship of the Ring,J.R.R. Tolkien,4.38,"2,708,740 ratings","35,260 reviews","432 pages, Kindle Edition","First published July 29, 1954",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1654215925i/61215351.jpg,young-adult,https://www.goodreads.com/book/show/61215351-the-fellowship-of-the-ring
30825,Wires and Nerve,Marissa Meyer,4.12,"25,977 ratings","4,507 reviews","238 pages, Hardcover","First published January 31, 2017",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1470414564i/29772863.jpg,young-adult,https://www.goodreads.com/book/show/29772863-wires-and-nerve
30826,Rumors,Anna Godbersen,3.85,"38,906 ratings","1,956 reviews","423 pages, Hardcover","First published June 3, 2008",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1442612484i/2218252.jpg,young-adult,https://www.goodreads.com/book/show/2218252.Rumors
30827,The Kingdom of Back,Marie Lu,3.70,"20,006 ratings","4,455 reviews","313 pages, Hardcover","First published March 3, 2020",https://images-na.ssl-images-amazon.com/images/S/compressed.photo.goodreads.com/books/1562771370i/45872054.jpg,young-adult,https://www.goodreads.com/book/show/45872054-the-kingdom-of-back


In [17]:
# Save the scraped book details to CSV, without the index column
goodreads_books_df.to_csv('../data/goodreads_books.csv', index=False)