In [10]:
def user_books_from_goodreads(user_code = None, shelf_name = None, verbose = 1):
    """
    Scarping book info from Goodreads Users's page
    the user_code is 9 digit code found in the link of user's 'My Books' page
    If no shelf name is provided, info from 'All' shelf is returned
    verbose has usual meaning with value being integer 0 or 1, default is 1.
    eg. book_from_goodreads(user_code =110673662, shelf_name = 'currently-reading', verbose = 1)
    """

    #Scarping book info from Goodreads

    #Imports
    import requests
    import bs4
    import pandas as pd

    #Containers
    books = []
    authors = []
    isbn13s = []
    isbns =[]
    pagess = []
    ratings = []
    num_ratings = []
    pub_dates = []
    date_started = []
    date_completed = []

    p=1
    while True:

        if not shelf_name:
            link = f'https://www.goodreads.com/review/list/110673662?page={p}'
        else:
            link = f'https://www.goodreads.com/review/list/110673662?page={p}&ref=nav_mybooks&shelf={shelf_name}'

        code = requests.get(link)
        soup = bs4.BeautifulSoup(code.text, "lxml")

        #Breaking the loop
        if soup.select('.bookalike') == []:
            break

        if verbose == 0:
            pass 
        else:
            print(p)

        for i in range(len(soup.select('.bookalike'))):

            book_name = soup.select('.bookalike .title a')[i]['title']
            books.append(book_name)

            author = soup.select('.bookalike .author a')[i].text
            authors.append(author)

            isbn13 = soup.select('.bookalike .isbn13 .value')[i].text.strip()
            isbn13s.append(isbn13)
            isbn = soup.select('.bookalike .isbn .value')[i].text.strip()
            isbns.append(isbn)

            pages = soup.select('.bookalike .num_pages .value')[i].text.split('\n')[1].strip()
            pagess.append(pages)

            rating = float(soup.select('.bookalike .avg_rating .value')[i].text.strip())
            ratings.append(rating)

            num = soup.select('.bookalike .num_ratings .value')[i].text.strip()
            num_rating = int(''.join(num.split(',')))
            num_ratings.append(num_rating)

            pub_date = soup.select('.bookalike .date_pub .value')[i].text.strip()
            pub_dates.append(pub_date)
            
            started = soup.select('.bookalike .date_started')[i].text.split('\n')[3]
            date_started.append(started)

            ended = soup.select('.bookalike .date_read')[i].text.split('\n')[3]
            date_completed.append(ended)

        #Next page
        p=p+1

    #Creating the DataFrame
    index = 'Book Name, Author, ISBN 13, ISBN, Number of Pages, Average Rating, Total Number of Ratings, Publication Date, Date Started, Date Completed'.split(', ')
    book_df = pd.DataFrame(data = [books, authors, isbn13s, isbns, pagess, ratings, num_ratings, pub_dates, date_started, date_completed], index = index).transpose()
    return book_df

In [11]:
user_books_from_goodreads(user_code = 110673662, shelf_name = 'currently-reading', verbose = 1)

1


Unnamed: 0,Book Name,Author,ISBN 13,ISBN,Number of Pages,Average Rating,Total Number of Ratings,Publication Date,Date Started,Date Completed
0,The First Three Minutes: A Modern View of the ...,"Weinberg, Steven",9780465024377.0,0465024378,203,4.09,11498,"Jan 01, 1977","Aug 16, 2021",not set
1,"Dune (Dune, #1)","Herbert, Frank",9780593099322.0,059309932X,688,4.24,913763,Jun 1965,"Sep 17, 2021",not set
2,The Hobbit,"Tolkien, J.R.R.",,,322,4.28,3174171,"Sep 21, 1937","Mar 20, 2021",not set


In [1]:
def goodreads_author(author_id = None, author_name = None, verbose = 1, clean = True):
    
    """
    Works only for Goodread Authors
    needs author_id and author_name
    author_name should be specifically defined or pass clean as False
    """

    import re
    import pandas as pd
    import numpy as np

    names = []
    authors = []
    ratings = []
    published_years = []
    num_ratings = []

    flag = True
    p=1
    
    if clean and author_name:
        return None
    elif clean:
        print('You should provide the author name or use \"clean\" = False')
        return None
    else:
        return None
    
    while flag:
        print(p)
        try:
            link = f"https://www.goodreads.com/author/list/{author_id}?page={p}"

            df = pd.read_html(link)[0]
            for i in range(len(df[1])):
                book_name = df[1].iloc[i].split('by')[0].strip()
                other = df[1].iloc[i].split('by')[1]

                names.append(book_name)

                author = other.split('(')[0].strip()
                authors.append(author)
                text = other.split('(')[-1]
                pattern_rating = r"\d\.\d\d"
                pattern_num = r"\d{1,}\,\d{1,}"
                pattern_year = r"\d{4}"

                if re.findall(pattern_rating, text):
                    ratings.append(re.findall(pattern_rating, text)[0])
                else:
                    ratings.append(np.nan)


                if re.findall(pattern_year, text):
                    published_years.append(re.findall(pattern_year, text)[0])
                else:
                    published_years.append(np.nan)

                if re.findall(pattern_num, text):
                    num_ratings.append(re.findall(pattern_num, text)[0])
                else:
                    num_ratings.append(np.nan)
        except :
            break
        p = p+1

    index = 'Book Name, Author, Average Rating, Publication Date, Total Number of Ratings'.split(', ')
    book_df = pd.DataFrame(data = [names, authors, ratings, published_years, num_ratings], index = index).transpose()
    if clean and author_name:
        book_df=book_df.dropna(thresh=3)
        book_df[book_df['Author'].apply(lambda x: author_name in x)]
        return book_df
    else:
        return book_df

In [7]:
import requests
import bs4
import pandas as pd
import re

In [222]:
link = 'https://www.goodreads.com/author/list/1265?page=1'

In [223]:
code = requests.get(link)
soup = bs4.BeautifulSoup(code.text, "lxml")

In [224]:
soup.select('title')

[<title>Books by Jane Austen (Author of Pride and Prejudice)</title>]

In [269]:
df = pd.read_html(link)

In [270]:
string = df[0][1]

In [271]:
title = string[0].split(' by ')[0].strip()

In [273]:
title

'Pride and Prejudice'

In [272]:
pattern_author = 'Jane Austen'

In [253]:
text = string[0].split(' by ')[1]

In [254]:
text

'Jane Austen  4.27 avg rating — 3,266,966 ratings  —  published  1813  —  11651 editions'

In [256]:
author = re.findall(pattern, text)[0]

In [262]:
pattern_rating = r"\d\.\d\d"
pattern_num1 = r"\d\,\d{1,}\,\d{1,}"
pattern_num2 = r"\d{1,}\,\d{1,}"
pattern_year = r"\d{4}"

In [265]:
re.findall(pattern_num1, text)

['3,266,966']

In [268]:
re.findall(pattern_year, text)[0]

'1813'

In [15]:
def author(author_id = None, author_name = None, verbose = 1, clean = True):
    
    """
    Works for any Author on Goodreads
    needs author_id and author_name
    author_name should be specifically defined or pass clean as False
    """

    import re
    import pandas as pd
    import numpy as np

    names = []
    authors = []
    ratings = []
    published_years = []
    num_ratings = []

    flag = True
    p=1
    
    # if clean and author_name:
    #     return None
    if clean:
        print('You should provide the author name or use \"clean\" = False')
        return None
    # else:
    #     return None
    
    while flag:
        print(p)
        try:
            link = f"https://www.goodreads.com/author/list/{author_id}?page={p}"

            df = pd.read_html(link)
            for i in range(len(df[1])):
                string = df[0][1]
                book_name = string[0].split(' by ')[0].strip()
                names.append(book_name)
                
                text = string[0].split(' by ')[1]

                pattern_rating = r"\d\.\d\d"
                pattern_num1 = r"\d\,\d{1,}\,\d{1,}"
                pattern_num2 = r"\d{1,}\,\d{1,}"
                pattern_year = r"\d{4}"
                pattern_author = 'Jane Austen'
                
                author = re.findall(pattern, text)[0]
                authors.append(author)

                if re.findall(pattern_rating, text):
                    ratings.append(re.findall(pattern_rating, text)[0])
                else:
                    ratings.append(np.nan)


                if re.findall(pattern_year, text):
                    published_years.append(re.findall(pattern_year, text)[0])
                else:
                    published_years.append(np.nan)

                if re.findall(pattern_num1, text):
                    num_ratings.append(re.findall(pattern_num1, text)[0])
                elif re.findall(pattern_num2, text):
                    num_ratings.append(re.findall(pattern_num2, text)[0])
                else:
                    num_ratings.append(np.nan)
        except:
            pass
        p = p+1

    index = 'Book Name, Author, Average Rating, Publication Date, Total Number of Ratings'.split(', ')
    book_df = pd.DataFrame(data = [names, authors, ratings, published_years, num_ratings], index = index).transpose()
    if clean and author_name:
        book_df=book_df.dropna(thresh=3)
        book_df[book_df['Author'].apply(lambda x: author_name in x)]
        return book_df
    else:
        return book_df

In [17]:
author(author_id=3441, clean=False, author_name='Terry Goodkind')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44


In [9]:
goodreads_author(author_id=1265, author_name='Terry Goodkind')