In [10]:
import threading

import pandas as pd
import spacy

In [11]:
from bs4 import BeautifulSoup
import requests
# Browser-like User-Agent sent with every request in this notebook (passed as
# `headers=header` to `requests.get`). The chained assignment binds the same
# dict object to both `header` and `headers`.
header = headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
import concurrent.futures

In [12]:
# Goodreads shelf names to scrape; each entry is interpolated into the shelf URL
# and stored as the genre label of every book found on that shelf.
# Every entry must be unique (a repeated shelf is scraped twice and yields
# duplicate rows) and must be followed by a comma (adjacent string literals are
# silently concatenated, e.g. 'Action' "Romance" -> 'ActionRomance').
genres = [
    "Fiction",
    "Mystery",
    "Thriller",
    "Action",
    "Romance",
    "Fantasy",
    "Science-fiction",
    "Autobiography",
    "Motivational",
]

In [13]:
# Site root; prepended to the relative `href` of each book link found on a shelf page.
base_url = "https://www.goodreads.com"

In [14]:
def _stripped_text(page, tag, css_class, missing_message):
    """Return the stripped text of the first ``tag`` with ``css_class``.

    Prints ``missing_message`` and returns None when no such tag exists.
    """
    node = page.find(tag, class_=css_class)
    if node is None:
        print(missing_message)
        return None
    return node.text.strip()


def scrape_book_details(book_link, timeout=30):
    """Download one book page and extract its fields into a dict.

    Parameters
    ----------
    book_link : str
        Absolute URL of the book page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot block a worker thread forever.

    Returns
    -------
    dict
        Keys: ``title``, ``rating``, ``synopsis``, ``published_date`` (list of
        the last three whitespace-separated tokens of the featured-details
        text), ``review``, ``review_rating`` (float), ``genre`` and
        ``description``. A field that cannot be found is None. If the request
        fails (network error, timeout, HTTP error status) the error is printed
        and an empty dict is returned, which callers treat as "skip".
    """
    book_details = {}
    try:
        book_page_response = requests.get(book_link, headers=header, timeout=timeout)
        # Do not parse error pages as if they were book pages.
        book_page_response.raise_for_status()
        book_page = BeautifulSoup(book_page_response.text, "html.parser")

        book_details['title'] = _stripped_text(
            book_page, 'h1', 'Text Text__title1', 'The book title is not available')
        book_details['rating'] = _stripped_text(
            book_page, 'div', 'RatingStatistics__rating', 'The book rating is not available')

        # The same span feeds both the synopsis (its bold lead) and the description.
        formatted = book_page.find('span', class_='Formatted')
        bold_lead = formatted.find('b') if formatted is not None else None
        if bold_lead is None:
            print('The book synopsis is not available')
            book_details['synopsis'] = None
        else:
            book_details['synopsis'] = bold_lead.text.strip()

        featured_text = _stripped_text(
            book_page, 'div', 'FeaturedDetails', 'The published date is not available')
        # Kept as a list of tokens so existing consumers see the same shape.
        book_details['published_date'] = (
            featured_text.split()[-3:] if featured_text is not None else None
        )

        book_details['review'] = _stripped_text(
            book_page, 'section', 'ReviewText', 'The book review is not available')

        try:
            # The second whitespace-separated token of the aria-label is parsed as the rating.
            book_details['review_rating'] = float(
                book_page.find('div', class_='ShelfStatus').find('span').get('aria-label').split()[1]
            )
        except Exception as e:
            print(f"An error occurred while extracting review rating: {e}")
            book_details['review_rating'] = None

        try:
            book_details['genre'] = book_page.find('span', class_='BookPageMetadataSection__genreButton').text
        except Exception as e:
            print(f"An error occurred while extracting genre: {e}")
            book_details['genre'] = None

        if formatted is None:
            print('The description is not available')
            book_details['description'] = None
        elif book_details['synopsis'] is None:
            book_details['description'] = formatted.text.strip()
        else:
            # Keep only what follows the first period.
            # NOTE(review): this looks intended to drop the bold lead sentence;
            # it cuts too early when the lead contains "..." or an abbreviation.
            _, period, remainder = formatted.get_text().partition('.')
            if period:
                book_details['description'] = remainder.strip()
            else:
                # No period at all: previously an uncaught IndexError that
                # escaped to the outer handler and left the key unset.
                print('The description is not available')
                book_details['description'] = None

    except Exception as e:
        print(f"An error occurred while scraping book details: {e}")

    return book_details


In [15]:
# Parallel accumulators filled by `scrape_books`: position i of every list
# describes the same book, so they must always be appended to together.

# Values scraped from the book page itself.
book_titles = list()
book_avg_ratings = list()
book_synopses = list()
published_dates = list()
book_descriptions = list()

# Values taken from the first review shown on the page.
book_reviews = list()
book_review_ratings = list()

# Shelf name the book was found under.
book_genre = list()

In [17]:
# Serialises writes to the shared result lists. A row is written as eight
# separate appends; without the lock, threads scraping different genres can
# interleave those appends and leave the lists misaligned.
_book_rows_lock = threading.Lock()


def scrape_books(genre, max_pages=10, timeout=30):
    """Scrape the books listed on a genre shelf into the module-level lists.

    For each shelf page, collects the book links, scrapes every book page
    concurrently with `scrape_book_details`, then appends one entry per
    successfully scraped book to each of the shared accumulator lists.

    Parameters
    ----------
    genre : str
        Shelf name, interpolated into the shelf URL and stored as the genre label.
    max_pages : int, optional
        Number of shelf pages to visit, starting at page 1.
    timeout : float, optional
        Seconds to wait for each shelf-page request.

    Returns
    -------
    None
        Results are recorded by side effect only.
    """
    for page_number in range(1, max_pages + 1):
        url = f'{base_url}/shelf/show/{genre}?page={page_number}'
        response = requests.get(url, headers=header, timeout=timeout)
        print(response)
        soup = BeautifulSoup(response.text, "lxml")

        # href=True skips anchors without a link instead of failing on `str + None`.
        links = [
            base_url + anchor['href']
            for anchor in soup.find_all('a', class_='bookTitle', href=True)
        ]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(scrape_book_details, links))

        # Write the whole page's rows under the lock so every list receives
        # the same books in the same order.
        with _book_rows_lock:
            for result in results:
                if result:  # an empty dict means the book page could not be scraped
                    book_titles.append(result.get('title', None))
                    book_avg_ratings.append(result.get('rating', None))
                    book_synopses.append(result.get('synopsis', None))
                    published_dates.append(result.get('published_date', None))
                    book_reviews.append(result.get('review', None))
                    book_review_ratings.append(result.get('review_rating', None))
                    book_descriptions.append(result.get('description', None))
                    book_genre.append(genre)


# Scrape all genres concurrently, one worker thread per shelf.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Drain the iterator: `executor.map` only re-raises a worker's exception
    # when its result is retrieved, so an unconsumed map hides every failure
    # (e.g. a missing parser or a network error) and leaves the lists empty.
    list(executor.map(scrape_books, genres))

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [9]:
# Sanity check: number of books collected (one genre label is appended per scraped book).
len(book_genre)

0

In [11]:
# Start from an empty frame; its columns are filled from the scraped lists in the next cell.
df_new = pd.DataFrame()

In [12]:
# Attach the scraped lists as columns in one step.
# Note that `rating` holds the rating of the scraped review, not the book's average rating.
df_new = df_new.assign(
    title=book_titles,
    genre=book_genre,
    description=book_descriptions,
    review=book_reviews,
    rating=book_review_ratings,
)

In [13]:
# Keep only the books for which every column was scraped successfully.
df_new = df_new.dropna()

In [14]:
# Persist the cleaned data. index=False keeps the row index out of the file;
# otherwise it is written as an unnamed first column and comes back as
# "Unnamed: 0" when the CSV is read again.
df_new.to_csv('titles-and-genres.csv', index=False)

In [15]:
# Preview the first rows of the dataset.
df_new.head()

Unnamed: 0,title,genre,description,review,rating
0,City Under Siege,ActionRomance,London is a city in flames. Tensions are high ...,5 ‘I was fucking Wonder Woman’ starsGah! My re...,5.0
1,Out of Plans,ActionRomance,THE NEW PLAN: * Find Marc * DON'T find Marc *...,"★★★★★! Out Of Plans, book 2 of 2. Action-fille...",5.0
2,Best Laid Plans,ActionRomance,"Together, they'll learn that sometimes even th...","★★★★ 3/4! Best Laid Plans, book 1 of 2. Thrown...",5.0
3,Wolf Trouble,ActionRomance,..There's never been a female on the Dallas SW...,Reviewed by: Rabid Reads.Men in uniform is alr...,5.0
5,Witness to Passion,ActionRomance,Under his protection and in his bed…For Fallon...,3.5 Stars!!Fallon is a twenty-five years old g...,4.0


In [16]:
# Number of books per genre label.
df_new['genre'].value_counts()

genre
Mystery            500
Thriller           498
Science-fiction    497
Fiction            489
Autobiography      488
Fantasy            479
Action             477
Motivational       468
ActionRomance      200
Name: count, dtype: int64

In [17]:
# Show every row except those carrying the "ActionRomance" label.
df_new[df_new['genre'] != "ActionRomance"]

Unnamed: 0,title,genre,description,review,rating
23,1984,Fiction,A masterpiece of rebellion and imprisonment wh...,It's written 1948? Clearly History has its twi...,5.0
24,To Kill a Mockingbird,Fiction,The unforgettable novel of a childhood in a sl...,/// gentle reminder that this is not the time ...,1.0
25,The Great Gatsby,Fiction,"Scott Fitzgerald's third book, stands as the s...","Oh Gatsby, you old sport, you poor semi-delusi...",5.0
26,Animal Farm,Fiction,Librarian's note: There is an Alternate Cover ...,"سيظل دائما جميع الحيوانات سواسية..الا ان بعض ""...",5.0
27,Harry Potter and the Sorcerer’s Stone,Fiction,An alternative cover for this ASIN can be foun...,(A-) 83% | Very GoodNotes: An effortless encha...,4.0
...,...,...,...,...,...
4215,Boy: Tales of Childhood,Autobiography,"In Boy, Roald Dahl recounts his days as a chil...",English / ItalianoChosen by my daughter as bed...,3.0
4216,Talking as Fast as I Can: From Gilmore Girls t...,Autobiography,"In this collection of personal essays, the bel...",Lauren Graham narrating the audiobook was ever...,4.0
4217,Scrappy Little Nobody,Autobiography,A collection of humorous autobiographical essa...,All I knew about her going into these books wa...,4.0
4218,The Year of Magical Thinking,Autobiography,"'An act of consummate literary bravery, a writ...",Disclaimer: Being fresh into the grieving proc...,5.0


In [None]:
# Title of the row labelled 4, read with a single .loc lookup rather than
# chained indexing (which first materialises the whole row).
df_new.loc[4, 'title']

'Shades of Treason'

In [None]:
# Description of the row labelled 4, read with a single .loc lookup rather than
# chained indexing (which first materialises the whole row).
df_new.loc[4, 'description']

'Instead, they gave their lives to save hers.Lieutenant Ramie Ashdyn is an anomaly, a person whose genetic makeup makes her stronger and smarter than the average human. She’s pledged her life to protect the Coalition, an alliance of thirteen planetary systems, but when a top secret operation turns bloody, she’s charged with treason and the brutal executions of her teammates.The Coalition needs the information Ash’s team stole on their last mission, so they send in Commander Rhys “Rest in Peace” Rykus to get it. He’s the man who’s responsible for turning Ash into an elite soldier… and he’s a man who isn’t, never was, and never will be in love with the woman he trained. Or so he tells himself.Ash wants nothing more than to clear her name and be the woman her former instructor wants her to be, but the enemy who killed her teammates did more than frame her for treason and murder: they telepathically silenced her mind, preventing her from saying anything that might point to the truth about 