In [5]:
import requests
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup 

In [7]:
goodread_df = pd.read_csv(r'D:\Online_Learning\Practical_DL\final_project\books.csv', on_bad_lines='skip')

In [11]:
url = f"https://www.goodreads.com/book/isbn/"

def get_reviews_from_isbn(url, isbn):
    url = url + f'{isbn}'
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers, allow_redirects=True)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get review from soups
    reviews = soup.find_all('article', class_='ReviewCard')
    print(len(reviews))
    # Initialize a list to hold all extracted review data
    all_reviews = []

    for review in reviews:
        # Extract the reviewer's name
        reviewer_name = review.find('div', {'data-testid': 'name'}).get_text(strip=True)
        
        # Extract the rating, assuming it's in the format "Rating x out of 5"
        try:
            rating_section = review.find('div', class_='ShelfStatus')
            rating = rating_section.find('span', {'role': 'img'}).get('aria-label', '').split()[1]
        except:
            rating = ''
        
        # Extract the comment text
        comment_section = review.find('div', {'data-testid': 'contentContainer'})
        comment = comment_section.get_text(strip=True) if comment_section else "No comment provided"
        
        # Collect all information into a dictionary and add to the list
        review_data = {
            'isbn': isbn,
            'reviewer': reviewer_name,
            'rating': rating,
            'comment': comment
        }
        all_reviews.append(review_data)
    
    return pd.DataFrame(all_reviews)


isbn_df = goodread_df[['bookID','average_rating','isbn']]
reviews_df = []

for isbn in isbn_df['isbn'].unique():
    tmp = get_reviews_from_isbn(url, isbn)
    print(tmp.shape)
    reviews_df.append(tmp)
    time.sleep(0.1)

KeyboardInterrupt: 

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_data_for_isbn(isbn):
    return get_reviews_from_isbn(url, isbn)


def get_reviews_from_isbn(url, isbn):
    full_url = url+f"{isbn}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    # Retry mechanism
    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(full_url, headers=headers, allow_redirects=True, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                break
        except requests.RequestException as e:
            print(f"Error fetching data for ISBN {isbn}: {e}")
            time.sleep(2 ** attempt)  # Exponential backoff
        else:
            print(f"Failed to fetch data after {attempt + 1} attempts. Status Code: {response.status_code}")
            return pd.DataFrame()  # Return empty DataFrame if unsuccessful

    # Parsing logic as before
    reviews = soup.find_all('article', class_='ReviewCard')
    all_reviews = []

    for review in reviews:
        review_data = parse_review(review, isbn)  # Modularize parsing into a function
        all_reviews.append(review_data)

    return pd.DataFrame(all_reviews)


def parse_review(review, isbn):
    try:
        reviewer_name = review.find('div', {'data-testid': 'name'}).get_text(strip=True)
        rating_section = review.find('div', class_='ShelfStatus')
        rating = rating_section.find('span', {'role': 'img'}).get('aria-label', '').split()[1]
        comment_section = review.find('div', {'data-testid': 'contentContainer'})
        comment = comment_section.get_text(strip=True) if comment_section else "No comment provided"
    except Exception as e:
        print(f"Error parsing review for ISBN {isbn}: {e}")
        return {'isbn': isbn, 'reviewer': '', 'rating': '', 'comment': ''}
    
    return {
        'isbn': isbn,
        'reviewer': reviewer_name,
        'rating': rating,
        'comment': comment
    }


def fetch_reviews_parallel(isbn_list):
    reviews_df = []
    with ThreadPoolExecutor(max_workers=5) as executor:  # Reduced number of workers to lessen the load
        future_to_isbn = {executor.submit(fetch_data_for_isbn, isbn): isbn for isbn in isbn_list}
        for future in as_completed(future_to_isbn):
            isbn = future_to_isbn[future]
            try:
                data = future.result()
                reviews_df.append(data)
                print(f"Data fetched for ISBN {isbn} with shape {data.shape}")
                time.sleep(1)  # Throttle requests
            except Exception as exc:
                print(f"{isbn} generated an exception: {exc}")
    return reviews_df


In [13]:
url = f"https://www.goodreads.com/book/isbn/"
isbn_list = goodread_df['isbn'].unique()

reviews_dataframes = fetch_reviews_parallel(isbn_list)

Error parsing review for ISBN 0439358078: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 0439358078: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 0439358078: 'NoneType' object has no attribute 'get'
Data fetched for ISBN 0439358078 with shape (30, 4)
Error parsing review for ISBN 0439785960: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 0439785960: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 0439785960: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 043965548X: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 043965548X: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 0439682584: 'NoneType' object has no attribute 'get'
Error parsing review for ISBN 0439682584: 'NoneType' object has no attribute 'get'
Data fetched for ISBN 0439785960 with shape (30, 4)
Data fetched for ISBN 043965548X with shape (30, 4)
Data fetched f

In [35]:
test = pd.concat(reviews_dataframes)
test = test[test['reviewer']!=""]
test.to_csv('reviews_goodread.csv', index=False)

In [36]:
test = pd.read_csv(r'D:\Online_Learning\Practical_DL\final_project\reviews_goodread.csv')
test.shape

(273442, 4)

In [37]:
test.head()

Unnamed: 0,isbn,reviewer,rating,comment
0,439358078,Jayson,5,(A) 86%| ExtraordinaryNotes:An angsty apprehen...
1,439358078,Navessa,5,"Seriously, don't read this review if you haven..."
2,439358078,Diane ϟ [ Lestrange ],5,Interview with JK Rowling...Stephen Fry:Can we...
3,439358078,Jayson,5,(A) 86%| ExtraordinaryNotes:It's a transitiona...
4,439358078,Hannah Azerang,5,I had to re read it. I was in such a nostalgic...


In [None]:
!p