In [7]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [8]:
def bookscraper(url, timeout=30):

    '''
    Gets all information of a specific book.

    Every field is extracted on a best-effort basis: when the expected tag is
    missing or its content cannot be parsed, that field is left as None
    (``series`` falls back to 0) instead of aborting the whole book.
    Errors raised while downloading the page itself are NOT caught here and
    propagate to the caller.

    :param url: url of one book
    :param timeout: seconds to wait for the server before giving up
    :return: dictionary with all information of a book
    '''
    page_book = requests.get(url, timeout=timeout)
    soup_book = BeautifulSoup(page_book.content, 'html.parser')

    def _attempt(extractor, default=None):
        # Run one extraction step; any parsing failure yields `default`.
        # `Exception` (rather than a bare except) keeps Ctrl-C working.
        try:
            return extractor()
        except Exception:
            return default

    def _publish_year():
        # The year is the first token of the last "row" div that int() accepts.
        last_row = details.find_all("div", class_="row")[-1]
        for token in last_row.text.strip().split(' '):
            try:
                return int(token)
            except ValueError:
                continue
        # No numeric token: report "unknown" instead of leaking the token list.
        return None

    def _places():
        # Inside the data box, the div following the "Setting" label holds the links.
        boxes = details.find("div", id="bookDataBox").findChildren("div", recursive=False)
        for label, value in zip(boxes, boxes[1:]):
            if label.text == "Setting":
                return ';'.join(place.text for place in value.find_all('a'))
        return None

    title = _attempt(lambda: soup_book.find("h1").text.strip())
    author = _attempt(lambda: soup_book.find("span", itemprop="name").text.strip())
    num_reviews = _attempt(
        lambda: int(soup_book.find("meta", itemprop="reviewCount").get('content')))
    num_ratings = _attempt(
        lambda: int(soup_book.find("meta", itemprop="ratingCount").get('content')))
    avg_rating = _attempt(
        lambda: float(soup_book.find("span", itemprop="ratingValue").text.strip()))
    num_pages = _attempt(
        lambda: int(soup_book.find("span", itemprop="numberOfPages").text.strip().split(' ')[0]))

    # `details` is shared by the publish-year and places lookups; it is None
    # when the page has no details section, which makes both lookups yield None.
    details = _attempt(lambda: soup_book.find("div", id="details"))
    original_publish_year = _attempt(_publish_year)

    # series is a 0/1 flag: 1 only when the series heading exists and is non-empty.
    series_heading = _attempt(
        lambda: soup_book.find("h2", id="bookSeries").text.strip(), default="")
    series = 1 if series_heading else 0

    genres = _attempt(lambda: ";".join(
        genre.text
        for genre in soup_book.find_all("a", class_="actionLinkLite bookPageGenreLink")))
    awards = _attempt(lambda: ";".join(
        award.text for award in soup_book.find_all("a", class_="award")))
    places = _attempt(_places)

    to_append = {
        "url": url,
        "title": title,
        "author": author,
        "num_reviews": num_reviews,
        "num_ratings": num_ratings,
        "avg_rating": avg_rating,
        "num_pages": num_pages,
        "original_publish_year": original_publish_year,
        "series": series,
        "genres": genres,
        "awards": awards,
        "places": places,
    }
    return to_append

In [9]:
def scraper(list_url='https://www.goodreads.com/list/show/29013.Best_Biographies_',
            pages=range(1, 12), output_path='book_data.csv', timeout=30):
    '''
    Loop over the pages of a goodreads list (by default list 29013,
    "Best_Biographies_", pages 1 to 11), scrape every linked book with
    bookscraper and append the results to a '&'-separated csv document.

    One chunk is written per list page so partial progress survives a failure.
    The header line is written only when the output file is new or empty, so
    the file never contains header rows in the middle of the data.

    :param list_url: url of the goodreads list, without the ?page= suffix
    :param pages: iterable of page numbers to scrape
    :param output_path: csv file the rows are appended to
    :param timeout: seconds to wait for each list page before giving up
    :return: None, the books are saved to ``output_path``
    '''
    import os  # local import: used only to decide whether a header is needed

    base_url = 'https://www.goodreads.com'
    columns = ["url", "title", "author", "num_reviews", "num_ratings", "avg_rating", "num_pages",
               "original_publish_year", "series", "genres", "awards", "places"]

    # In append mode a header per page would end up as bogus data rows.
    write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

    for i in pages:
        page = requests.get(f'{list_url}?page={i}', timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Collect plain dicts and build the frame once per page; DataFrame.append
        # was removed in pandas 2.0 and was quadratic anyway.
        records = []
        for book in soup.find_all('a', class_="bookTitle"):
            href = None
            try:
                href = book.get('href')
                records.append(bookscraper(base_url + href))
            except Exception as error:
                # Best effort: one broken book must not abort the whole page,
                # but report it instead of hiding the failure.
                print(f'Skipping book {href!r}: {error!r}')

        # dtype=object keeps integers and missing values as scraped instead of
        # letting pandas upcast whole columns to float.
        df = pd.DataFrame(records, columns=columns, dtype=object)
        df.to_csv(path_or_buf=output_path, mode='a', sep='&', header=write_header)
        write_header = False
        

In [65]:
def preprocessing(data):
    '''
    Load the scraped '&'-separated book csv and clean it for analysis.

    Steps: coerce the rating and publication year to numbers, drop rows where
    either is missing or unparseable, keep years in (1900, 2021], drop
    duplicated titles, drop rows whose author is purely numeric, fill missing
    genres with "No Genre", add two normalised rating columns, turn the awards
    string into a count of awards and rename the columns used for reporting.

    Note: when every remaining book has the same avg_rating the normalised
    ratings are NaN, because the rating range is zero.

    :param data: path (or file-like object) of the csv to read
    :return: cleaned DataFrame with a fresh 0..n-1 index
    '''
    books = pd.read_csv(data, sep='&')  # Reading the data set

    # Coerce BEFORE dropping missing values so unparseable entries (for example
    # stray header rows inside the file) become NaN and are removed too.
    books['original_publish_year'] = pd.to_numeric(books['original_publish_year'], errors='coerce')
    books['avg_rating'] = pd.to_numeric(books['avg_rating'], errors='coerce')
    books = books.dropna(subset=['avg_rating', 'original_publish_year'])

    publish_year = books['original_publish_year']
    books = books[(publish_year > 1900) & (publish_year <= 2021)]
    books = books.drop_duplicates(subset=['title'])

    # An author made only of digits is a scraping artefact, not a name.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    author_is_text = books['author'].map(lambda author: not str(author).isnumeric()).astype(bool)
    # Reset the index only after the last row filter so it has no gaps.
    books = books.loc[author_is_text].reset_index(drop=True)

    # Plain assignment instead of a chained inplace fillna, which does not
    # update the frame under pandas copy-on-write.
    books['genres'] = books['genres'].fillna("No Genre")

    rating = books['avg_rating']
    rating_range = rating.max() - rating.min()
    # Min-max normalisation of avg_rating, rescaled to the range 1 to 10.
    books['minmax_norm_rating'] = 1 + (rating - rating.min()) / rating_range * 9
    # Mean normalisation, using the same scale factor and offset.
    books['mean_norm_rating'] = 1 + (rating - rating.mean()) / rating_range * 9

    # Awards are stored as "a;b;c": count the entries and keep NaN where the
    # book has none. map() also works when the whole column is NaN, where the
    # .str accessor would raise.
    books['awards'] = books['awards'].map(
        lambda entry: len(entry.split(';')) if isinstance(entry, str) else np.nan)

    books = books.rename(columns={'title': 'Title', 'original_publish_year': 'Publication year', 'minmax_norm_rating': 'Rating', 'awards':'Awards', 'num_pages':'Nº pages', 'series':'Series'})

    return books
   
    

In [66]:
if __name__ == "__main__":
    # The scraper is left disabled here; uncomment the call below to run it.
    # scraper()
    # Clean the scraped csv and keep the resulting frame in `data`.
    data = preprocessing(data='./book_data.csv')
   
