## Scraping The New York Times Best Sellers Books

In [21]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import json
import threading

## Book class

In [22]:
class Book:
    """
    One best-seller entry: its title, cover image url, position and description.

    Every constructor argument is kept on an underscore-prefixed attribute and
    exposed through a read/write property of the same name.
    """

    def __init__(self, title, img_url, position, description):
        # Assigning through the properties creates the private attributes.
        self.title = title
        self.img_url = img_url
        self.position = position
        self.description = description

    @property
    def title(self):
        """The book's title."""
        return self._title

    @title.setter
    def title(self, value):
        self._title = value

    @property
    def img_url(self):
        """Url of the book's cover image."""
        return self._img_url

    @img_url.setter
    def img_url(self, value):
        self._img_url = value

    @property
    def position(self):
        """The book's position in its list."""
        return self._position

    @position.setter
    def position(self, value):
        self._position = value

    @property
    def description(self):
        """Short description of the book."""
        return self._description

    @description.setter
    def description(self, value):
        self._description = value

    def __str__(self):
        """Return the book serialized as a JSON object string."""
        return json.dumps({
            "title": self.title,
            "img_url": self.img_url,
            "position": self.position,
            "description": self.description,
        })


## Category class

In [23]:
class Category:
    """
    A best-sellers category: its name and the books listed under it.

    Both values are kept on underscore-prefixed attributes and exposed through
    read/write properties of the same name.
    """

    def __init__(self, name, best_sellers_books):
        self.name = name
        self.best_sellers_books = best_sellers_books

    @property
    def name(self):
        """The category's name."""
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @property
    def best_sellers_books(self):
        """The books of this category (an iterable, normally of Book objects)."""
        return self._best_sellers_books

    @best_sellers_books.setter
    def best_sellers_books(self, best_sellers_books):
        self._best_sellers_books = best_sellers_books

    def __str__(self):
        """
        Return the category as text: a "category" line followed by a
        bracketed, tab-indented list with one book per line.

        The layout (including the unquoted ``books:`` key) is unchanged, so the
        result is JSON-like rather than strict JSON.
        """
        # Joining (instead of appending ',\n' and slicing the tail off) keeps
        # the opening bracket intact when there are no books at all.
        books_format = ',\n'.join(f'\t{book}' for book in self.best_sellers_books)
        return f'"category": "{self.name}",\nbooks:[\n{books_format}\n]'

## Week class

In [24]:
class Week:
    """
    One week of best-sellers data: the week's number and its categories.

    Both values are kept on underscore-prefixed attributes and exposed through
    read/write properties of the same name.
    """

    def __init__(self, number, categories):
        self.number = number
        self.categories = categories

    @property
    def number(self):
        """The week's number."""
        return self._number

    @number.setter
    def number(self, number):
        self._number = number

    @property
    def categories(self):
        """The categories of this week (an iterable, normally of Category objects)."""
        return self._categories

    @categories.setter
    def categories(self, categories):
        self._categories = categories

    def get_dict(self):
        """
        Return the week as a dict with the keys "week" and "data".

        This is a regular instance method: it was previously declared as a
        static method while still taking ``self``, so ``week.get_dict()``
        raised a TypeError. ``Week.get_dict(week)`` keeps working.
        """
        return {"week": self.number, "data": self.categories}

    def __str__(self):
        """
        Return the week as text: the week number and the categories, which are
        separated by a blank line and wrapped in ``[{ ... }]``.
        """
        # Joining (instead of appending ',\n\n' and slicing the tail off) keeps
        # the opening '[{' intact when there are no categories at all.
        categories_format = ',\n\n'.join(f'{category}' for category in self.categories)
        output = '[{\n' + categories_format + '\n}]'
        return '{' + f'"week": "{self.number}", "data":{output}' + '}'
       

## Utility functions

In [25]:
def adjustDateFormation(date):
    """
    Render a date with slashes instead of dashes.

    :param date: datetime/date object (any value whose str() form should be converted)
    :return: str(date) with every '-' replaced by '/'
    """
    dash_separated_parts = str(date).split('-')
    return '/'.join(dash_separated_parts)
    

In [26]:
def get_past_weeks_dates(initial_date, number_of_weeks):
    """
    Build the date strings of the current week and the weeks before it.

    :param initial_date: datetime object of the current week date
    :param number_of_weeks: total number of weeks to cover, the current one included
    :return: list of strings: '' for the current week, followed by the
             yyyy/mm/dd date of each earlier week, going back 7 days at a time
    """
    # The current week is represented by an empty string rather than a date.
    earlier_weeks = [
        (initial_date - timedelta(days=7 * weeks_back)).date()
        for weeks_back in range(1, number_of_weeks)
    ]

    # Convert every entry to the yyyy/mm/dd format ('' stays '').
    return [adjustDateFormation(entry) for entry in [''] + earlier_weeks]
    

In [27]:
def parse_categories(soup_categories):
    """
    Extract the category names from the category tags.

    :param soup_categories: iterable of bs4 tags, one per category
    :return: list of all the category names in order of appearance
    """
    def extract_name(tag):
        # Drop the 'amp;' of '&amp;' so an escaped ampersand reads as '&'.
        markup = str(tag).replace('amp;', '')
        # The name is the text between the first '>' and the next '<'.
        after_opening_tag = markup.split('>')[1]
        return after_opening_tag.partition('<')[0]

    return [extract_name(tag) for tag in soup_categories]
    

In [28]:
def parse_books(soup_books, books_per_category=5):
    """
    Turn the scraped book tags into Book objects.

    :param soup_books: sequence of bs4 tags, one per book, in order of appearance
    :param books_per_category: number of consecutive books that form one
                               category; positions restart at 1 after that many
                               books (default 5, the value that used to be
                               hard-coded here)
    :return: list of all the best sellers books in order of appearance, as Book objects
    """
    books = []

    for index, soup_book in enumerate(soup_books):
        str_row = str(soup_book)

        # Each field is cut out of the tag's markup between two fixed markers.
        img_url = str_row.split('src="')[1].split('"/>')[0]
        title = str_row.split('"name">')[1].split('</h3>')[0]
        # Curly quotation marks are stripped from the description.
        description = str_row.split('"description">')[1].split('</p>')[0].replace('\u201c', '').replace('\u201d', '')
        # Books arrive as one flat list, so the position is the 1-based index
        # within the book's own group.
        position = (index % books_per_category) + 1

        books.append(Book(title=title, img_url=img_url, position=position, description=description))

    return books


In [29]:
def match_books_to_categories(categories_names, books):
    """
    Group the flat list of books into consecutive chunks, one per category.

    :param categories_names: list of all the category names in order of appearance
    :param books: list of all the best sellers books in order of appearance, as Book objects
    :return: list of all the categories and their best sellers books, as Category objects
    """
    # books_per_category is a module-level setting: the n-th chunk of that
    # size is paired with the n-th category name.
    chunk_starts = range(0, len(books), books_per_category)

    return [
        Category(
            name=categories_names[category_index],
            best_sellers_books=books[start:start + books_per_category],
        )
        for category_index, start in enumerate(chunk_starts)
    ]
     

In [30]:
def modify_week_format(raw_week_date):
    """
    Parse the date held inside a tag's markup.

    :param raw_week_date: string of a tag whose text is a "month day, year"
                          date, e.g. '<time>October 29, 2023</time>'
    :return: datetime object of the week's date
    """
    month_names = ('January', 'February', 'March', 'April',
                   'May', 'June', 'July', 'August',
                   'September', 'October', 'November', 'December')
    month_numbers = {name: str(number) for number, name in enumerate(month_names, start=1)}

    # Keep what follows the opening tag and drop the comma after the day.
    tag_text = raw_week_date.split('>')[1].replace(',', '')
    words = tag_text.split(' ')

    month = month_numbers.get(words[0])
    day = words[1]
    # The year is the last four characters before the closing tag starts.
    year = tag_text.split('<')[0][-4:]

    return datetime.strptime(f'{year}/{month}/{day}', '%Y/%m/%d')
    

In [31]:
def get_urls_to_scrape(soup, url, number_of_weeks):
    """
    Build the url of the best-sellers page of every week to scrape.

    :param soup: a bs4 object of the url's raw data
    :param url: the web page url
    :param number_of_weeks: number of weeks to cover
    :return: list of urls, one per week: the given url followed by that week's date string
    """
    # The homepage shows the date of its week inside a <time> tag.
    time_tag = soup.find('time', {"class": "css-6068ga"})
    newest_week = modify_week_format(str(time_tag))

    # One url suffix per week, starting from the week found on the homepage.
    url_suffixes = get_past_weeks_dates(initial_date=newest_week, number_of_weeks=number_of_weeks)

    return [url + suffix for suffix in url_suffixes]

In [32]:
def scrape_url(url, week_id, timeout=30):
    """
    Scrape one weekly best-sellers page and store the result in best_sellers.

    :param url: the url of the desired web page to scrape
    :param week_id: index in the module-level best_sellers list where the
                    result is stored; the Week is numbered
                    number_of_weeks - week_id
    :param timeout: seconds to wait for the server before the request is
                    abandoned (default 30)
    :return: None; the parsed Week object is written to best_sellers[week_id]
    """
    # Without a timeout a stalled connection would block this call (and the
    # thread running it) forever.
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, "lxml")

    # Create a list of the categories names
    # (find_all is the current name of the deprecated findAll alias)
    soup_categories = soup.find_all('a', {"class": "css-nzgijy"})
    categories_names = parse_categories(soup_categories)

    # Create a list of all the best selling books as Book objects
    soup_books = soup.find_all('a', {"class": "css-g5yn3w"})
    books = parse_books(soup_books)

    # Create a list of all the categories as Category objects
    categories = match_books_to_categories(categories_names, books)

    # Create a week object
    week = Week(number=number_of_weeks - week_id, categories=categories)

    best_sellers[week_id] = week

## Main

In [33]:
url = 'https://www.nytimes.com/books/best-sellers/'
# Bound the wait so an unresponsive server cannot hang the notebook forever.
r = requests.get(url, timeout=30)
# Fail right here on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, "lxml")

In [34]:
# Set the number of weekly lists to scrape (the current week included):
number_of_weeks = 144

In [35]:
# Set the number of best sellers per category
# (the size of the chunks the flat book list is split into):
books_per_category = 5

In [36]:
# Build the list of urls to scrape, one url per week:
urls = get_urls_to_scrape(soup=soup, url=url, number_of_weeks=number_of_weeks)

In [37]:
# Declare the list that holds the final output: one slot per week,
# filled by scrape_url at the week's index
best_sellers = [None] * number_of_weeks

In [38]:
# Declare a list that keeps track of the worker threads, so they can be joined later
threads = []

In [39]:
# Start one worker thread per week, walking the week indexes from last to first
for week_id in range(number_of_weeks - 1, -1, -1):
    worker = threading.Thread(target=scrape_url, args=(urls[week_id], week_id))
    threads.append(worker)
    worker.start()

# Wait until every worker thread has finished
for worker in threads:
    worker.join()


In [40]:
# Show the week stored at index 0 (the one numbered number_of_weeks)
print(best_sellers[0])

{"week": "144", "data":[{
"category": "Combined Print & E-Book Fiction",
books:[
	{"title": "FOURTH WING", "img_url": "https://storage.googleapis.com/du-prd/books/images/9781649374042.jpg", "position": 1, "description": "Violet Sorrengail is urged by the commanding general, who also is her mother, to become a candidate for the elite dragon riders."},
	{"title": "BLOOD LINES", "img_url": "https://storage.googleapis.com/du-prd/books/images/9781501101816.jpg", "position": 2, "description": "The second book in the Scott Brodie and Maggie Taylor series. After a mission in Venezuela, Brodie and Taylor search for the murderer of a fellow agent."},
	{"title": "WILDFIRE", "img_url": "https://storage.googleapis.com/du-prd/books/images/9781668026274.jpg", "position": 3, "description": "The second book in the Maple Hills series. Two summer camp counselors who previously had a one-night stand may run afoul of the camp\u2019s rules."},
	{"title": "LESSONS IN CHEMISTRY", "img_url": "https://storage.g