# In this Jupyter notebook we use the website "Books to Scrape" (http://books.toscrape.com/). 
# The goal of this project is to practice web scraping.
# In Section 1 we use web scraping to retrieve the title, rating, price, link and book cover for a list of 20 books. 
# In Section 2 we construct interactive tools for the users. The users can give the title of a book as input, and they will receive the rating, link, price and book cover as the output. 


In [1]:
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib

# *Section 1: Web scraping without interaction with the final user*

# Use requests to download the home page (which lists the 20 books used here) and BeautifulSoup to parse its HTML

In [2]:
url = "http://books.toscrape.com/"
# A timeout keeps this cell from hanging forever if the site is unreachable.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx answers instead of parsing an error page as if it
# were the catalogue.
response.raise_for_status()
# Feed the raw bytes (not response.text) to the parser so that BeautifulSoup
# works out the character encoding of the document itself.
soup = BeautifulSoup(response.content, "html.parser")

# Display the HTML for just one book

In [3]:
# Collect every <article> element that carries the "product_pod" CSS class;
# the output below shows that one such element describes one book.
books_html = soup.find_all("article", attrs={"class": "product_pod"})
# Show the first match so the markup to be parsed is visible.
books_html[0]

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

# Parsing the title of one book

In [4]:
# Attribute-style navigation (.h3, .a) returns the first matching descendant,
# exactly like the equivalent find() calls.
book_title = books_html[0].h3.a
book_title

<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [6]:
# The link text is truncated ("A Light in the ..."), so read the complete
# title from the link's "title" HTML attribute instead.
book_title = books_html[0].h3.a["title"]
book_title

'A Light in the Attic'

# Parsing the price of one book 

In [9]:
# Locate the <p class="price_color"> element of the first book.
book_price = books_html[0].find("p", class_="price_color")
# Turn the element's text node into a plain str such as "£51.77".
book_price_str = str(book_price.string)

# Skip the leading currency symbol (one character) and convert the rest.
book_price_float = float(book_price_str[1:])
book_price_float

51.77

# Parsing the rating of one book 

In [10]:
# The rating is not text: it is encoded as a second CSS class next to
# "star-rating" (e.g. "Three"), so inspect the element's class list.
book_stars_html = books_html[0].find("p", attrs={"class": "star-rating"})
book_stars_html["class"]

['star-rating', 'Three']

In [11]:
def parse_rating(rating_classes):
    """Translate a star-rating CSS class list into a number of stars.

    Args:
        rating_classes: container of CSS class names taken from a
            ``<p class="star-rating ...">`` element, e.g.
            ``['star-rating', 'Three']``.

    Returns:
        The rating as an int from 1 to 5, or 0 when none of the rating
        words 'One' ... 'Five' is present.
    """
    # Checked in ascending order, so the lowest rating word wins if several
    # happen to be present.
    words_to_stars = (
        ('One', 1),
        ('Two', 2),
        ('Three', 3),
        ('Four', 4),
        ('Five', 5),
    )
    for word, stars in words_to_stars:
        if word in rating_classes:
            return stars
    return 0

In [12]:
# Hand the class list of the first book's star-rating element to parse_rating.
book_rating = parse_rating(
    books_html[0].find("p", class_="star-rating")["class"]
)
book_rating

3

# Parsing the book cover of one book 

In [13]:
base_url = "http://books.toscrape.com/"
# The <img> src attribute is relative to the page it was scraped from.
# urljoin resolves it against the base URL and, unlike plain string
# concatenation, also copes with "../" segments or already-absolute URLs.
url_for_jpeg = urljoin(base_url, books_html[0].find('img')['src'])
url_for_jpeg

'http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg'

**Show the picture below:**

In [15]:
from IPython.display import HTML, Image

# Embed the cover by URL in the cell output, drawn at 200x200.
Image(
    url=url_for_jpeg,
    width=200,
    height=200,
)

# Parsing all 20 books with a "for loop": 

In [17]:
base_url = "http://books.toscrape.com/"
# A timeout avoids hanging on an unreachable site; raise_for_status() stops
# the cell on HTTP errors instead of scraping an error page.
r = requests.get(base_url, timeout=10)
r.raise_for_status()
# Parse the raw bytes rather than r.text: r.text is decoded with the encoding
# requests guesses from the HTTP headers, and with that decoding the "£" sign
# came out as two characters (which is why the price used to be sliced with
# [2:] here but with [1:] in the single-book example above).
soup = BeautifulSoup(r.content, 'html.parser')

# One <li> grid cell per book.
books = soup.find_all('li', attrs={"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

link_images = []
titles = []
prices = []
ratings = []
for book in books:
    # Resolve the relative cover path against the page URL.
    picture = urljoin(base_url, book.find('img')['src'])
    link_images.append(picture)

    # The full title is in the link's "title" attribute (the text is truncated).
    title = book.find("h3").find("a").attrs["title"]
    titles.append(title)

    # Extract the numeric part of the price instead of skipping a fixed number
    # of leading characters, so the result does not depend on how the currency
    # symbol happened to be decoded.
    price_text = book.find("p", class_="price_color").get_text()
    price_match = re.search(r"\d+(?:,\d{3})*(?:\.\d+)?", price_text)
    if price_match is None:
        raise ValueError(f"Could not find a price in {price_text!r} for {title!r}")
    price = float(price_match.group().replace(",", ""))
    prices.append(price)

    rating = parse_rating(book.find("p", class_="star-rating").attrs["class"])
    ratings.append(rating)

# Preview the first two scraped records.
list(zip(titles, link_images, prices, ratings))[0:2]

[('A Light in the Attic',
  'http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg',
  51.77,
  3),
 ('Tipping the Velvet',
  'http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg',
  53.74,
  1)]

**Store all info in a Pandas dataframe:**

In [18]:
# Build the frame column by column. Creating it from a list of lists and
# transposing (the previous approach) forces every column to dtype "object",
# so prices and ratings would not be numeric for sorting, filtering or
# aggregating. This form also raises immediately if the scraped lists ended
# up with different lengths instead of silently padding with missing values.
list_of_list = [link_images, titles, prices, ratings]
column_names = ['link_images', 'titles', 'prices', 'ratings']
df = pd.DataFrame(dict(zip(column_names, list_of_list)))
df

Unnamed: 0,link_images,titles,prices,ratings
0,http://books.toscrape.com/media/cache/2c/da/2c...,A Light in the Attic,51.77,3
1,http://books.toscrape.com/media/cache/26/0c/26...,Tipping the Velvet,53.74,1
2,http://books.toscrape.com/media/cache/3e/ef/3e...,Soumission,50.1,1
3,http://books.toscrape.com/media/cache/32/51/32...,Sharp Objects,47.82,4
4,http://books.toscrape.com/media/cache/be/a5/be...,Sapiens: A Brief History of Humankind,54.23,5
5,http://books.toscrape.com/media/cache/68/33/68...,The Requiem Red,22.65,1
6,http://books.toscrape.com/media/cache/92/27/92...,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,http://books.toscrape.com/media/cache/3d/54/3d...,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,http://books.toscrape.com/media/cache/66/88/66...,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,http://books.toscrape.com/media/cache/58/46/58...,The Black Maria,52.15,1


# *Section 2: Interaction with the users and interactive tools*

# Choice of book, user input and output (the user input is the title of the book, and the output is the rating, price, link and book cover)

In [19]:
from IPython.display import HTML, Image, display

print('The available book titles are the following ones:')
print('')

for i in titles:
    print(i)

print('')
# Strip the stray spaces/newlines that often come along with a copy-paste,
# otherwise an otherwise-correct title would not match.
a = input('Please copy and paste one title and paste it here:  ').strip()

print('')
print(f'The book that you have chosen has the following title: {a}')

# Rows whose title equals the input exactly (normally zero or one row).
matches = df.loc[df['titles'] == a]
index = matches.index
print('')
if matches.empty:
    # Previously an unknown title crashed with "IndexError: index 0 is out of
    # bounds"; tell the user what went wrong instead.
    print(f'No book with the title {a!r} was found. '
          'Please run this cell again and paste one of the titles listed above.')
else:
    # Take the first match by position, which works whatever index df has.
    chosen_book = matches.iloc[0]

    price_a = chosen_book['prices']
    print(f'The book that you have chosen has the following price: £{price_a} ')

    print('')
    rating_a = chosen_book['ratings']
    print(f'The book that you have chosen has the following rating: {rating_a} ')

    print('')
    link_a = chosen_book['link_images']
    print(f'The cover book for that the chosen book has the following link: {link_a} ')

    print('')
    print('Here is the book cover: ')

    # Inside an if/else the image is no longer the cell's last expression,
    # so it has to be displayed explicitly.
    display(Image(url=link_a, width=200, height=200))


The available book titles are the following ones:

A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red
The Dirty Little Secrets of Getting Your Dream Job
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
The Black Maria
Starving Hearts (Triangular Trade Trilogy, #1)
Shakespeare's Sonnets
Set Me Free
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Rip it Up and Start Again
Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Olio
Mesaerion: The Best Science Fiction Stories 1800-1849
Libertarianism for Beginners
It's Only the Himalayas

Please copy and paste one title and paste it here:  Sharp Objects

The book that you have chosen has the following title: Sharp Objects

The book that you have chosen has the following price: £47.82 

The book that you ha

# Interactive tool for the data (The user scrolls with an interactive tool and the output is rating, price, link, book cover)

In [20]:
from ipywidgets import interact
import ipywidgets as widgets

def my_function(x):
    """Return the row of the module-level ``df`` at integer position ``x``.

    Args:
        x: zero-based row position (the value supplied by the slider).

    Returns:
        The selected row, as produced by ``df.iloc[x]``.
    """
    selected_row = df.iloc[x]
    return selected_row

# create a slider that spans every row position of df (rather than a
# hard-coded 0-19), starting at row 10 when the frame is long enough
last_position = max(len(df) - 1, 0)
interact(my_function, x=widgets.IntSlider(min=0, max=last_position, step=1, value=min(10, last_position)))

interactive(children=(IntSlider(value=10, description='x', max=19), Output()), _dom_classes=('widget-interact'…

<function __main__.my_function(x)>

In [21]:
from ipywidgets import interact
import ipywidgets as widgets

def cover_book(x):
    """Return the cover image of the book at integer position ``x`` of ``df``.

    Args:
        x: zero-based row position (the value supplied by the slider).

    Returns:
        An ``Image`` built from the row's ``link_images`` URL, sized 200x200.
    """
    # The slider yields a position, so look the row up by position (.iloc),
    # like my_function does. A label lookup (.loc) only gives the same row
    # while df still has its default 0..n-1 index.
    return Image(url=df['link_images'].iloc[x], width=200, height=200)

# create a slider that spans every row position of df (rather than a
# hard-coded 0-19), starting at row 10 when the frame is long enough
last_position = max(len(df) - 1, 0)
interact(cover_book, x=widgets.IntSlider(min=0, max=last_position, step=1, value=min(10, last_position)))

interactive(children=(IntSlider(value=10, description='x', max=19), Output()), _dom_classes=('widget-interact'…

<function __main__.cover_book(x)>

In [22]:
from ipywidgets import interact
def image(x):
    """Print the data of the book at position ``x`` and return its cover.

    Args:
        x: zero-based row position (the value supplied by the slider).

    Returns:
        An ``Image`` built from the row's ``link_images`` URL, sized 200x200.
        The row itself is printed as a side effect via ``my_function``.
    """
    # Look the cover up by position (.iloc) so it always belongs to the same
    # row that my_function(x) prints; the two lookups used to mix .loc and
    # .iloc. The local is named `cover` so it no longer shadows this function,
    # and Image comes from the notebook-level import like in cover_book.
    cover = Image(url=df['link_images'].iloc[x], width=200, height=200)
    print(my_function(x))
    return cover

# create a slider that spans every row position of df (rather than a
# hard-coded 0-19), starting at row 10 when the frame is long enough
last_position = max(len(df) - 1, 0)
interact(image, x=widgets.IntSlider(min=0, max=last_position, step=1, value=min(10, last_position)))

interactive(children=(IntSlider(value=10, description='x', max=19), Output()), _dom_classes=('widget-interact'…

<function __main__.image(x)>