In [None]:
#installing scrapy
!pip install scrapy

In [None]:
#importing necessary packages
import numpy as np
import pandas as pd
import time 
import requests
from scrapy.http import TextResponse 

In [None]:
# Site root; also the address of the first listing page
url = "http://books.toscrape.com/"
# Root of the site's catalogue directory
base_url = "http://books.toscrape.com/catalogue/"
# Site root, kept under a second name
base_url1 = "http://books.toscrape.com/"
page = requests.get(url)
# Stop here if the site answered with an error status instead of parsing an error page
page.raise_for_status()
# Pass the raw bytes: page.text is decoded with the charset requests guesses, which can
# garble non-ASCII characters (e.g. "£" showing up as "Â£")
response = TextResponse(url=page.url, body=page.content, encoding="utf-8")

In [None]:
#Separately defining scraping functions for book title, rating, price, hyperlink, picture, availability, individual page, genre and description

def get_title(url):
    """Return the book titles found on a listing page.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: The ``title`` attribute of every ``<a>`` element that has
        one, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    # Pass the raw bytes: page.text is decoded with the charset requests guesses,
    # which can garble non-ASCII characters in the titles
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    return response.css("a[title]::attr(title)").extract()
  
def get_rating(url):
    """Return the star-rating class strings found on a listing page.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: The full ``class`` attribute of every ``<p>`` element whose
        class starts with ``star-rating``, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    # Pass the raw bytes so Scrapy decodes them itself instead of re-encoding
    # text that requests decoded with a guessed charset
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    return response.css("p[class^='star-rating']::attr(class)").extract()

def get_price(url):
    """Return the prices found on a listing page, without the currency sign.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: Text of every ``<p class="price_color">`` element with the
        pound sign and surrounding whitespace removed (still strings, not
        floats), in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    # Pass the raw bytes: page.text is decoded with the charset requests guesses,
    # which is what turned "£" into "Â£" in the first place
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    raw_prices = response.xpath("//p[@class = 'price_color']/text()").extract()
    # "Â" is also removed in case a mis-decoded pound sign still shows up
    return [raw.replace("Â", "").replace("£", "").strip() for raw in raw_prices]

def get_hyperlink(url):
    """Return absolute links to the books found on a listing page.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: The ``href`` of every ``<a>`` element that has a ``title``
        attribute, resolved against the address of the downloaded page, in
        document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    # The hrefs are relative to the page they appear on, so resolve them against
    # that page rather than prepending a fixed prefix that fits only one directory
    return [response.urljoin(href) for href in response.css("a[title]::attr(href)").extract()]

def get_book_picture(url):
    """Return absolute image addresses found on a listing page.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: The ``src`` of every ``<img>`` element, resolved against the
        address of the downloaded page, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    # urljoin resolves relative paths (including "../" segments) against the page
    # address; gluing them onto the site root would leave "/../" inside the URL
    return [response.urljoin(src) for src in response.css("img::attr(src)").extract()]

def get_availabilities(url):
    """Return the availability texts found on a listing page.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: For every ``<p>`` whose class contains ``stock``, the part of
        its second text node that runs from the first to the last word
        character (i.e. without the surrounding whitespace).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    # Raw string: "\w" in a normal string literal is an invalid escape sequence
    return response.xpath("//p[contains(@class,'stock')]/text()[2]").re(r"\w+.+\w")

def individual_page_url(url):
    """Return absolute addresses of the individual book pages linked from a listing page.

    Args:
        url: Address of the page to download.

    Returns:
        list[str]: The ``href`` of the image link inside every
        ``<article class="product_pod">``, resolved against the address of the
        downloaded page, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning an empty list
    page.raise_for_status()
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    relative_links = response.xpath(
        '//article[@class="product_pod"]//div[@class = "image_container"]/a/@href'
    ).extract()
    # Resolve against the page address; prefixing base_url + 'catalogue/' repeated
    # the "catalogue/" segment that base_url already ends with
    return [response.urljoin(link) for link in relative_links]

def book_genre(url):
    """Return the genre shown in the breadcrumb of a book page.

    Args:
        url: Address of the page to download.

    Returns:
        str | None: Link text of the third entry of ``<ul class="breadcrumb">``
        (presumably the book's category; confirm against the live page), or
        ``None`` when the page has no such entry.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning None
    page.raise_for_status()
    # Pass the raw bytes so Scrapy decodes them itself instead of re-encoding
    # text that requests decoded with a guessed charset
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    return response.xpath("//ul[@class = 'breadcrumb']/li[3]/a/text()").extract_first()

def book_description(url):
    """Return the description text of a book page.

    Args:
        url: Address of the page to download.

    Returns:
        str | None: The first text node of a ``<p>`` that is a direct child of
        ``<article class="product_page">``, or ``None`` when there is none.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url)
    # Fail loudly instead of parsing an error page and returning None
    page.raise_for_status()
    # Pass the raw bytes: page.text is decoded with the charset requests guesses,
    # which can garble non-ASCII characters in the description
    response = TextResponse(url=page.url, body=page.content, encoding="utf-8")
    return response.xpath("//article[@class='product_page']/p/text() ").extract_first()


In [None]:
#Defining the links for all pages for using them while scraping all pages of the website
#The range is from 1 to 51, since we have 50 pages
#base_url already ends with "catalogue/", so only the file name is appended
all_pages = [base_url + "page-{}.html".format(i) for i in range(1, 51)]

In [None]:
# One accumulator list per field that is read from the listing pages; every scraper
# returns a list for the page it is given, and that list is added to its accumulator.
all_titles, all_ratings, all_prices = [], [], []
all_hyperlinks, all_book_pictures, all_availabilities = [], [], []
individual_page = []

# Each accumulator paired with its scraper, in the order the scrapers run for a page
collectors = [
    (all_titles, get_title),
    (all_ratings, get_rating),
    (all_prices, get_price),
    (all_hyperlinks, get_hyperlink),
    (all_book_pictures, get_book_picture),
    (all_availabilities, get_availabilities),
    (individual_page, individual_page_url),
]

for listing_url in all_pages:
    for collected, scrape in collectors:
        collected.extend(scrape(listing_url))
    # Wait one second before moving on to the next listing page
    time.sleep(1)

In [None]:
# Genre and description are read from each book's own page: every address collected in
# individual_page is visited and one value per book is stored in each list.
books_genre, books_descriptions = [], []
for book_url in individual_page:
    books_genre += [book_genre(book_url)]
    books_descriptions += [book_description(book_url)]
    # Wait one second before requesting the next book
    time.sleep(1)

In [None]:
# Combining all the separate lists and turning them into one DataFrame.
# A dict of columns keeps every list as its own column (no detour through a single
# NumPy array); pandas raises a ValueError if the lists differ in length.
df = pd.DataFrame({
    'Titles': all_titles,
    'Ratings': all_ratings,
    'Prices': all_prices,
    'Hyperlinks': all_hyperlinks,
    'Book_Pictures': all_book_pictures,
    'Availabilities': all_availabilities,
    'Individual_Page': individual_page,
    'Books_Genre': books_genre,
    'Books_Descriptions': books_descriptions,
})

In [None]:
# Converting DataFrame into a csv file (index=False leaves the row index out of the file)
df.to_csv('books.csv', index=False)

In [None]:
# Load the saved csv file back into a DataFrame for the analysis below
data = pd.read_csv(filepath_or_buffer="books.csv")

In [None]:
# 1. Average price across all books
data.Prices.mean()

35.07034999999999

In [None]:
# 2. Most expensive genre: average the prices within each genre, order the averages
# from highest to lowest, and keep only the first entry
data.groupby('Books_Genre')['Prices'].mean().sort_values(ascending=False).iloc[:1]

Books_Genre
Suspense    58.33
Name: Prices, dtype: float64

In [None]:
# 3.1 Replacing the string values of the Rating column by numeric values to further use them in calculations
RATING_TO_NUMBER = {
    'star-rating One': 1,
    'star-rating Two': 2,
    'star-rating Three': 3,
    'star-rating Four': 4,
    'star-rating Five': 5,
}
# The whole column is assigned in one step: chained assignment such as
# data.Ratings[mask] = n may write to a temporary copy and leave `data` unchanged.
# Values that are not in the mapping (e.g. already-converted numbers) pass through
# untouched, so the cell can be re-run; pd.to_numeric then yields a numeric column and
# raises a ValueError if an unexpected label is left over.
data['Ratings'] = pd.to_numeric(
    data['Ratings'].map(lambda label: RATING_TO_NUMBER.get(label, label))
)

In [None]:
# 3.2 The correlation coefficient measures the strength of the linear relationship between
# two numeric variables. The value below is positive but very close to zero, which is far
# too weak to claim that books with higher prices have higher ratings.
# Correlation also says nothing about causality: it cannot show that one factor causes the other.
data.Prices.corr(data.Ratings)

0.028166239485873015