# Book data scraping using Python and some of its libraries


**Part1: In the first part, you will scrape a web site.** 

<p>
The web site is "https://www.idefix.com". You will get the information of the books in "Bilim" category.
In short, you can use "https://www.idefix.com/kategori/Kitap/Bilim/grupno=00052?Page=1" as a link.
In the "Bilim" category, there were 2531 books on 71 pages at the time this project was prepared.
</p>


<p>
First you need to find the links of the pages, then you can get the book links from these pages. After getting the link of each book, you can extract the information. You should get the values of "name", "author", "price", "number of reviewers", "rate given by the reviewers", "publication year" and "number of pages". Be careful, there may be missing values. After collecting the data, you need to store them in a file, preferably JSON.
</p>



**Part2: In the second part, you will use statistical and probability tools to understand data.**

<p>
You need to find the mean, median and standard deviation (stdev) of the book prices.
You need to draw the histogram of the reviewers' number. Use discrete values such as 10,20 on the x-axis.
You need to find out if there is a relationship between the book price and the book rate.
You need to find the distribution of the words and the most used word in the book names. You can draw a histogram.
</p>


In [None]:
from selenium.webdriver import Chrome

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt 
import requests, json, re, numpy as np
from collections import Counter
from googletrans import Translator 


In [None]:
# Shared googletrans client. It is only here because the author wanted the names
# saved in the JSON file in English and wanted to explore the googletrans library.
translator = Translator()

# Global Variables 
# Site root; the scraper prefixes it to the relative book links found on listing pages.
site_base_link = 'https://www.idefix.com'
# Listing page of the "Bilim" category; the main cell appends '?Page=<n>' to it.
home_page_url = 'https://www.idefix.com/kategori/Kitap/Bilim/grupno=00052' 
# Path handed to selenium's Chrome() in the main cell.
chrome_path = r'/usr/local/bin/chromedriver'
# Listing-page URLs; filled in by the main cell.
page_list= []


# Classes 
class Book:
    """Plain record holding the scraped details of a single book.

    Every constructor argument is optional and defaults to a descriptive
    placeholder string. Attributes are assigned in a fixed order (name,
    author, price, number_of_reviewers, rating, publication_year,
    page_count, book_count), which is also the key order of the instance's
    ``__dict__``.
    """

    def __init__(
        self,
        name="BookName",
        author="Author",
        price="Price",
        number_of_reviewers="No_of_Reviewers",
        rating="BookRating",
        publication_year="PublicationYear",
        page_count="BookPages",
        book_count="book_count",
    ):
        # Tuple targets are assigned left to right, so the attribute order
        # documented above is preserved.
        self.name, self.author, self.price = name, author, price
        self.number_of_reviewers, self.rating = number_of_reviewers, rating
        self.publication_year, self.page_count = publication_year, page_count
        self.book_count = book_count


# Methods 
def findRatingElements(parentDiv):
    """Extract the reviewer count and the rating from a book's price/review block.

    Args:
        parentDiv: Parsed element (BeautifulSoup-style) expected to contain a
            ``div`` with class ``review-container``.

    Returns:
        A ``(pplRated, rating)`` tuple of strings: the text the caller stores
        as the number of reviewers, and the rating text with a decimal comma
        replaced by a decimal point (e.g. ``'4,5'`` becomes ``'4.5'``).

    Raises:
        AttributeError: If ``parentDiv`` is ``None`` or any of the expected
            elements is missing; the caller treats that as "no rating data".
    """
    reviewDivFirstSpan = parentDiv.find('div', attrs={'class': 'review-container'}).span
    ratingSpan = reviewDivFirstSpan.span
    # Return the normalised rating. Previously the converted value was stored
    # in an unused local and the raw comma-separated text was returned.
    rating = ratingSpan.text.replace(',', '.')
    # The reviewer-count text is read from the second <span> found after the rating span.
    pplRated = ratingSpan.find_next('span').find_next('span').text
    return pplRated, rating

def _textAfterLabel(container, label):
    """Return the text of the first ``<a>`` that follows ``label``, or '' if absent."""
    if container is None:
        return ''
    label_node = container.find(text=label)
    if label_node is None:
        return ''
    link = label_node.find_next('a')
    return link.text if link is not None else ''


def _translateOrKeep(text):
    """Translate ``text`` with the module-level translator; keep it unchanged on failure."""
    if not text:
        return text
    try:
        return translator.translate(text).text
    except Exception as ex:
        # A failing translation service must not throw away the scraped value.
        print('Translation failed for {0!r}: {1}'.format(text, ex))
        return text


def getBooksDetailFromGivenPage(pagelink, all_data):
    """Scrape every book linked from one listing page and append it to ``all_data``.

    For each ``div.cart-product-box-view`` on the listing page the linked book
    page is downloaded and one ``Book`` is built from it. A field that cannot
    be found is stored as an empty string; it never inherits the value of the
    previously scraped book.

    Args:
        pagelink: URL of a category listing page.
        all_data: List that receives one ``Book`` per book found (mutated in
            place).

    Returns:
        None.
    """
    html = requests.get(pagelink).text
    page_soup = BeautifulSoup(html, 'html5lib')
    class_books = page_soup.find_all('div', attrs={'class': 'cart-product-box-view'})
    # book_count is the 1-based position of the book on this listing page.
    for book_counter, div in enumerate(class_books, start=1):
        book_link = div.a['href']
        html = requests.get(site_base_link + book_link).text
        book_soup = BeautifulSoup(html, 'html5lib')

        # Reset per book so a failed lookup leaves an explicit "missing" marker.
        number_of_reviewers, rating, price = '', '', ''

        # -- finding number_of_reviewers, rating, price
        _bookUpperInfoDiv = book_soup.find(id="productpricedetails")
        try:
            number_of_reviewers, rating = findRatingElements(_bookUpperInfoDiv)
        except (AttributeError, TypeError) as ex:
            print('Value is null')
            print('UpperDiv Exception: {0} at book_link = {1}'.format(str(ex), book_link))
        # The price is looked up on its own so a book without reviews keeps its price.
        sale_price = None
        if _bookUpperInfoDiv is not None:
            sale_price = _bookUpperInfoDiv.find(id="salePrice")
        if sale_price is not None:
            price = sale_price.text
        else:
            print('Price not found at book_link = {0}'.format(book_link))

        # -- finding name, author, publication_year, page_count
        _bookInfoDiv = book_soup.find('div', attrs={'class': 'product-description'})
        name = _translateOrKeep(_textAfterLabel(_bookInfoDiv, 'Kitap Adı:'))
        author = _translateOrKeep(_textAfterLabel(_bookInfoDiv, 'Yazar: '))
        year = _textAfterLabel(_bookInfoDiv, 'İlk Baskı Yılı:')
        pages = ''
        if _bookInfoDiv is not None:
            # NOTE(review): selector kept from the original code; it matches a
            # <font> tag carrying a "text" attribute, which may never occur on
            # the real page - confirm against the site markup.
            pages_tag = _bookInfoDiv.find('font', attrs={'text': 'Sayfa Sayısı: '})
            if pages_tag is not None:
                pages = pages_tag.text
        if not (name and author and year):
            print('Value is null')

        # Store the looked-up page count rather than the literal string "pages".
        _book = Book(name, author, price, number_of_reviewers, rating, year, pages, book_counter)
        print(_book.name, _book.author, _book.price, _book.number_of_reviewers, _book.rating, _book.publication_year, _book.book_count)
        all_data.append(_book)
    


In [None]:
# writing in Json file
def dumpDataListInJsonFile(list):
    """Append the JSON form of the given objects to ``bilimBooks.json``.

    Objects that ``json`` cannot encode natively are serialised through their
    ``__dict__``. Any error raised while opening, encoding or writing is
    reported on stdout instead of being propagated. A separator line is
    printed in every case.

    Args:
        list: Sequence of objects to serialise (the name shadows the builtin
            but is part of the existing signature).
    """
    def _as_dict(obj):
        return obj.__dict__

    try:
        # The file is opened before encoding, in append mode.
        with open("bilimBooks.json", "a") as json_file:
            serialised = json.dumps(list, default=_as_dict)
            json_file.write(serialised)
    except Exception as ex:
        print("Could not write in the file there was an exception {0}".format(ex))
    print('----------------------------------------------------------------------')


In [None]:
# Main
# Step 1: read the pager of the category page with Selenium to learn how many
# listing pages exist.
driver = Chrome(chrome_path)
try:
    main_page = driver.get(home_page_url)  # get() returns None; name kept from the original cell
    pager = driver.find_element_by_xpath('//ul[@class="pager pager-list"]')
    list_elements = pager.find_elements_by_tag_name('li')
    # Compare page numbers as integers: comparing the label strings would rank
    # '9' above '71', and non-numeric labels cannot be converted at all.
    page_numbers = [int(v.text) for v in list_elements if v.text.strip().isdigit()]
    max_page = max(page_numbers, default=1)
finally:
    # Always shut the browser down, even when the pager cannot be found.
    driver.quit()

# Rebuild the list in place so re-running this cell does not duplicate URLs.
page_list[:] = [home_page_url + '?Page=' + str(page_number) for page_number in range(1, max_page + 1)]

# Step 2: ask how many listing pages to scrape.
page_read = input("-----------Please enter for how many pages you want to run this scrapping:-------------- ")
try:
    # input() returns a string; convert it so the limit is actually applied.
    pages_to_read = max(0, int(page_read))
except ValueError:
    print('Not a whole number: {0!r}. All {1} pages will be read.'.format(page_read, len(page_list)))
    pages_to_read = len(page_list)

# Step 3: scrape the requested pages and store the result.
all_data = []
# Start from an empty file; the dump below opens the file in append mode.
open("bilimBooks.json", "w").close()
for p in page_list[:pages_to_read]:
    getBooksDetailFromGivenPage(p, all_data)
dumpDataListInJsonFile(all_data)
     


In [None]:
# Second Part - Visual Analysis
# Builds the numeric series used by the analysis cells and prints the mean,
# median and standard deviation of the book prices.

all_books_prices = []
all_books_reviewers = []
all_books_ratings = []
all_books_names = []


def _to_float(text):
    """Parse a scraped number such as '12', '4,5' or '12,50 TL'; return None if not numeric."""
    try:
        # Keep the first whitespace-separated token and use a decimal point.
        return float(str(text).split()[0].replace(',', '.'))
    except (ValueError, IndexError):
        return None


def _reviewer_sort_key(item):
    """Order books by numeric reviewer count; books without a usable count go last."""
    count = _to_float(item.number_of_reviewers)
    return (count is None, 0.0 if count is None else count)


# Sort numerically; sorting the raw strings would place '100' before '20'.
sorted_data = sorted(all_data, key=_reviewer_sort_key)

for book in sorted_data:
    all_books_names.append(book.name)
    reviewers = _to_float(book.number_of_reviewers)
    price = _to_float(book.price)
    rating = _to_float(book.rating)
    if reviewers is None or price is None or rating is None:
        # Skip the whole record so the three numeric lists stay aligned.
        print('Skipping numeric analysis for {0!r}: missing or non-numeric value'.format(book.name))
        continue
    all_books_reviewers.append(reviewers)
    all_books_prices.append(price)
    all_books_ratings.append(rating)


# Convert once with the builtin float; the np.float alias is not available in
# current NumPy releases.
_prices_array = np.array(all_books_prices, dtype=float)
if _prices_array.size:
    all_books_prices_mean = np.mean(_prices_array)
    all_books_prices_median = np.median(_prices_array)
    all_books_prices_stdev = np.std(_prices_array)
else:
    # No usable prices: report NaN instead of triggering NumPy warnings.
    all_books_prices_mean = all_books_prices_median = all_books_prices_stdev = float('nan')

print ('all_book_prices MEAN = {0}'.format(all_books_prices_mean))
print ('all_book_prices MEDIAN = {0}'.format(all_books_prices_median))
print ('all_book_prices STDEV = {0}'.format(all_books_prices_stdev))



In [None]:
# (a). You need to draw the histogram of the reviewers' number. Use discrete values such as 10,20 on the x-axis. 

# Draw a real histogram of the reviewer counts; the previous bar chart plotted
# ratings against reviewer counts instead.
bin_width = 10
reviewer_counts = np.asarray(all_books_reviewers, dtype=float)
if reviewer_counts.size:
    # Round up to the next multiple of bin_width so the maximum falls inside the last bin.
    upper_edge = (int(reviewer_counts.max() // bin_width) + 1) * bin_width
else:
    upper_edge = bin_width
# Bin edges at 0, 10, 20, ... are also used as the x-axis ticks.
x_axis = list(range(0, upper_edge + bin_width, bin_width))

plt.hist(reviewer_counts, bins=x_axis)
plt.xticks(x_axis)
plt.xlabel('Number of reviewers')
plt.ylabel('Number of books')
plt.show()


In [None]:
# (b). You need to find out if there is a relationship between the book price and the book rate.

# summarize
# The builtin float replaces the np.float alias, which current NumPy releases
# no longer provide.
all_books_prices = np.array(all_books_prices).astype(float)
all_books_ratings = np.array(all_books_ratings).astype(float)

print('book_prices: mean=%.3f stdv=%.3f' % (np.mean(all_books_prices), np.std(all_books_prices)))

print('all_books_ratings: mean=%.3f stdv=%.3f' % (np.mean(all_books_ratings), np.std(all_books_ratings)))

# finding relationship
# corrcoef returns a 2x2 matrix; the off-diagonal entry is the price/rating coefficient.
correlation = np.corrcoef(all_books_prices, all_books_ratings)
print('Correlation b/w book prices and book rating %s' % correlation)
price_rating_coefficient = correlation[0][1]
if np.isnan(price_rating_coefficient):
    # NaN (e.g. a constant series or too few points) is neither positive,
    # negative nor neutral, so report it separately.
    print ('Relationship is undefined (not enough variation in the data)')
elif price_rating_coefficient > 0:
    print ('Positive relationship')
elif price_rating_coefficient < 0:
    print ('Negative relationship')
else:
    print ('Neutral relationship')

#plot
plt.scatter(all_books_prices, all_books_ratings)
plt.show()


In [None]:
# (c). You need to find the distribution of the words and the most used word in the book names. You can draw a histogram.
words_in_names = []

# Compile once, outside the loop: everything except ASCII letters, digits and
# spaces is removed from a name before it is split into words.
pattern = r'[^A-Za-z0-9 ]'
regex = re.compile(pattern)
for name in all_books_names:
    # split() without arguments drops empty tokens, so blanks are never counted
    # and cannot take up one of the ten slots below.
    words_in_names.extend(regex.sub('', name).split())

_counter = Counter(words_in_names)
first_ten_words = _counter.most_common(10)

if first_ten_words:
    print ('Most used word: {0!r} ({1} occurrences)'.format(*first_ten_words[0]))

# word -> count, in descending order of frequency
d = dict(first_ten_words)

plt.bar(list(d.keys()), list(d.values()), color='g')
plt.yticks(np.arange(0, 50, 5))
plt.title('Word Occurrence')
plt.xlabel('Words')
plt.ylabel('Count')

plt.show()
