# In this section data is loaded into spark dataframe and filtered data
in this section there a data pipeline in order to extract data from json file transform it using PySpark and load it back into anothor json file.

### Importing libraries

In [4]:
#importing the needed libraries
import json
import gzip
from pyspark.sql.functions import col
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

ModuleNotFoundError: No module named 'findspark'

## Sample of data

In [2]:
with gzip.open("goodreads_books_.json.gz",'r') as f:
    print(f.readline())

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

###  Parseline Function to parse data we care about from every single line

In [3]:
'''
parseLine function get single row from goodreads_books data and return
dictionary of (book id, title, number of ratings, url and cover Image)
'''

def parseLine(line):
    data = json.loads(line)
    return {
        "book_id" : data["book_id"],
        "title" : data["title_without_series"],
        "ratings" : data["ratings_count"],
        "url" : data["url"],
        "cover_image" : data["image_url"]
    }

### Loading goodreads_books and getting needed values

In [4]:
books_titles = []

with gzip.open("goodreads_books_.json.gz",'r') as f:
    i = 0
    while 100000 :
        i += 1
        line = f.readline()
        if not line:
            break;
            
        fields = parseLine(line)
        
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
            
        if ratings > 5:
            books_titles.append(fields)

In [5]:
books_titles[0]

{'book_id': '1333909',
 'title': 'Good Harbor',
 'ratings': '10',
 'url': 'https://www.goodreads.com/book/show/1333909.Good_Harbor',
 'cover_image': 'https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png'}

###  Loading data into Spark Dataframe to clean it

In [6]:
#creating a dataframe FROM books_titles list of dictionary in order to clean data
df = spark.createDataFrame(books_titles)

In [7]:
df.head()

Row(book_id='1333909', cover_image='https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png', ratings='10', title='Good Harbor', url='https://www.goodreads.com/book/show/1333909.Good_Harbor')

In [8]:
#Converting Ratings column from String datatype to Integer Data type
df = df.withColumn("ratings",col("ratings").cast("Integer")) 

In [9]:
df.select('title').show(truncate=False)

+--------------------------------------------------------------------+
|title                                                               |
+--------------------------------------------------------------------+
|Good Harbor                                                         |
|The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)                 |
|Best Friends Forever                                                |
|Runic Astrology: Starcraft and Timekeeping in the Northern Tradition|
|The Aeneid for Boys and Girls                                       |
|The Wanting of Levine                                               |
|All's Fairy in Love and War (Avalon: Web of Magic, #8)              |
|The Devil's Notebook                                                |
|Crowner Royal (Crowner John Mystery, #13)                           |
|The House of Memory (Pluto's Snitch #2)                             |
|The Te Of Piglet                                                    |
|Spiri

### Cleaning Data

In [10]:
#importing regexp_replace from pyspark to perform regular expressions
from pyspark.sql.functions import regexp_replace,lower

#removing all special chars from titles so that searching for book titles become easier and more efficient
#removing all chars apart from  a-z, A-Z, 0-9 and ' '(space)
df = df.withColumn("title", regexp_replace(df["title"], "[^a-zA-Z0-9 ]", "")); 

#Convering title column into lower case so can search for book titles become more efficient
df = df.withColumn("title", lower(df["title"])); 

#replacing more than one space to single space
df = df.withColumn("title", regexp_replace(df["title"], "\s+", " ")); 

In [11]:
df.select('title').show(truncate=False)

+-------------------------------------------------------------------+
|title                                                              |
+-------------------------------------------------------------------+
|good harbor                                                        |
|the unschooled wizard sun wolf and starhawk 12                     |
|best friends forever                                               |
|runic astrology starcraft and timekeeping in the northern tradition|
|the aeneid for boys and girls                                      |
|the wanting of levine                                              |
|alls fairy in love and war avalon web of magic 8                   |
|the devils notebook                                                |
|crowner royal crowner john mystery 13                              |
|the house of memory plutos snitch 2                                |
|the te of piglet                                                   |
|spirit lake die leg

In [12]:
### Filter using length of the column in pyspark
 
    
from pyspark.sql.functions import length
df = df.where(length(col("title")) > 0)

 ###  Saving Cleaned data into Json File

In [None]:
#Storing our dataframe into Json file format by converting it into pandas DF
pandasDF = df.toPandas()
#pandasDF.to_json("books_titles.json")

#  Implementing Search engine after preparing data


###  Loading data into vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(pandasDF["title"])
#tfidf = vectorizer.fit_transform(df["title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

### HTML  code to show image and link of book

In [None]:
def clickable(val):
    return '<a target="_blank" href="{}">Book_Link</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

In [None]:
def search(query):
    processed = re.sub("[^a-zA-Z0-9 ]","",query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity,-10)[-10:]
    results = pandasDF.iloc[indices]
    return results.head(10).style.format({'url':clickable, 'cover_image' : show_image})

##  Searching a book

In [None]:
res = search("No Excusees")