# Loading data into a Spark DataFrame and filtering it
This section implements a data pipeline that extracts data from a JSON file, transforms it using PySpark, and loads it into another JSON file.

### Importing libraries

In [1]:
# Import the libraries this notebook needs and start the Spark session.
import json
import gzip

import findspark
# findspark.init() must run before anything is imported from pyspark:
# it locates the Spark installation and makes the pyspark package importable.
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [2]:
def parseLine(line):
    """Pick the fields used by this notebook out of one goodreads_books record.

    Parameters
    ----------
    line : str or bytes
        One JSON-encoded record, i.e. a single row of the goodreads_books data.

    Returns
    -------
    dict
        A dictionary with the keys ``book_id``, ``title``, ``ratings``, ``url``
        and ``cover_image``. The values are copied unchanged from the record's
        ``book_id``, ``title_without_series``, ``ratings_count``, ``url`` and
        ``image_url`` fields respectively.

    Raises
    ------
    json.JSONDecodeError
        If ``line`` is not valid JSON.
    KeyError
        If the decoded record lacks one of the source fields listed above.
    """
    data = json.loads(line)
    return {
        "book_id" : data["book_id"],
        "title" : data["title_without_series"],
        "ratings" : data["ratings_count"],
        "url" : data["url"],
        "cover_image" : data["image_url"]
    }

### Loading goodreads_books and getting needed values

In [3]:
# Read the first MAX_LINES records of the gzipped goodreads_books dump and keep
# the parsed fields of every book that has more than MIN_RATINGS ratings.
MAX_LINES = 100000  # upper bound on the number of lines read from the file
MIN_RATINGS = 5     # a book is kept only when its rating count exceeds this

books_titles = []

with gzip.open("goodreads_books_.json.gz",'r') as f:
    for _ in range(MAX_LINES):
        line = f.readline()
        if not line:
            # readline() returns an empty value at end of file
            break

        fields = parseLine(line)

        try:
            ratings = int(fields["ratings"])
        except (TypeError, ValueError):
            # Rating count is missing (null) or not an integer literal:
            # skip the record instead of aborting the whole load.
            continue

        if ratings > MIN_RATINGS:
            books_titles.append(fields)

###  Loading data into Spark Dataframe

In [4]:
# Build a Spark DataFrame from books_titles (the list of dictionaries collected
# while loading) so that the data can be cleaned with Spark column operations.
df = spark.createDataFrame(books_titles)

In [5]:
#df.show()

In [6]:
# Convert the "ratings" column from string to integer type, replacing the
# column in place under the same name.
df = df.withColumn("ratings",col("ratings").cast("Integer")) 

In [7]:
#df.show()

### Cleaning Data

In [8]:
# regexp_replace performs regular-expression substitution; lower and trim
# normalise the case and the edges of the title.
from pyspark.sql.functions import regexp_replace, lower, trim

# Normalise titles so that searching for a book title is easier and more efficient.
# Step 1: remove every character apart from a-z, A-Z, 0-9 and ' ' (space).
df = df.withColumn("title", regexp_replace(df["title"], "[^a-zA-Z0-9 ]", ""))

# Step 2: convert the title to lower case.
df = df.withColumn("title", lower(df["title"]))

# Step 3: replace runs of whitespace with a single space, then strip the spaces
# left at either end. The pattern is a raw string because "\s" is an invalid
# escape sequence in a normal Python literal. Without the trim, a title made up
# only of removed characters and spaces would end up as " " and count as non-empty.
df = df.withColumn("title", trim(regexp_replace(df["title"], r"\s+", " ")))

In [9]:
# Keep only the rows whose title still contains at least one character.
from pyspark.sql.functions import length

df = df.filter(length(df["title"]) > 0)

### Saving cleaned data into a JSON file

In [10]:
# Store the cleaned DataFrame as JSON: convert the Spark DataFrame to a pandas
# DataFrame, then write it to books_titles.json with pandas' to_json using its
# default options.
# NOTE(review): toPandas() presumably brings every row into local memory; that
# suits the capped sample loaded above, but confirm before running on the full dataset.
pandasDF = df.toPandas()
pandasDF.to_json("books_titles.json")