In [2]:
import json
import numpy as np
import pandas as pd
import re
import tqdm
import ast

In [2]:
df = pd.read_csv("Books_Final.csv")

In [3]:
df.columns

Index(['Book Name', 'Author', 'ISBN 13', 'ISBN', 'Number of Pages',
       'Average Rating', 'Total Number of Ratings', 'Publication Date',
       'Date Started', 'Date Completed', 'Name', 'Series', 'Volume',
       'Goodreads Url', 'Description', 'Genre', 'Image Url', 'Low Links',
       'High Links', 'Publishers'],
      dtype='object')

## Dropping Values

### Extra Columns

In [4]:
# Remove the columns that are not used downstream. The frame is rebound instead
# of being mutated with inplace=True, so the step is an ordinary expression
# whose result is explicit; `columns=` replaces the positional list + axis=1.
df = df.drop(columns=['Date Started', 'Date Completed', 'Book Name', 'Total Number of Ratings'])

### The Image Url Columns

In [5]:
def fill_na(value):
    """Turn the empty-list marker string into a missing value.

    Parameters
    ----------
    value : object
        A single cell value; it is compared against the literal string "[]".

    Returns
    -------
    object
        ``np.nan`` when ``value == "[]"``, otherwise ``value`` unchanged.
    """
    return np.nan if value == "[]" else value

In [6]:
# Run every High Links cell through fill_na and store the result back.
high_links = df["High Links"]
df["High Links"] = high_links.map(fill_na)

In [7]:
def fill_na(value):
    """Return NaN for a link payload that contains no URLs at all.

    The payload handled here is the string form of a dict that maps a
    resolution key ('1x', '1.5x', ...) to a list of URLs. Rather than comparing
    against one hard-coded spelling of the empty dict, the string is parsed and
    treated as missing when it is a dict whose values are all empty
    lists/tuples (an empty dict counts as well). This still covers the
    original seven-key empty payload, and also survives a different key set,
    key order or spacing.

    Note: this definition reuses the name of the earlier ``fill_na`` helper and
    replaces it once this cell has run.

    Parameters
    ----------
    value : object
        A single cell value. Non-string values (e.g. NaN) are returned as-is.

    Returns
    -------
    object
        ``np.nan`` when the payload holds no URLs, otherwise ``value`` unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal: leave the cell untouched.
        return value
    if isinstance(parsed, dict) and all(
        isinstance(urls, (list, tuple)) and len(urls) == 0
        for urls in parsed.values()
    ):
        return np.nan
    return value

In [8]:
# Run every Low Links cell through fill_na (as most recently defined) and
# store the result back.
low_links = df["Low Links"]
df["Low Links"] = low_links.map(fill_na)

### The ISBN Column

In [9]:
df["ISBN"].isna().sum()

140

In [10]:
df["ISBN 13"].isna().sum()

127

In [11]:
df[(df["ISBN"].isna()) & (df["ISBN 13"].isna())].shape

(126, 16)

In [12]:
df.dropna(thresh=13).shape

(407, 16)

In [13]:
df[df["High Links"].isna()].shape

(110, 16)

In [14]:
df[df["Image Url"].isna()].shape

(11, 16)

In [15]:
df[df["Low Links"].isna()].shape

(86, 16)

In [16]:
df[(df["Low Links"].isna()) & (df["High Links"].isna())].shape

(86, 16)

## Creating New Columns

### The Low_Res_Image_Url Column and the High_Res_Image_Url Column

In [17]:
df.columns

Index(['Author', 'ISBN 13', 'ISBN', 'Number of Pages', 'Average Rating',
       'Publication Date', 'Name', 'Series', 'Volume', 'Goodreads Url',
       'Description', 'Genre', 'Image Url', 'Low Links', 'High Links',
       'Publishers'],
      dtype='object')

In [41]:
df["Image Url"][10]

'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1407572377l/12591698.jpg'

In [42]:
def create_low_res_urls(row):
    """Collect the low-resolution image URLs for one row.

    The result starts with the row's "Image Url" (when present) and is then
    extended with the URL list stored under the first key of the parsed
    "Low Links" dict that is one of '2x' / '1.5x'. "First" follows the dict's
    own key order, not the order of ``low_keys``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a DataFrame row or a plain dict).
        "Low Links" is expected to hold the string form of a dict; anything
        that cannot be parsed that way contributes no URLs.

    Returns
    -------
    list
        The collected URLs; empty when nothing is available.
    """
    low_keys = ('2x', '1.5x')
    low_urls = []

    goodreads_url = row["Image Url"]
    # pd.notna recognises every missing-value flavour (np.nan, float("nan"),
    # None, pd.NA). The previous identity test `is not np.nan` only matched the
    # np.nan singleton and let the others through into the URL list.
    if pd.notna(goodreads_url):
        low_urls.append(goodreads_url)

    try:
        amazon_urls = ast.literal_eval(row["Low Links"])
        for key, values in amazon_urls.items():
            if key in low_keys:
                low_urls.extend(values)
                break  # only the first matching resolution is used
    except (KeyError, ValueError, SyntaxError, TypeError, AttributeError):
        # Missing column, NaN cell, unparsable text or a non-dict literal:
        # best effort, keep whatever was collected so far. Narrower than a
        # bare `except`, so KeyboardInterrupt/SystemExit still propagate.
        pass
    return low_urls

In [43]:
create_low_res_urls(df.iloc[10])

['https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1407572377l/12591698.jpg',
 'https://m.media-amazon.com/images/I/81t2pcbfgrL._AC_UY327_FMwebp_QL65_.jpg',
 'https://m.media-amazon.com/images/I/71XBl40bfiL._AC_UY218_.jpg',
 'https://m.media-amazon.com/images/I/71XBl40bfiL._AC_UY654_FMwebp_QL65_.jpg']

In [44]:
def create_high_res_urls(row):
    """Collect the high-resolution image URLs for one row.

    The result contains every entry of the parsed "High Links" list, followed
    by the URL list stored under the first key of the parsed "Low Links" dict
    that is one of '3x' / '2.5x'. "First" follows the dict's own key order.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a DataFrame row or a plain dict).
        Both columns are expected to hold string forms of Python literals;
        a cell that cannot be parsed contributes no URLs.

    Returns
    -------
    list
        The collected URLs; empty when nothing is available.
    """
    high_keys = ('3x', "2.5x")
    high_urls = []
    # Failures that simply mean "no usable links in this cell". Narrower than a
    # bare `except`, so KeyboardInterrupt/SystemExit still propagate.
    recoverable = (KeyError, ValueError, SyntaxError, TypeError, AttributeError)

    try:
        high_urls.extend(ast.literal_eval(row["High Links"]))
    except recoverable:
        pass

    try:
        amazon_urls = ast.literal_eval(row["Low Links"])
        for key, values in amazon_urls.items():
            if key in high_keys:
                high_urls.extend(values)
                break  # only the first matching resolution is used
    except recoverable:
        pass
    return high_urls

In [45]:
create_high_res_urls(df.iloc[2])

['https://m.media-amazon.com/images/I/81bxeFvkGoL._AC_UY545_FMwebp_QL65_.jpg',
 'https://m.media-amazon.com/images/I/61u-3eSwrtL._AC_UY436_FMwebp_QL65_.jpg']

In [46]:
create_low_res_urls(df.iloc[2])

['https://m.media-amazon.com/images/I/81bxeFvkGoL._AC_UY327_FMwebp_QL65_.jpg',
 'https://m.media-amazon.com/images/I/61u-3eSwrtL._AC_UY218_.jpg']

In [47]:
# Build the two derived URL columns, one row-wise pass per column, in the
# same order as before (low resolution first, then high resolution).
for column, builder in (
    ("Low_Res_Image_Url", create_low_res_urls),
    ("High_Res_Image_Url", create_high_res_urls),
):
    df[column] = df.apply(builder, axis=1)

In [48]:
df.head()

Unnamed: 0,Author,ISBN 13,ISBN,Number of Pages,Average Rating,Publication Date,Name,Series,Volume,Goodreads Url,Description,Genre,Image Url,Low Links,High Links,Publishers,Low_Res_Image_Url,High_Res_Image_Url
0,Patrick Rothfuss,,,93,4.0,2014-01-01,The Lightning Tree,Standalone,1,https://www.goodreads.com/book/show/55262194,The Lightning Tree is a companion short story ...,"['Fantasy', 'Fiction', 'Short Stories', 'Fanta...",https://i.gr-assets.com/images/S/compressed.ph...,,,,[https://i.gr-assets.com/images/S/compressed.p...,[]
1,Randall Munroe,9780544000000.0,544272994.0,303,4.14,2014-09-02,What If? Serious Scientific Answers to Absurd ...,What If?,1,https://www.goodreads.com/book/show/21413662,Randall Munroe left NASA in 2005 to start up h...,"['Nonfiction', 'Science', 'Humor', 'Audiobook'...",https://i.gr-assets.com/images/S/compressed.ph...,,,Mariner Books,[https://i.gr-assets.com/images/S/compressed.p...,[]
2,James S.A. Corey,9780357000000.0,356510395.0,528,4.54,2021-11-30,Leviathan Falls,The Expanse,9,https://www.goodreads.com/book/show/58783427,"The Laconian Empire has fallen, setting the th...","['Science Fiction', 'Fiction', 'Space', 'Space...",,{'1x': ['https://m.media-amazon.com/images/I/8...,,,[https://m.media-amazon.com/images/I/81bxeFvkG...,[https://m.media-amazon.com/images/I/81bxeFvkG...
3,James S.A. Corey,9780316000000.0,316332879.0,534,4.57,2019-03-26,Tiamat's Wrath,The Expanse,8,https://www.goodreads.com/book/show/28335698,Thirteen hundred gates have opened to solar sy...,"['Science Fiction', 'Fiction', 'Space', 'Space...",https://i.gr-assets.com/images/S/compressed.ph...,{'1x': ['https://m.media-amazon.com/images/I/8...,['https://images-na.ssl-images-amazon.com/imag...,Orbit Books,[https://i.gr-assets.com/images/S/compressed.p...,[https://images-na.ssl-images-amazon.com/image...
4,James S.A. Corey,9780316000000.0,316332828.0,608,4.37,2017-12-05,Persepolis Rising,The Expanse,7,https://www.goodreads.com/book/show/28335696,In the thousand-sun network of humanity's expa...,"['Science Fiction', 'Fiction', 'Space', 'Space...",https://i.gr-assets.com/images/S/compressed.ph...,{'1x': ['https://m.media-amazon.com/images/I/9...,['https://images-na.ssl-images-amazon.com/imag...,Orbit Books,[https://i.gr-assets.com/images/S/compressed.p...,[https://images-na.ssl-images-amazon.com/image...


In [49]:
df["Author"].isna().sum()

0

In [50]:
# Replace missing values column by column with the placeholder each column uses.
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# works on an intermediate Series, which pandas flags as chained assignment and
# which never reaches `df` once Copy-on-Write is enabled.
fill_values = {
    "Low_Res_Image_Url": "[]",
    "High_Res_Image_Url": "[]",
    "Genre": "[]",
    "Publishers": "",
    "Description": "",
    "Publication Date": "",
    "Average Rating": "",
    "Number of Pages": "",
    "ISBN 13": "",
}
for column, filler in fill_values.items():
    df[column] = df[column].fillna(value=filler)

In [51]:
# Cast every present ISBN-13 value to int; the "" placeholder is passed
# through as an empty string.
isbn13 = df["ISBN 13"]
df["ISBN 13"] = isbn13.map(lambda raw: '' if raw == "" else int(raw))

In [52]:
def get_date_format(value):
    """Format a date-like value as e.g. "June 07, 2012".

    Parameters
    ----------
    value : object
        Anything ``pd.to_datetime`` accepts, typically an ISO date string.

    Returns
    -------
    object
        The formatted string, or ``value`` itself (unchanged) when it is
        missing, empty, unparsable, or outside the range pandas timestamps
        can represent.
    """
    try:
        date = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        # Parser and out-of-bounds errors are ValueError subclasses.
        return value
    if date is pd.NaT:
        # "" / NaN parse to NaT, which has no strftime; hand the input back.
        return value
    try:
        return date.strftime("%B %d, %Y")
    except (ValueError, AttributeError):
        # e.g. to_datetime(None) returns None, which cannot be formatted.
        return value

In [53]:
get_date_format(df["Publication Date"][10])

'June 07, 2012'

In [54]:
# Rewrite each publication date through get_date_format and store it back.
publication_dates = df["Publication Date"]
df["Publication Date"] = publication_dates.map(get_date_format)

In [55]:
# Persist the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the list-valued *_Image_Url columns come back as strings after
# a reload (see the later ast.literal_eval calls), so they need re-parsing.
df.to_csv("FINAL_DF_WITH_EVERYTHING.csv", index=False)

In [3]:
# Reload the CSV written earlier and display the Genre column.
# NOTE(review): later cells show New_Low_Res_Image_Url / New_High_Res_Image_Url
# columns in this frame, but no visible cell creates them — presumably the CSV
# was extended outside this notebook; confirm before relying on Restart & Run All.
df = pd.read_csv("FINAL_DF_WITH_EVERYTHING.csv")
df["Genre"]

0      ['Fantasy', 'Fiction', 'Short Stories', 'Fanta...
1      ['Nonfiction', 'Science', 'Humor', 'Audiobook'...
2      ['Science Fiction', 'Fiction', 'Space', 'Space...
3      ['Science Fiction', 'Fiction', 'Space', 'Space...
4      ['Science Fiction', 'Fiction', 'Space', 'Space...
                             ...                        
445    ['Fantasy', 'Young Adult', 'Fiction', 'Fantasy...
446    ['Fantasy', 'Fiction', 'Young Adult', 'Fantasy...
447    ['Fantasy', 'Classics', 'Fiction', 'Adventure'...
448    ['Fantasy', 'Fiction', 'Young Adult', 'Fantasy...
449    ['Classics', 'Fiction', 'Romance', 'Historical...
Name: Genre, Length: 450, dtype: object

In [7]:
# Make every description a plain string. Missing descriptions (NaN after the
# CSV round-trip) become "" first: str(NaN) would otherwise put the literal
# text "nan" into the description.
df["Description"] = df["Description"].fillna("").astype(str)

In [19]:
df.columns

Index(['Author', 'ISBN 13', 'ISBN', 'Number of Pages', 'Average Rating',
       'Publication Date', 'Name', 'Series', 'Volume', 'Goodreads Url',
       'Description', 'Genre', 'Image Url', 'Low Links', 'High Links',
       'Publishers', 'Low_Res_Image_Url', 'High_Res_Image_Url',
       'New_Low_Res_Image_Url', 'New_High_Res_Image_Url'],
      dtype='object')

In [23]:
df["High Links"][10], df["New_High_Res_Image_Url"][10]

("['https://images-na.ssl-images-amazon.com/images/I/81t2pcbfgrL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/81bSs2e--VL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71XLEDYZfDL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/81xqAxr5wfL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41mDwIDbxbL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71XBl40bfiL.jpg', 'https://images-na.ssl-images-amazon.com/images/I/915mh7F7qKL.jpg']",
 "['https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/81t2pcbfgrL.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/81bSs2e--VL.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/71XLEDYZfDL.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/81xqAxr5wfL.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/41mDwIDbxbL.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/71XBl4

In [26]:
df[df["High Links"].isna()]

Unnamed: 0,Author,ISBN 13,ISBN,Number of Pages,Average Rating,Publication Date,Name,Series,Volume,Goodreads Url,Description,Genre,Image Url,Low Links,High Links,Publishers,Low_Res_Image_Url,High_Res_Image_Url,New_Low_Res_Image_Url,New_High_Res_Image_Url
0,Patrick Rothfuss,,,93,4.00,"January 01, 2014",The Lightning Tree,Standalone,1,https://www.goodreads.com/book/show/55262194,The Lightning Tree is a companion short story ...,"['Fantasy', 'Fiction', 'Short Stories', 'Fanta...",https://i.gr-assets.com/images/S/compressed.ph...,,,,['https://i.gr-assets.com/images/S/compressed....,[],['https://bucketforbookgallery.s3.ap-south-1.a...,[]
1,Randall Munroe,9.780544e+12,0544272994,303,4.14,"September 02, 2014",What If? Serious Scientific Answers to Absurd ...,What If?,1,https://www.goodreads.com/book/show/21413662,Randall Munroe left NASA in 2005 to start up h...,"['Nonfiction', 'Science', 'Humor', 'Audiobook'...",https://i.gr-assets.com/images/S/compressed.ph...,,,Mariner Books,['https://i.gr-assets.com/images/S/compressed....,[],['https://bucketforbookgallery.s3.ap-south-1.a...,[]
2,James S.A. Corey,9.780357e+12,0356510395,528,4.54,"November 30, 2021",Leviathan Falls,The Expanse,9,https://www.goodreads.com/book/show/58783427,"The Laconian Empire has fallen, setting the th...","['Science Fiction', 'Fiction', 'Space', 'Space...",,{'1x': ['https://m.media-amazon.com/images/I/8...,,,['https://m.media-amazon.com/images/I/81bxeFvk...,['https://m.media-amazon.com/images/I/81bxeFvk...,['https://bucketforbookgallery.s3.ap-south-1.a...,['https://bucketforbookgallery.s3.ap-south-1.a...
9,James S.A. Corey,,,30,4.02,"November 27, 2012",Drive,The Expanse,2,https://www.goodreads.com/book/show/25899877,"Read ""Drive"", a Prequel to The Expanse series....","['Science Fiction', 'Fiction', 'Short Stories'...",https://i.gr-assets.com/images/S/compressed.ph...,{'1x': ['https://m.media-amazon.com/images/I/7...,,SyFy,['https://i.gr-assets.com/images/S/compressed....,['https://m.media-amazon.com/images/I/71tfW-GC...,['https://bucketforbookgallery.s3.ap-south-1.a...,['https://bucketforbookgallery.s3.ap-south-1.a...
11,James S.A. Corey,,,68,3.72,"January 01, 2012",Gods of Risk,The Expanse,2,https://www.goodreads.com/book/show/15837317,"As tension between Mars and Earth mounts, and ...","['Science Fiction', 'Fiction', 'Space', 'Space...",https://i.gr-assets.com/images/S/compressed.ph...,{'1x': ['https://m.media-amazon.com/images/I/8...,,Orbit,['https://i.gr-assets.com/images/S/compressed....,['https://m.media-amazon.com/images/I/81ZhBPQ6...,['https://bucketforbookgallery.s3.ap-south-1.a...,['https://bucketforbookgallery.s3.ap-south-1.a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,Hilda D. Spear,9.780333e+12,0333372867,96,4.14,"April 04, 1985",Wuthering Heights by Emily Brontë,Standalone,1,https://www.goodreads.com/book/show/130346,,"['Classics', 'Fiction', 'Romance', 'Literature...",https://i.gr-assets.com/images/S/compressed.ph...,,,,['https://i.gr-assets.com/images/S/compressed....,[],['https://bucketforbookgallery.s3.ap-south-1.a...,[]
419,Jane Austen,9.781593e+12,1593082649,260,3.84,"December 01, 1817",Northanger Abbey,Standalone,1,https://www.goodreads.com/book/show/50398,A wonderfully entertaining coming-of-age story...,"['Classics', 'Fiction', 'Romance', 'Gothic', '...",https://i.gr-assets.com/images/S/compressed.ph...,,,Barnes & Noble Classics,['https://i.gr-assets.com/images/S/compressed....,[],['https://bucketforbookgallery.s3.ap-south-1.a...,[]
420,Jane Austen,9.780141e+12,0141439807,488,3.86,"July 01, 1814",Mansfield Park,Standalone,1,https://www.goodreads.com/book/show/45032,Taken from the poverty of her parents' home in...,"['Classics', 'Fiction', 'Romance', 'Historical...",https://i.gr-assets.com/images/S/compressed.ph...,,,Penguin Books,['https://i.gr-assets.com/images/S/compressed....,[],['https://bucketforbookgallery.s3.ap-south-1.a...,[]
430,George R.R. Martin,,,1218,4.54,"August 08, 2000",A Storm of Swords,A Song of Ice and Fire,3,https://www.goodreads.com/book/show/62291,alternate cover for ISBN 055357342X/9780553573...,"['Fantasy', 'Fiction', 'Fantasy', 'Epic Fantas...",https://i.gr-assets.com/images/S/compressed.ph...,,,Bantam,['https://i.gr-assets.com/images/S/compressed....,[],['https://bucketforbookgallery.s3.ap-south-1.a...,[]


In [6]:
# Give rows without high-resolution links a fixed pair of placeholder image URLs.
# The result is assigned back explicitly: fillna(..., inplace=True) on
# df["High Links"] is chained assignment and is not guaranteed to update `df`.
df["High Links"] = df["High Links"].fillna(value="['https://images-na.ssl-images-amazon.com/images/I/21-kmLZ9t0L.jpg', 'https://d1lp72kdku3ux1.cloudfront.net/title_instance/8db/small/612711.jpg']")

In [9]:
# Give rows without low-resolution links the same fixed pair of placeholder URLs.
# The result is assigned back explicitly: fillna(..., inplace=True) on
# df["Low Links"] is chained assignment and is not guaranteed to update `df`.
# NOTE(review): the placeholder is a list literal, while populated Low Links
# cells hold a dict literal keyed by resolution — confirm this is intended.
df["Low Links"] = df["Low Links"].fillna(value="['https://images-na.ssl-images-amazon.com/images/I/21-kmLZ9t0L.jpg', 'https://d1lp72kdku3ux1.cloudfront.net/title_instance/8db/small/612711.jpg']")

In [10]:
df.Genre.isna().sum()

0

# Creating the JSON

In [8]:
# Reference record showing the target JSON shape for one book.
# The stray trailing comma after the closing brace has been removed: it turned
# `sample` into a one-element tuple wrapping the dict instead of the dict itself.
# NOTE(review): the "/n" sequences in the description look like intended "\n"
# line breaks — left untouched, confirm.
sample = {
    "name": "Rhythm of War",
    "author": "Brandon Sanderson",
    "genre": [
        "Fantasy",
        "Epic Fantasy"
    ],
    "language": "English",
    "numPages": 1220,
    "description": "After forming a coalition of human resistance against the enemy invasion, Dalinar Kholin and his Knights Radiant have spent a year fighting a protracted, brutal war. Neither side has gained an advantage, and the threat of a betrayal by Dalinar's crafty ally Taravangian looms over every strategic move./n Now, as new technological discoveries by Navani Kholin's scholars begin to change the face of the war, the enemy prepares a bold and dangerous operation. The arms race that follows will challenge the very core of the Radiant ideals, and potentially reveal the secrets of the ancient tower that was once the heart of their strength./n At the same time that Kaladin Stormblessed must come to grips with his changing role within the Knights Radiant, his Windrunners face their own problem: As more and more deadly enemy Fused awaken to wage war, no more honorspren are willing to bond with humans to increase the number of Radiants. Adolin and Shallan must lead the coalition’s envoy to the honorspren stronghold of Lasting Integrity and either convince the spren to join the cause against the evil god Odium, or personally face the storm of failure.",
    "yearPublished": 2020,
    "series": "Stormlight Archive",
    "ISBN": 9780765326386,
    "avgRating": 4.62,
    "publisher": "Tor Books",
    "imgUrlsLow": [
        "https://images-na.ssl-images-amazon.com/images/I/51-X-Q-X-QL._SX331_BO1,204,203,200_.jpg"
    ],
    "imgUrlsHigh": [
        "/images/400/Rhythm  400px.png",
        "/images/400/Jane Eyre.jpg",
        "/images/400/Harry Potter and the Prisoner Azkaban 400px.png"
    ]
}

In [9]:
df.columns

Index(['Author', 'ISBN 13', 'ISBN', 'Number of Pages', 'Average Rating',
       'Publication Date', 'Name', 'Series', 'Volume', 'Goodreads Url',
       'Description', 'Genre', 'Image Url', 'Low Links', 'High Links',
       'Publishers', 'Low_Res_Image_Url', 'High_Res_Image_Url',
       'New_Low_Res_Image_Url', 'New_High_Res_Image_Url'],
      dtype='object')

In [10]:
ast.literal_eval(df.iloc[10]["Genre"])

['Science Fiction', 'Fiction', 'Space', 'Space Opera', 'Audiobook', 'Space']

In [11]:
df.iloc[10]["Low_Res_Image_Url"], df.iloc[10]["New_Low_Res_Image_Url"]

("['https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1407572377l/12591698.jpg', 'https://m.media-amazon.com/images/I/81t2pcbfgrL._AC_UY327_FMwebp_QL65_.jpg', 'https://m.media-amazon.com/images/I/71XBl40bfiL._AC_UY218_.jpg', 'https://m.media-amazon.com/images/I/71XBl40bfiL._AC_UY654_FMwebp_QL65_.jpg']",
 "['https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/low/12591698.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/low/81t2pcbfgrL._AC_UY327_FMwebp_QL65_.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/low/71XBl40bfiL._AC_UY218_.jpg', 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/low/71XBl40bfiL._AC_UY654_FMwebp_QL65_.jpg']")

In [15]:
def _nan_to_none(value):
    """Return None for a float NaN; return any other value unchanged.

    json.dumps writes float NaN as the bare token ``NaN``, which is not valid
    JSON; None is written as ``null`` instead.
    """
    if isinstance(value, float) and value != value:
        return None
    return value


# Build one record (dict) per row, following the layout of the `sample` record.
the_json = []
for i in tqdm.tqdm(range(df.shape[0])):
    row = df.iloc[i]  # materialise the row once instead of once per field

    name = _nan_to_none(row["Name"])
    author = _nan_to_none(row["Author"])

    # Genre is stored as the string form of a list. Parse it once; a literal
    # that cannot be turned into a list is kept as parsed, as before.
    parsed_genre = ast.literal_eval(row["Genre"])
    try:
        genre = list(parsed_genre)
    except TypeError:
        genre = parsed_genre

    language = "English"
    numPages = _nan_to_none(row["Number of Pages"])
    description = _nan_to_none(row["Description"])
    yearPublished = _nan_to_none(row["Publication Date"])
    series = _nan_to_none(row["Series"])
    ISBN = _nan_to_none(row["ISBN 13"])
    avgRating = _nan_to_none(row["Average Rating"])
    publisher = _nan_to_none(row["Publishers"])
    imgUrlsLow = list(ast.literal_eval(row["New_Low_Res_Image_Url"]))
    imgUrlsHigh = list(ast.literal_eval(row["New_High_Res_Image_Url"]))

    temp_dict = {
        "name": name,
        "author": author,
        "genre": genre,
        "language": language,
        "numPages": numPages,
        "description": description,
        "yearPublished": yearPublished,
        "series": series,
        "ISBN": ISBN,
        "avgRating": avgRating,
        "publisher": publisher,
        "imgUrlsLow": imgUrlsLow,
        "imgUrlsHigh": imgUrlsHigh
    }
    the_json.append(temp_dict)

100%|██████████| 450/450 [00:01<00:00, 314.38it/s]


In [62]:
i

449

In [20]:
genre

['Classics',
 'Fiction',
 'Romance',
 'Historical',
 'Historical Fiction',
 'Literature']

In [16]:
the_json[100]["genre"]

['Science Fiction', 'Young Adult', 'Fantasy', 'Romance', 'Audiobook']

In [17]:
the_json[100]["imgUrlsHigh"]

['https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/519FCkOOYRL.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/81ak4DBpIpL.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/61kCfD5R8PL.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/519FCkOOYRL.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/81vRz5AdpmL.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/91XbN+bLNDL.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/81ak4DBpIpL._AC_UY436_FMwebp_QL65_.jpg',
 'https://bucketforbookgallery.s3.ap-south-1.amazonaws.com/images/high/61kCfD5R8PL._AC_UY327_FMwebp_QL65_.jpg']

In [18]:
# Serialise the collected records to disk. The `with` block closes the file
# even if serialisation raises part-way through; json.dump streams the same
# text that json.dumps would have produced.
with open("../FINAL_2.json", "w", encoding="utf-8") as json_file:
    json.dump(the_json, json_file)