In [4]:
import json
import numpy as np
import pandas as pd
import re
import tqdm
import ast

In [22]:
df = pd.read_csv("Books_Final.csv")

In [17]:
df.columns

Index(['Book Name', 'Author', 'ISBN 13', 'ISBN', 'Number of Pages',
       'Average Rating', 'Total Number of Ratings', 'Publication Date',
       'Date Started', 'Date Completed', 'Name', 'Series', 'Volume',
       'Goodreads Url', 'Description', 'Genre', 'Image Url', 'Low Links',
       'High Links', 'Publishers'],
      dtype='object')

## Dropping Values

### Extra Columns

In [23]:
df.drop(['Date Started', 'Date Completed', 'Book Name', 'Total Number of Ratings'], axis=1, inplace=True)

### The Image Url Columns

In [45]:
def fill_na(value):
    if value == "[]":
        return np.nan
    else:
        return value

In [47]:
df["High Links"] = df["High Links"].apply(fill_na)

In [50]:
def fill_na(value):
    if value == "{'1x': [], '1.5x': [], '2x': [], '2.5x': [], '3x': [], '3.5x': [], '4x': []}":
        return np.nan
    else:
        return value

In [52]:
df["Low Links"] = df["Low Links"].apply(fill_na)

### The ISBN Column

In [24]:
df["ISBN"].isna().sum()

140

In [25]:
df["ISBN 13"].isna().sum()

127

In [28]:
df[(df["ISBN"].isna()) & (df["ISBN 13"].isna())].shape

(126, 16)

In [55]:
df.dropna(thresh=13).shape

(407, 16)

In [58]:
df[df["High Links"].isna()].shape

(110, 16)

In [59]:
df[df["Image Url"].isna()].shape

(11, 16)

In [60]:
df[df["Low Links"].isna()].shape

(86, 16)

In [61]:
df[(df["Low Links"].isna()) & (df["High Links"].isna())].shape

(86, 16)

## Creating New Columns

### The Low_Res_Image_Url Column and the High_Res_Image_Url Column

In [63]:
df.columns

Index(['Author', 'ISBN 13', 'ISBN', 'Number of Pages', 'Average Rating',
       'Publication Date', 'Name', 'Series', 'Volume', 'Goodreads Url',
       'Description', 'Genre', 'Image Url', 'Low Links', 'High Links',
       'Publishers'],
      dtype='object')

In [68]:
df["Image Url"][10]

'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1407572377l/12591698.jpg'

In [74]:
def create_low_res_urls(row):
    low_keys = ['2x', '1.5x']
    low_urls = []
    goodreads_url = row["Image Url"]
    if goodreads_url is not np.nan:
        low_urls.append(goodreads_url)
    try:
        amazon_urls = ast.literal_eval(row["Low Links"])
        for key, values in amazon_urls.items():
            if key in low_keys:
                low_urls.extend(values)
                break
    except:
        pass
    return low_urls

In [75]:
create_low_res_urls(df.iloc[10])

['https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1407572377l/12591698.jpg',
 'https://m.media-amazon.com/images/I/81t2pcbfgrL._AC_UY327_FMwebp_QL65_.jpg',
 'https://m.media-amazon.com/images/I/71XBl40bfiL._AC_UY218_.jpg',
 'https://m.media-amazon.com/images/I/71XBl40bfiL._AC_UY654_FMwebp_QL65_.jpg']

In [78]:
def create_high_res_urls(row):
    high_keys = ['3x', "2.5x"]
    high_urls = []
    try:
        high_urls_amazon = ast.literal_eval(row["High Links"])
        for url in high_urls_amazon:
            high_urls.append(url)
    except:
        pass
    try:
        amazon_urls = ast.literal_eval(row["Low Links"])
        for key, values in amazon_urls.items():
            if key in high_keys:
                high_urls.extend(values)
                break
    except:
        pass
    return high_urls

In [82]:
create_high_res_urls(df.iloc[2])

['https://m.media-amazon.com/images/I/81bxeFvkGoL._AC_UY545_FMwebp_QL65_.jpg',
 'https://m.media-amazon.com/images/I/61u-3eSwrtL._AC_UY436_FMwebp_QL65_.jpg']

In [84]:
create_low_res_urls(df.iloc[2])

['https://m.media-amazon.com/images/I/81bxeFvkGoL._AC_UY327_FMwebp_QL65_.jpg',
 'https://m.media-amazon.com/images/I/61u-3eSwrtL._AC_UY218_.jpg']

In [86]:
df["Low_Res_Image_Url"] = df.apply(create_low_res_urls, axis=1)
df["High_Res_Image_Url"] = df.apply(create_high_res_urls, axis=1)

In [87]:
df.head()

Unnamed: 0,Author,ISBN 13,ISBN,Number of Pages,Average Rating,Publication Date,Name,Series,Volume,Goodreads Url,Description,Genre,Image Url,Low Links,High Links,Publishers,Low_Res_Image_Url,High_Res_Image_Url
0,Patrick Rothfuss,,,93,4.0,2014-01-01,The Lightning Tree,Standalone,1,https://www.goodreads.com/book/show/55262194,The Lightning Tree is a companion short story ...,"['Fantasy', 'Fiction', 'Short Stories', 'Fanta...",https://i.gr-assets.com/images/S/compressed.ph...,,,,[https://i.gr-assets.com/images/S/compressed.p...,[]
1,Randall Munroe,9780544000000.0,544272994.0,303,4.14,2014-09-02,What If? Serious Scientific Answers to Absurd ...,What If?,1,https://www.goodreads.com/book/show/21413662,Randall Munroe left NASA in 2005 to start up h...,"['Nonfiction', 'Science', 'Humor', 'Audiobook'...",https://i.gr-assets.com/images/S/compressed.ph...,,,Mariner Books,[https://i.gr-assets.com/images/S/compressed.p...,[]
2,James S.A. Corey,9780357000000.0,356510395.0,528,4.54,2021-11-30,Leviathan Falls,The Expanse,9,https://www.goodreads.com/book/show/58783427,"The Laconian Empire has fallen, setting the th...","['Science Fiction', 'Fiction', 'Space', 'Space...",,{'1x': ['https://m.media-amazon.com/images/I/8...,,,[https://m.media-amazon.com/images/I/81bxeFvkG...,[https://m.media-amazon.com/images/I/81bxeFvkG...
3,James S.A. Corey,9780316000000.0,316332879.0,534,4.57,2019-03-26,Tiamat's Wrath,The Expanse,8,https://www.goodreads.com/book/show/28335698,Thirteen hundred gates have opened to solar sy...,"['Science Fiction', 'Fiction', 'Space', 'Space...",https://i.gr-assets.com/images/S/compressed.ph...,{'1x': ['https://m.media-amazon.com/images/I/8...,['https://images-na.ssl-images-amazon.com/imag...,Orbit Books,[https://i.gr-assets.com/images/S/compressed.p...,[https://images-na.ssl-images-amazon.com/image...
4,James S.A. Corey,9780316000000.0,316332828.0,608,4.37,2017-12-05,Persepolis Rising,The Expanse,7,https://www.goodreads.com/book/show/28335696,In the thousand-sun network of humanity's expa...,"['Science Fiction', 'Fiction', 'Space', 'Space...",https://i.gr-assets.com/images/S/compressed.ph...,{'1x': ['https://m.media-amazon.com/images/I/9...,['https://images-na.ssl-images-amazon.com/imag...,Orbit Books,[https://i.gr-assets.com/images/S/compressed.p...,[https://images-na.ssl-images-amazon.com/image...


In [115]:
df["Author"].isna().sum()

0

In [116]:
df["Low_Res_Image_Url"].fillna(value="[]", inplace=True)
df["High_Res_Image_Url"].fillna(value="[]", inplace=True)
df["Genre"].fillna(value="[]", inplace=True)
df["Publishers"].fillna(value="", inplace=True)
df["Description"].fillna(value="", inplace=True)
df["Publication Date"].fillna(value="", inplace=True)
df["Average Rating"].fillna(value="", inplace=True)
df["Number of Pages"].fillna(value="", inplace=True)
df["ISBN 13"].fillna(value="", inplace=True)

In [127]:
df["ISBN 13"]= df["ISBN 13"].apply(lambda x: int(x) if x != "" else '')

In [145]:
def get_date_format(value):
    try:
        date = pd.to_datetime(value)
        return date.strftime("%B %d, %Y")
    except:
        return value

In [146]:
get_date_format(df["Publication Date"][10])

'June 07, 2012'

In [147]:
df["Publication Date"] = df["Publication Date"].apply(get_date_format)

In [149]:
df.to_csv("FINAL_DF_WITH_EVERYTHING.csv", index=False)

# Creating the JSON

In [150]:
sample = {
        "name": "Rhythm of War",
        "author": "Brandon Sanderson",
        "genre": [
            "Fantasy",
            "Epic Fantasy"
        ],
        "language": "English",
        "numPages": 1220,
        "description": "After forming a coalition of human resistance against the enemy invasion, Dalinar Kholin and his Knights Radiant have spent a year fighting a protracted, brutal war. Neither side has gained an advantage, and the threat of a betrayal by Dalinar's crafty ally Taravangian looms over every strategic move./n Now, as new technological discoveries by Navani Kholin's scholars begin to change the face of the war, the enemy prepares a bold and dangerous operation. The arms race that follows will challenge the very core of the Radiant ideals, and potentially reveal the secrets of the ancient tower that was once the heart of their strength./n At the same time that Kaladin Stormblessed must come to grips with his changing role within the Knights Radiant, his Windrunners face their own problem: As more and more deadly enemy Fused awaken to wage war, no more honorspren are willing to bond with humans to increase the number of Radiants. Adolin and Shallan must lead the coalition’s envoy to the honorspren stronghold of Lasting Integrity and either convince the spren to join the cause against the evil god Odium, or personally face the storm of failure.",
        "yearPublished": 2020,
        "series": "Stormlight Archive",
        "ISBN": 9780765326386,
        "avgRating": 4.62,
        "publisher": "Tor Books",
        "imgUrlsLow": [
            "https://images-na.ssl-images-amazon.com/images/I/51-X-Q-X-QL._SX331_BO1,204,203,200_.jpg"
        ],
        "imgUrlsHigh": [
            "/images/400/Rhythm  400px.png",
            "/images/400/Jane Eyre.jpg",
            "/images/400/Harry Potter and the Prisoner Azkaban 400px.png"
        ]
    },

In [105]:
df.columns

Index(['Author', 'ISBN 13', 'ISBN', 'Number of Pages', 'Average Rating',
       'Publication Date', 'Name', 'Series', 'Volume', 'Goodreads Url',
       'Description', 'Genre', 'Image Url', 'Low Links', 'High Links',
       'Publishers', 'Low_Res_Image_Url', 'High_Res_Image_Url'],
      dtype='object')

In [153]:
the_json = []
for i in tqdm.tqdm(range(df.shape[0])):
    name = df.iloc[i]["Name"]
    author = df.iloc[i]["Author"]
    genre = df.iloc[i]["Genre"]
    language = "English"
    numPages = df.iloc[i]["Number of Pages"]
    description = df.iloc[i]["Description"]
    yearPublished = df.iloc[i]["Publication Date"]
    series = df.iloc[i]["Series"]
    ISBN = df.iloc[i]["ISBN 13"]
    avgRating = df.iloc[i]["Average Rating"]
    publisher = df.iloc[i]["Publishers"]
    imgUrlsLow = df.iloc[i]["Low_Res_Image_Url"]
    imgUrlsHigh = df.iloc[i]["High_Res_Image_Url"]
    temp_dict = {
        "name": name,
        "author": author,
        "genre": genre,
        "language": language,
        "numPages": numPages,
        "description": description,
        "yearPublished": yearPublished,
        "series": series,
        "ISBN": ISBN,
        "avgRating": avgRating,
        "publisher": publisher,
        "imgUrlsLow": imgUrlsLow,
        "imgUrlsHigh": imgUrlsHigh
    }
    the_json.append(temp_dict)

100%|██████████| 450/450 [00:04<00:00, 102.52it/s]


In [155]:
the_json[100]

{'name': 'Aurora Burning',
 'author': 'Amie Kaufman',
 'genre': "['Science Fiction', 'Young Adult', 'Fantasy', 'Romance', 'Audiobook']",
 'language': 'English',
 'numPages': '495',
 'description': 'Our heroes are back… kind of. From the bestselling co-authors of the Illuminae Files comes the second book in the epic series about a squad of misfits, losers, and discipline cases who just might be the galaxy’s best hope for survival.\nFirst, the bad news: an ancient evil—you know, your standard consume-all-life-in-the-galaxy deal—is about to be unleashed. The good news? Squad 312 is standing by to save the day. They’ve just got to take care of a few small distractions first.\nLike the clan of gremps who’d like to rearrange their favorite faces.\nAnd the cadre of illegit GIA agents with creepy flowers where their eyes used to be, who’ll stop at nothing to get their hands on Auri.\nThen there’s Kal’s long-lost sister, who’s not exactly happy to see her baby brother, and has a Syldrathi army 

In [157]:
json_file = open("FINAL_INFO_WITH_EVERYTHING.json", "w")
json_file.write(json.dumps(the_json))
json_file.close()