### Extract data from Kaggle's file 'nyt2.json'

In [1]:
# Import dependencies needed
import json
import pandas as pd
from pprint import pprint

In [2]:
# Read the line-delimited JSON file 'nyt2.json' into a DataFrame.
raw_nyt = pd.read_json(
    'Resources/nyt2.json',
    orient='columns',
    lines=True,
)
# Preview with raw_nyt.head() if needed.

In [3]:
# Write 'raw_nyt' out as a records-oriented JSON file ('output.json'), then
# parse that file back with the json module so 'data' holds plain Python
# objects we can iterate over.
raw_nyt.to_json('output.json', orient='records')
with open('output.json') as file:
    data = json.loads(file.read())
# Inspect with pprint(data) if needed.

In [4]:
# Set up lists to hold response info (one entry per record in 'data'):

nyt_ids = []
urls = []
authors = []
bestsellers_dates = []
descriptions = []
prices = []
published_dates = []
publishers = []
ranks = []
ranks_last_week = []
titles = []
weeks_on_lists = []

# Populate the lists. Numeric and date values are nested under type-tag keys
# ('$oid', '$date', '$numberLong', '$numberInt', ...) and are unwrapped here.

for item in data:
    nyt_ids.append(item['_id']['$oid'])
    urls.append(item['amazon_product_url'])
    authors.append(item['author'])
    bestsellers_dates.append(item['bestsellers_date']['$date']['$numberLong'])
    descriptions.append(item['description'])
    published_dates.append(item['published_date']['$date']['$numberLong'])
    publishers.append(item['publisher'])
    ranks.append(item['rank']['$numberInt'])
    ranks_last_week.append(item['rank_last_week']['$numberInt'])
    titles.append(item['title'])
    weeks_on_lists.append(item['weeks_on_list']['$numberInt'])
    # The price is stored under a single key whose name varies, so look up
    # the key name before extracting the price string. The one-element
    # unpacking raises ValueError if there is not exactly one key.
    price_field = item['price']
    price_key, = price_field.keys()
    if price_key in ('$numberInt', '$numberDouble'):
        prices.append(price_field[price_key])
    else:
        # Unknown price encoding: still append a placeholder so 'prices'
        # stays the same length as the other lists; otherwise the DataFrame
        # built from these lists would fail on mismatched column lengths.
        prices.append(None)

In [5]:
# Assemble the per-field lists into a DataFrame.
# Keyword order below determines the column order.

bestsellers_dict = dict(
    nyt_id=nyt_ids,
    title=titles,
    author=authors,
    url=urls,
    publisher=publishers,
    description=descriptions,
    list_price=prices,
    published_date=published_dates,
    bestseller_date=bestsellers_dates,
    rank=ranks,
    rank_last_week=ranks_last_week,
    weeks_on_list=weeks_on_lists,
)
bestsellers_data = pd.DataFrame(data=bestsellers_dict)
# Preview with bestsellers_data.head() if needed.

### Extract info from Amazon

In this step, for each unique URL in our bestsellers dataframe, we collect the Amazon listing price, the number of customer reviews, the average star rating (out of 5), and the thumbnail image of the book cover.

In [6]:
# Import dependencies needed
import requests
from bs4 import BeautifulSoup
# HTTP headers sent with every request: a desktop-browser User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.119 Safari/537.36"
    )
}


In [7]:
# De-duplicate the Amazon URLs while keeping their first-seen order
# (dict keys are unique and preserve insertion order).
amazon_urls = [*dict.fromkeys(urls)]
len(amazon_urls)

2329

In [8]:
# Accumulators for the scraped info; each is a separate, initially empty list.
# Note: re-running this cell discards anything collected so far.

reviews, ratings, offers, img_urls = [], [], [], []

In [9]:
def get_nb_reviews(x_url):
    """Scrape the customer-review count text for one product page.

    Does nothing if `x_url` already has an entry in the module-level `reviews`
    list. Otherwise the page is downloaded with the shared `headers`, the text
    of the first <span id="acrCustomerReviewText"> element is read, and the
    tuple `(x_url, text)` is appended to `reviews`.

    Args:
        x_url: URL of the product page to scrape.

    Raises:
        IndexError: if the page contains no matching span.
    """
    if x_url not in [seen_url for (seen_url, _) in reviews]:
        response = requests.get(x_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        review = soup.find_all('span', id="acrCustomerReviewText")[0].text
        # Key the result by the function argument rather than by whatever a
        # global named `url` happens to hold at call time.
        reviews.append((x_url, review))

def get_rating(x_url):
    """Scrape the star-rating text for one product page.

    Does nothing if `x_url` already has an entry in the module-level `ratings`
    list. Otherwise the page is downloaded with the shared `headers`, the text
    of the first <span class="a-icon-alt"> element is read, and the tuple
    `(x_url, text)` is appended to `ratings`.

    Args:
        x_url: URL of the product page to scrape.

    Raises:
        AttributeError: if no matching span is found (`find` returns None).
    """
    if x_url not in [seen_url for (seen_url, _) in ratings]:
        response = requests.get(x_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        rating = soup.find('span', class_="a-icon-alt").text
        # Key the result by the function argument rather than by whatever a
        # global named `url` happens to hold at call time.
        ratings.append((x_url, rating))

def get_offer(x_url):
    """Scrape the offer-price text for one product page.

    Does nothing if `x_url` already has an entry in the module-level `offers`
    list. Otherwise the page is downloaded with the shared `headers`, the text
    of the first <span> carrying the offer-price CSS classes is read, and the
    tuple `(x_url, text)` is appended to `offers`.

    Args:
        x_url: URL of the product page to scrape.

    Raises:
        AttributeError: if no matching span is found (`find` returns None).
    """
    if x_url not in [seen_url for (seen_url, _) in offers]:
        response = requests.get(x_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        offer = soup.find('span', class_="a-size-medium a-color-price offer-price a-text-normal").text
        # Key the result by the function argument rather than by whatever a
        # global named `url` happens to hold at call time.
        offers.append((x_url, offer))

def get_img(x_url):
    """Scrape the cover-image attribute for one product page.

    Does nothing if `x_url` already has an entry in the module-level
    `img_urls` list. Otherwise the page is downloaded with the shared
    `headers`, the 'data-a-dynamic-image' attribute of the first <img> inside
    <div id="img-canvas"> is read, and the tuple `(x_url, value)` is appended
    to `img_urls`.

    Args:
        x_url: URL of the product page to scrape.

    Raises:
        AttributeError: if the 'img-canvas' div is not found.
        TypeError: if the div contains no <img> element.
        KeyError: if the <img> lacks the 'data-a-dynamic-image' attribute.
    """
    if x_url not in [seen_url for (seen_url, _) in img_urls]:
        response = requests.get(x_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        img_url = soup.find('div', id='img-canvas').find('img')['data-a-dynamic-image']
        # Key the result by the function argument rather than by whatever a
        # global named `url` happens to hold at call time.
        img_urls.append((x_url, img_url))

In [10]:
# Start the scrape with every unique Amazon URL; the scraping cell rebinds
# 'search_urls' to the URLs that failed, so it can be re-run to retry them.
search_urls = amazon_urls

In [228]:
# Extract info from Amazon using requests.
# Every URL for which any of the four scrapers fails is collected in
# 'missing_urls', which then becomes 'search_urls', so re-running this cell
# retries only the failures.

missing_urls = []
for url in search_urls:
    try:
        get_img(url)
        get_offer(url)
        get_rating(url)
        get_nb_reviews(url)
        print(url)
    except (
        AttributeError,
        RuntimeError,
        TypeError,
        NameError,
        IndexError,
        # A tag that lacks the expected attribute raises KeyError.
        KeyError,
        # Network problems (connection errors, timeouts, bad URLs, ...) should
        # mark the URL for retry instead of aborting the whole run.
        requests.exceptions.RequestException,
    ):
        print("error")
        missing_urls.append(url)
search_urls = missing_urls


http://www.amazon.com/Odd-Hours-Dean-Koontz/dp/0553807056?tag=NYTBS-20
http://www.amazon.com/The-Host-Novel-Stephenie-Meyer/dp/0316218502?tag=NYTBS-20
http://www.amazon.com/Love-Youre-With-Emily-Giffin/dp/0312348665?tag=NYTBS-20
error
http://www.amazon.com/Snuff-Chuck-Palahniuk/dp/0385517882?tag=NYTBS-20
http://www.amazon.com/Sundays-at-Tiffanys-James-Patterson/dp/0446199443?tag=NYTBS-20
http://www.amazon.com/Phantom-Prey-John-Sandford/dp/0425227987?tag=NYTBS-20
http://www.amazon.com/From-Worse-Southern-Vampire-Mysteries/dp/0441015891?tag=NYTBS-20
http://www.amazon.com/Where-Are-You-Now-Novel/dp/1416566384?tag=NYTBS-20
http://www.amazon.com/The-Whole-Truth-David-Baldacci/dp/0446539686?tag=NYTBS-20
http://www.amazon.com/Careless-Red-Novel-Elizabeth-George/dp/0061160873?tag=NYTBS-20
http://www.amazon.com/Unaccustomed-Earth-Jhumpa-Lahiri/dp/0307265730?tag=NYTBS-20
error
http://www.amazon.com/Twenty-Wishes-Blossom-Street-Books/dp/0778326314?tag=NYTBS-20
http://www.amazon.com/Hold-Tight-Har

error
error
http://www.amazon.com/A-Good-Woman-Novel/dp/0440243300?tag=NYTBS-20
http://www.amazon.com/Mckettrick-Christmas-Linda-Lael-Miller/dp/0373773021?tag=NYTBS-20
http://www.amazon.com/Divine-Justice-Camel-David-Baldacci/dp/0446544884?tag=NYTBS-20
http://www.amazon.com/Salvation-Death-J-D-Robb/dp/0399155228?tag=NYTBS-20
http://www.amazon.com/Swallowing-Darkness-Novel-Merry-Gentry/dp/0345495942?tag=NYTBS-20
http://www.amazon.com/Midnight-A-Gangster-Love-Story/dp/1416545360?tag=NYTBS-20
error
http://www.amazon.com/The-Christmas-Sweater-Picture-Book/dp/1416995439?tag=NYTBS-20
http://www.amazon.com/Just-After-Sunset-Stories-Collectors/dp/1439115303?tag=NYTBS-20
http://www.amazon.com/The-Hour-I-First-Believed/dp/0060393491?tag=NYTBS-20
http://www.amazon.com/A-Mercy-Toni-Morrison/dp/0307276767?tag=NYTBS-20
http://www.amazon.com/The-Bodies-Left-Behind-Novel/dp/B002BWQ4MI?tag=NYTBS-20
http://www.amazon.com/Ender-Exile-The-Quintet/dp/0765304961?tag=NYTBS-20
http://www.amazon.com/Cross-Coun

error
http://www.amazon.com/Brimstone-Robert-B-Parker/dp/0399155716?tag=NYTBS-20
http://www.amazon.com/Brooklyn-A-Novel-Colm-Toibin/dp/1439138311?tag=NYTBS-20
http://www.amazon.com/Wicked-Prey-Lucas-Davenport-No/dp/0425234606?tag=NYTBS-20
http://www.amazon.com/Cemetery-Dance-Douglas-Preston/dp/0446580295?tag=NYTBS-20
error
error
http://www.amazon.com/Road-Dogs-Novel-Elmore-Leonard/dp/0061733148?tag=NYTBS-20
http://www.amazon.com/The-Selected-Works-T-Spivet/dp/1594202176?tag=NYTBS-20
http://www.amazon.com/Gone-Tomorrow-Jack-Reacher-13/dp/0440243688?tag=NYTBS-20
http://www.amazon.com/The-Sign-Raymond-Khoury/dp/0525950974?tag=NYTBS-20
error
http://www.amazon.com/The-Scarecrow-Michael-Connelly/dp/0316166308?tag=NYTBS-20
http://www.amazon.com/Shanghai-Girls-Novel-Lisa-See/dp/0812980530?tag=NYTBS-20
http://www.amazon.com/Heartless-Diana-Palmer/dp/0373773781?tag=NYTBS-20
http://www.amazon.com/Trade-Anita-Blake-Vampire-Hunter/dp/0515148059?tag=NYTBS-20
http://www.amazon.com/Medusa-Kurt-Austin-

error
http://www.amazon.com/Ice-A-Novel-Linda-Howard/dp/0345517202?tag=NYTBS-20
http://www.amazon.com/New-York-Novel-Edward-Rutherfurd/dp/0345497422?tag=NYTBS-20
http://www.amazon.com/Wishin-Hopin-Novel-Wally-Lamb/dp/0061941018?tag=NYTBS-20
error
http://www.amazon.com/The-Wrecker-Isaac-Bell-Adventure/dp/0399155996?tag=NYTBS-20
http://www.amazon.com/Pirate-Latitudes-Novel-Michael-Crichton/dp/0061929379?tag=NYTBS-20
error
http://www.amazon.com/First-Lords-Fury-Codex-Alera/dp/0441019625?tag=NYTBS-20
http://www.amazon.com/U-Undertow-Kinsey-Millhone-Mystery/dp/039915597X?tag=NYTBS-20
http://www.amazon.com/The-Paris-Vendetta-A-Novel/dp/0345505476?tag=NYTBS-20
http://www.amazon.com/Divine-Misdemeanors-Novel-Meredith-Gentry/dp/0345495969?tag=NYTBS-20
http://www.amazon.com/Dynasty-Evil-Darth-Novel-Republic/dp/0345511565?tag=NYTBS-20
http://www.amazon.com/Sizzle-A-Novel-Julie-Garwood/dp/0345500784?tag=NYTBS-20
http://www.amazon.com/Fired-Up-Dreamlight-Trilogy-Society/dp/0399155961?tag=NYTBS-20
h

KeyboardInterrupt: 

In [250]:
# Number of (url, review text) entries collected so far
len(reviews)

1641

In [251]:
# Number of (url, rating text) entries collected so far
len(ratings)

1615

In [252]:
# Number of (url, offer price text) entries collected so far
len(offers)

1237

In [253]:
# Number of (url, image attribute) entries collected so far
len(img_urls)

1307

In [256]:
# Build the reviews DataFrame: one row per scraped entry. Only the first
# whitespace-separated token of the review text is kept, with commas removed.
r_keys = [entry[0] for entry in reviews]
r_values = [entry[1].split()[0].replace(",", "") for entry in reviews]
reviews_dict = dict(url=r_keys, nb_reviews=r_values)
reviews_df = pd.DataFrame(reviews_dict)
reviews_df

Unnamed: 0,url,nb_reviews
0,http://www.amazon.com/Odd-Hours-Dean-Koontz/dp...,920
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,702
2,http://www.amazon.com/Unaccustomed-Earth-Jhump...,521
3,http://www.amazon.com/Unaccustomed-Vintage-Con...,521
4,http://www.amazon.com/The-Beach-House-A-Novel/...,521
5,http://www.amazon.com/Queen-Babble-Gets-Hitche...,78
6,http://www.amazon.com/Silent-Thunder-Iris-Joha...,80
7,http://www.amazon.com/Say-Goodbye-FBI-Profiler...,368
8,http://www.amazon.com/Secret-Servant-Gabriel-A...,828
9,http://www.amazon.com/The-Lace-Reader-A-Novel/...,633


In [258]:
# Build the ratings DataFrame: one row per scraped entry. Only the first
# whitespace-separated token of the rating text is kept.
s_keys = [entry[0] for entry in ratings]
s_values = [entry[1].split()[0] for entry in ratings]
stars_dict = dict(url=s_keys, nb_stars=s_values)
stars_df = pd.DataFrame(stars_dict)
stars_df

Unnamed: 0,url,nb_stars
0,http://www.amazon.com/Odd-Hours-Dean-Koontz/dp...,4.4
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,4.0
2,http://www.amazon.com/Unaccustomed-Earth-Jhump...,4.4
3,http://www.amazon.com/Unaccustomed-Vintage-Con...,4.4
4,http://www.amazon.com/The-Beach-House-A-Novel/...,4.1
5,http://www.amazon.com/Queen-Babble-Gets-Hitche...,3.5
6,http://www.amazon.com/Silent-Thunder-Iris-Joha...,4.1
7,http://www.amazon.com/Say-Goodbye-FBI-Profiler...,4.2
8,http://www.amazon.com/Secret-Servant-Gabriel-A...,4.7
9,http://www.amazon.com/The-Lace-Reader-A-Novel/...,4.0


In [259]:
# Build the Amazon price DataFrame: one row per scraped entry, with the
# price text stored exactly as scraped.
o_keys = [entry[0] for entry in offers]
o_values = [entry[1] for entry in offers]
offers_dict = dict(url=o_keys, amazon_price=o_values)
offers_df = pd.DataFrame(offers_dict)
offers_df

Unnamed: 0,url,amazon_price
0,http://www.amazon.com/Odd-Hours-Dean-Koontz/dp...,$12.93
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,$9.91
2,http://www.amazon.com/Unaccustomed-Earth-Jhump...,$14.99
3,http://www.amazon.com/Unaccustomed-Vintage-Con...,$13.38
4,http://www.amazon.com/The-Beach-House-A-Novel/...,$10.94
5,http://www.amazon.com/Queen-Babble-Gets-Hitche...,$8.88
6,http://www.amazon.com/Say-Goodbye-FBI-Profiler...,$7.99
7,http://www.amazon.com/Secret-Servant-Gabriel-A...,$7.99
8,http://www.amazon.com/The-Lace-Reader-A-Novel/...,$12.04
9,http://www.amazon.com/Paul-Dune-Brian-Herbert/...,$28.90


In [262]:
# Build the thumbnail DataFrame: one row per scraped entry, with the image
# attribute value stored exactly as scraped.
i_keys = [entry[0] for entry in img_urls]
i_values = [entry[1] for entry in img_urls]
img_urls_dict = dict(url=i_keys, img_url=i_values)
img_urls_df = pd.DataFrame(img_urls_dict)
img_urls_df.head()

Unnamed: 0,url,img_url
0,http://www.amazon.com/Odd-Hours-Dean-Koontz/dp...,"{""https://images-na.ssl-images-amazon.com/imag..."
1,http://www.amazon.com/Love-Youre-With-Emily-Gi...,"{""https://images-na.ssl-images-amazon.com/imag..."
2,http://www.amazon.com/Unaccustomed-Earth-Jhump...,"{""https://images-na.ssl-images-amazon.com/imag..."
3,http://www.amazon.com/Unaccustomed-Vintage-Con...,"{""https://images-na.ssl-images-amazon.com/imag..."
4,http://www.amazon.com/The-Beach-House-A-Novel/...,"{""https://images-na.ssl-images-amazon.com/imag..."


In [263]:
# Outer-join the four scraped tables on 'url' so that a URL is kept even when
# only some of its fields were scraped; count() then shows the non-null
# entries per column.
amazon_scrape_df = (
    reviews_df
    .merge(stars_df, how='outer', on='url', suffixes=('_x', '_y'))
    .merge(offers_df, how='outer', on='url', suffixes=('_x', '_y'))
    .merge(img_urls_df, how='outer', on='url', suffixes=('_x', '_y'))
)
amazon_scrape_df.count()

url             2046
nb_reviews      1641
nb_stars        1615
amazon_price    1237
img_url         1307
dtype: int64

In [264]:
# Write the merged scrape results to 'Output/more_scraped_data.csv'
# (the DataFrame index is written as the first column).

amazon_scrape_df.to_csv(path_or_buf='Output/more_scraped_data.csv')