# Gathering Data:

- Using the Best Buy API, a function was developed to acquire any information about a desired product category
- There is a limit of 5 calls per second and 50,000 calls per day
- Product information is returned as a dataframe where rows = products and columns = attributes

## Additional Information:

- Best Buy API: https://developer.bestbuy.com/
- instructions: https://bestbuyapis.github.io/api-documentation/?shell#getting-started

In [1]:
#import libraries
import urllib.request, urllib.parse, urllib.error
import json
import ssl
import pandas as pd
import numpy as np
import time

In [2]:
def _fetch_bestbuy_page(category, api_key, active, manufacturer, show, sort, page_size, cursor_mark):
    """
    Request a single page of product results from the Best Buy products endpoint.

    Every argument is inserted into the URL exactly as given, so callers must pass URL-safe values.

    :return: the JSON response body decoded into Python objects.
    """
    serviceurl = (f'https://api.bestbuy.com/v1/products(categoryPath.name={category}&active={active}'
                  f'&manufacturer={manufacturer})?format=json&show={show}&sort={sort}&pageSize={page_size}'
                  f'&cursorMark={cursor_mark}&apiKey={api_key}')
    #The context manager closes the HTTP response even if reading or decoding fails
    with urllib.request.urlopen(serviceurl) as uh:
        return json.loads(uh.read().decode())


def query_bestbuy(category, api_key, active = "*", manufacturer = "*", show = "all", sort = "salePrice.dsc",
                  page_size = "100", next_page ="*", pages = 1):
    """
    query_bestbuy extracts information about a particular product category from the bestbuy website.

    Parameters
    ----------
    :param category: Name of a particular product category. Must be a plural noun. It is URL-quoted internally.
    :param api_key: Unique user api key acquired from https://developer.bestbuy.com/
    :param active: Active and/or inactive products. By default set to "*" to show all.
    :param manufacturer: Manufacturer name for a product category. By default all manufacturers are shown.
    :param show: Control which attributes are returned. By default set to "all". It is URL-quoted internally.
    :param sort: Sort the results by one of the attributes. By default sorted by "salePrice" in descending order.
    :param page_size: Control the number of product entries per page. By default 100 entries per page are returned.
    :param next_page: A unique "CursorMark" is provided for each product page. To walk through each page, next_page
                      must be set to "*" and then it is iteratively updated to the next "CursorMark".
    :param pages: Number of pages requested. By default this is set to 1.

    Note: active, manufacturer, sort, page_size, next_page and api_key are placed in the URL as given
    (only category and show are quoted here), so they must already be URL-safe.

    Returns
    -------
    :return: pandas dataframe, where rows = product and columns = attributes.
             If the first request fails with HTTP status 403, 400 or 404, a string describing the problem
             is returned instead of a dataframe (kept for backward compatibility).

    Raises
    ------
    :raises ValueError: if the category has no entries, or if pages is below 1 or above the number of
                        available pages.
    :raises urllib.error.HTTPError: if the first request fails with any status other than 403, 400 or 404,
                                    or if a request for a later page fails.
    """
    #Keep the readable category name for the progress messages; the quoted form is only for the URL
    category_label = category

    #Ensure the input names are URL suitable (quoting special characters and appropriately encoding non-ASCII text)
    category = urllib.parse.quote(category)
    show = urllib.parse.quote(show)

    #Using an API key and customizable parameters, open the best buy URL and load the accompanying json data
    try:
        js_pg1 = _fetch_bestbuy_page(category, api_key, active, manufacturer, show, sort, page_size, next_page)

    #If issues arise with the api key,information is missing or an item can't be found then a message is returned
    except urllib.error.HTTPError as e:
        if e.code == 403:
            return("The API key is not valid, or the allocated call limit has been exceeded.")
        elif e.code == 400:
            return("The request is missing key information or is malformed.")
        elif e.code == 404:
            return("The requested item cannot be found.")
        #Any other status is re-raised rather than falling through with no page data loaded
        raise

    total_pages = js_pg1['totalPages']

    #If a particular category doesn't have any entries then an error is thrown
    if total_pages == 0:
        raise ValueError("No information is available for this category")

    #The requested page count must lie between 1 and the number of available pages (negatives are rejected too)
    if pages < 1 or pages > total_pages:
        raise ValueError('The number of pages requested must be between 1 and %d' %(total_pages))

    #The label used in the progress messages does not change between pages
    manufacturer_name = "all" if manufacturer == "*" else manufacturer

    #Collect one single-row dataframe per product and concatenate once at the end; concatenating
    #inside the loop would re-copy the accumulated frame for every product
    frames = []
    js_page = js_pg1
    for page_number in range(1, total_pages + 1):

        #The first page is already loaded; later pages need a new request using the current "CursorMark".
        #Waiting 0.2 s between requests keeps the call rate at or below 5 calls per second.
        if page_number > 1:
            time.sleep(0.2)
            js_page = _fetch_bestbuy_page(category, api_key, active, manufacturer, show, sort,
                                          page_size, next_page)

        frames.extend(pd.DataFrame([product]) for product in js_page['products'])

        #For each iteration update the user on the current page, how many pages are left to query and which
        #information is being acquired about a specific manufacturer and product category.
        print("Obtaining information about {} {} on page {}. \nThere a total of {} pages left."
              .format(manufacturer_name,category_label,page_number,total_pages-page_number))

        #Stop acquiring data at a user specified page number.
        if page_number >= pages:
            break

        #Another page will be requested, so save its unique "CursorMark" in URL-safe form
        next_page = urllib.parse.quote(js_page['nextCursorMark'])

    #Rows = products, columns = union of all attributes seen; an empty dataframe if no products were returned
    output = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return(output)

### Example 1: Obtaining information about the top 50 least expensive Samsung monitors.

In [3]:
# Request one page (the default) of 50 Samsung monitors, sorted by ascending sale price.
df_SamsungMonitors = query_bestbuy(
    category="Monitors",
    api_key='#############',
    manufacturer="Samsung",
    sort="salePrice.asc",
    page_size=50,
)

# List every attribute (column) that came back for this category.
print(f"\nAttributes (columns) provided by Best Buy:\n{list(df_SamsungMonitors.columns)}")

Obtaining information about Samsung Monitors on page 1. 
There a total of 5 pages left.

Attributes (columns) provided by Best Buy:


In [4]:
# Show only the name, price, review and description columns for every returned monitor.
df_SamsungMonitors[
    ["name", "salePrice", "customerReviewCount", "customerReviewAverage", "longDescription"]
]

Unnamed: 0,name,salePrice,customerReviewCount,customerReviewAverage,longDescription
0,Samsung - Geek Squad Certified Refurbished 23....,71.99,1.0,5.0,Geek Squad&#174; Certified Refurbished product...
1,Samsung - SE65 Series LS24E65KPLH/GO 23.6” LED...,79.99,108.0,4.6,Our SE650 Series provides an environmentally f...
2,"Samsung - SF350 Series S19F350HNN 19"" LED HD M...",89.99,2.0,4.0,Display content efficiently with this 19-inch ...
3,Samsung - Refurbished TE310 Series T24E310ND 2...,89.99,1.0,4.0,Access online and broadcast entertainment with...
4,"Samsung - 18.5"" LCD Monitor",99.99,1.0,5.0,Perfect High Speed Performance With a respons...
5,"Samsung - 24"" LED FHD AMD FreeSync Monitor wit...",99.99,13.0,4.5,The SR35 FHD monitor has a 3-sided bezel-less ...
6,"Samsung - Geek Squad Certified Refurbished 24""...",107.99,2.0,4.5,Geek Squad&#174; Certified Refurbished product...
7,"Samsung - 21.5"" LED HD Monitor - Glossy Black",109.99,1038.0,4.6,"Utilizing Samsung MagicAngle, MagicBright and ..."
8,"Samsung - 19.5"" LED HD Monitor - Black",111.99,88.0,4.6,"This Samsung S20D300H 19.5"" LED HD monitor fea..."
9,"Samsung - 21.5"" LED HD Monitor - Red",118.99,605.0,4.6,With 250 cd/m&#178; brightness and support for...


### Example2: Obtaining specific information about the 5 most reviewed Sony headphones.

In [5]:
# Request one page (the default) of 5 Sony headphones, sorted by descending review count,
# restricting the returned attributes through `show`.
df_SonyHeadphones = query_bestbuy(
    category="Headphones",
    api_key='#############',
    manufacturer="Sony",
    sort="customerReviewCount.dsc",
    page_size=5,
    show="name,salePrice,customerReviewCount,customerReviewAverage,longDescription",
)

df_SonyHeadphones

Obtaining information about Sony Headphones on page 1. 
There a total of 84 pages left.


Unnamed: 0,name,salePrice,customerReviewCount,customerReviewAverage,longDescription
0,Sony - ZX Series Wired On-Ear Headphones - Black,12.99,14346,4.5,These Sony ZX Series MDRZX110/BLK headphones f...
1,Sony - ZX Series Wired On-Ear Headphones - White,12.99,14346,4.5,These Sony ZX Series MDRZX110/WHI headphones f...
2,Sony - MDRXB50 Wired Earbud Headphones - Black,49.99,6918,4.5,"Enjoy crisp, powerful low-end audio with these..."
3,Sony - MDRXB50 Wired Earbud Headphones - Blue,49.99,6918,4.5,"Enjoy crisp, powerful low-end audio with these..."
4,Sony - EX14AP Wired Earbud Headphones - Black,7.49,6440,4.4,Enjoy music on your run with these Sony in-ear...


### Example 3: Obtain all information about headphones, which will be sorted by price. Let's first test how the function responds to an incorrect page number argument.

In [6]:
# pages = 0 is outside the accepted range, so this call is expected to raise a ValueError.
df_Headphones = query_bestbuy(
    category="Headphones",
    api_key='#############',
    sort="salePrice.dsc",
    pages=0,
    page_size=100,
)

ValueError: The number of pages requested must be between 1 and 89

In [7]:
# Request 89 pages of 100 headphones each, sorted by descending sale price.
df_Headphones = query_bestbuy(
    category="Headphones",
    api_key='#############',
    sort="salePrice.dsc",
    pages=89,
    page_size=100,
)
df_Headphones

Obtaining information about all Headphones on page 1. 
There a total of 88 pages left.
Obtaining information about all Headphones on page 2. 
There a total of 87 pages left.
Obtaining information about all Headphones on page 3. 
There a total of 86 pages left.
Obtaining information about all Headphones on page 4. 
There a total of 85 pages left.
Obtaining information about all Headphones on page 5. 
There a total of 84 pages left.
Obtaining information about all Headphones on page 6. 
There a total of 83 pages left.
Obtaining information about all Headphones on page 7. 
There a total of 82 pages left.
Obtaining information about all Headphones on page 8. 
There a total of 81 pages left.
Obtaining information about all Headphones on page 9. 
There a total of 80 pages left.
Obtaining information about all Headphones on page 10. 
There a total of 79 pages left.
Obtaining information about all Headphones on page 11. 
There a total of 78 pages left.
Obtaining information about all Headphone

Unnamed: 0,sku,score,productId,name,source,type,startDate,new,active,lowPriceGuarantee,...,haulawayAvailable,proposition65WarningMessage,proposition65WarningType,collection,powerSource,totalHarmonicDistortion,multiroomCapability,numberOfSpeakers,numberOfChannels,headphoneJacks
0,6454189,,,Shure - KSE1500 Electrostatic Earphones System...,,HardGood,2021-03-08,False,False,True,...,,,04,Magnolia Home Theater,,,,,,
1,6227404,,,Pioneer - SE MASTER1 Wired Over-the-Ear Headph...,,HardGood,2018-04-11,False,False,True,...,,,01,Magnolia Home Theater,,,,,,
2,6135450,,,Sennheiser - HDV 820 Digital Headphones Amplif...,,HardGood,2017-11-15,False,False,True,...,,,04,Magnolia Home Theater,Plug-in,,,,,
3,8862206,,,Sennheiser - Digital Headphone Amplifier - Silver,,HardGood,2013-05-21,False,False,True,...,,,04,Magnolia Home Theater,Plug-in,,,,,
4,6442577,,,Sennheiser - HD 820 Over-the-Ear Audiophile He...,,HardGood,2020-12-01,False,True,True,...,,This product can expose you to chemicals inclu...,04,Magnolia Home Theater,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8881,8416169,,,"Rocketfish™ - Earbud Headphones - Gray, Clear",,HardGood,2013-07-07,False,False,False,...,,,05,,,,,,,
8882,8416178,,,Rocketfish™ - Earbud Headphones - Black,,HardGood,2013-07-07,False,False,False,...,,,05,,,,,,,
8883,8416211,,,Rocketfish™ - Earbud Headphones - White,,HardGood,2013-07-07,False,False,False,...,,,05,,,,,,,
8884,8605253,,,JVC - Gumy Earphone - White,,HardGood,2008-09-14,False,False,False,...,,,05,,,,,,,


In [8]:
# Write the headphone data to a comma-separated file at a fixed absolute path.
df_Headphones.to_csv(
    '/home/mslobody/Desktop/Best_buy_Project/Data/Headphones.csv',
    sep=',',
)