In [12]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time 
from datetime import date
from s3fs import S3FileSystem

In [13]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        price = soup.find("span", attrs={'id':'priceblock_ourprice'}).string.strip()

    except AttributeError:

        try:
            # If there is some deal price
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()

        except:
            price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available



In [14]:



# add user agent 
HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

# The webpage URL
URL = "https://www.amazon.com.au/s?k=playstation+5&crid=GXDJHW274LY0&sprefix=play%2Caps%2C268&ref=nb_sb_ss_ts-doa-p_1_4"

# HTTP Request
webpage = requests.get(URL, headers=HEADERS)

# Soup Object containing all data
soup = BeautifulSoup(webpage.content, "html.parser")

# Fetch links as List of Tag Objects
links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

# Store the links
links_list = []

# Loop for extracting links from Tag Objects
for link in links:
        links_list.append(link.get('href'))

d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[]}

# Loop for extracting product details from each link 
for link in links_list:
    time.sleep(0.45)
    new_webpage = requests.get("https://www.amazon.com.au" + link, headers=HEADERS)

    new_soup = BeautifulSoup(new_webpage.content, "html.parser")
    

    # Function calls to display all necessary product information
    d['title'].append(get_title(new_soup))
    d['price'].append(get_price(new_soup))
    d['rating'].append(get_rating(new_soup))
    d['reviews'].append(get_review_count(new_soup))
    d['availability'].append(get_availability(new_soup))
    


amazon_df = pd.DataFrame.from_dict(d)
amazon_df['title'].replace('', np.nan, inplace=True)
amazon_df = amazon_df.dropna(subset=['title'])
    

In [15]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability
0,Sonic Origins Plus Day 1 - PlayStation 5,,Previous page,,"This item will be released on June 23, 2023."
1,Final Fantasy XVI - PlayStation 5,,Previous page,,"This item will be released on June 22, 2023."
2,Dead Island 2 Day One Edition - PlayStation 5,,4.7 out of 5 stars,18 ratings,In stock
3,Diablo IV - PlayStation 5,,4.7 out of 5 stars,,"This item will be released on June 6, 2023."
4,PlayStation 5 Console - God of War Ragnarok Bu...,,4.6 out of 5 stars,246 ratings,Only 2 left in stock
5,PlayStation 5 Console,,4.7 out of 5 stars,"1,241 ratings",In stock
6,PlayStation 5 Digital Edition Console,,4.7 out of 5 stars,311 ratings,In stock
7,PlayStation 5 Console - Two DualSense Wireless...,,4.7 out of 5 stars,5 ratings,In stock
8,PlayStation 5 Console - Horizon Forbidden West...,,4.8 out of 5 stars,312 ratings,Only 5 left in stock
9,PlayStation 5 Digital Edition Console,,4.7 out of 5 stars,"3,424 ratings",


In [16]:

date = date.today()

s3 = S3FileSystem(anon = False)
with s3.open("s3://8893-leo/{}/amazon_playstation5.csv".format(date), 'w') as file:
        file.write(amazon_df.to_csv(header=True, index=False))

    