### AMAZON WEB SCRAPING:


In [1]:
# Import Libraries

from bs4 import BeautifulSoup
import requests
import datetime
import time

import smtplib

import pandas as pd
import csv


##### Connect to the URL and extract data using URL, headers, page and BeautifulSoup

In [2]:
# Connect to the website and extract the product data from Amazon

# Product page to scrape. The literal has no surrounding spaces, so no stray
# whitespace can end up in the request.
URL = 'https://www.amazon.co.uk/Ladies-Winter-Coat-Black-Size/dp/B08KH4NGMM/ref=sr_1_5?crid=TUM9142G51HK&keywords=coat%2Bfor%2Bwomen&qid=1679713932&sprefix=coat%2Bfor%2B%2Caps%2C87&sr=8-5&th=1'

# Get headers from http://httpbin.org/get
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"}

# Without a timeout the cell can hang forever if the server never answers;
# raise_for_status() reports an HTTP error response here instead of letting it
# show up later as a confusing "NoneType has no attribute get_text" failure.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

Soup1 = BeautifulSoup(page.content, "html.parser")
Soup2 = BeautifulSoup(Soup1.prettify(), "html.parser")      # re-parse the prettified HTML for better formatting

# The title and brand elements carry an id; the price and rating spans have no
# id, so they are looked up by tag name and class instead.
title = Soup2.find(id='productTitle').get_text()
price = Soup2.find('span', attrs={'class': 'a-offscreen'}).get_text()
rating = Soup2.find('span', attrs={'class': 'a-icon-alt'}).get_text()
brand = Soup2.find(id='bylineInfo').get_text()

print('Title: ' + title)
print('Price: ' + price)
print('Rating: ' + rating)
print(brand)


Title: 
                Ladies Belted Long Coat Womens Girls Winter Hooded Warm Jacket UK
               
Price: 
                  £21.99
                 
Rating: 
                    3.4 out of 5 stars
                   

               Brand: Generic
              


In [3]:
# Clean and strip the extracted data

title = title.strip()
# Remove the leading currency symbol with lstrip() instead of slicing off the
# first character: slicing would eat a digit every time this cell is re-run
# ("21.99" -> "1.99"), whereas lstrip() is a no-op once the symbol is gone.
price = price.strip().lstrip('£$€')
rating = rating.strip()
brand = brand.strip()

print('Title: ' + title)
print('Price: ' + price)
print('Rating: ' + rating)
print(brand)


Title: Ladies Belted Long Coat Womens Girls Winter Hooded Warm Jacket UK
Price: 21.99
Rating: 3.4 out of 5 stars
Brand: Generic


In [4]:
# Record the calendar day on which the data was extracted and display it

date_extracted = datetime.date.today()
print(date_extracted.isoformat())      # same text as str(date_extracted): YYYY-MM-DD

2023-03-30


##### Automate data to csv

In [5]:
# Build the header row and the data row for the csv/excel export

header = [
    'Title',
    'Price',
    'Rating',
    'Brand',
    'Date',
]
data = [
    title,
    price,
    rating,
    brand,
    date_extracted,
]

# One-time creation of the csv (header + first row).
# Kept commented out so that re-running this cell doesn't overwrite the data
# already collected; writing is handled by the automation later on.
#
# with open('AWS_Dataset_Women_Coat.csv', 'w', newline='', encoding='UTF8') as fill:   # 'w' is write
#     writer = csv.writer(fill)
#     writer.writerow(header)
#     writer.writerow(data)

##### Retrieve csv file using Pandas

In [6]:
# Using pandas, load the scraped data back from the csv file

# Raw string: keeps the Windows backslashes literal instead of relying on
# '\D' and '\A' happening to be unrecognised escape sequences (which makes
# Python emit an invalid-escape warning). The resulting path is unchanged.
csv_df = pd.read_csv(r'C:\DataAnalytics\AWS_Dataset_Women_Coat.csv')

print(csv_df)

                                               Title  Price  \
0  Ladies Belted Long Coat Womens Girls Winter Ho...  21.99   

               Rating           Brand        Date  
0  3.4 out of 5 stars  Brand: Generic  2023-03-30  


##### Append data to the extracted csv

In [7]:
# 'a+' opens the csv for appending, so rows already in the file are kept
with open('AWS_Dataset_Women_Coat.csv', mode='a+', encoding='UTF8', newline='') as fill:
    writer = csv.writer(fill)
    # Only the data row is written; no header is needed when appending
    writer.writerows([data])

##### Automate the whole process so it runs over time without manual execution

In [14]:
# Keep everything inside the function now.

def scrape_data(url=None, csv_path='AWS_Dataset_Women_Coat.csv', timeout=30):
    """Scrape one Amazon product page and append the result to a CSV file.

    Downloads the page, reads the product title, price, rating and brand, and
    appends them together with today's date as one row of ``csv_path``. The
    header row is written only when the file is new or empty, so repeated
    (scheduled) runs build up a history instead of replacing the previous row.

    Parameters
    ----------
    url : str, optional
        Product page to scrape. Defaults to the coat listing used in this
        notebook.
    csv_path : str, optional
        CSV file the row is appended to.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot block the scheduler forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If one of the expected elements is missing from the page.
    """
    if url is None:
        url = 'https://www.amazon.co.uk/Ladies-Winter-Coat-Black-Size/dp/B08KH4NGMM/ref=sr_1_5?crid=TUM9142G51HK&keywords=coat%2Bfor%2Bwomen&qid=1679713932&sprefix=coat%2Bfor%2B%2Caps%2C87&sr=8-5&th=1'

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"}

    # strip() guards against stray spaces around a pasted URL
    page = requests.get(url.strip(), headers=headers, timeout=timeout)
    page.raise_for_status()

    Soup1 = BeautifulSoup(page.content, "html.parser")
    # Re-parse the prettified HTML, as the exploratory cells above do, so the
    # extracted text matches what was inspected there
    Soup2 = BeautifulSoup(Soup1.prettify(), "html.parser")

    title = _element_text(Soup2, 'product title', id='productTitle')
    price = _element_text(Soup2, 'price', 'span', attrs={'class': 'a-offscreen'})
    rating = _element_text(Soup2, 'rating', 'span', attrs={'class': 'a-icon-alt'})
    brand = _element_text(Soup2, 'brand', id='bylineInfo')

    price = price[1:]                   # drop the leading currency symbol, keep the numeric part
    date_extracted = datetime.date.today()

    header = ['Title', 'Price', 'Rating', 'Brand', 'Date']
    data = [title, price, rating, brand, date_extracted]

    # Append ('a') rather than write ('w'): 'w' truncated the file on every run,
    # so each scheduled scrape wiped the rows collected before it.
    with open(csv_path, 'a', newline='', encoding='UTF8') as csv_file:
        writer = csv.writer(csv_file)
        # In append mode the position starts at the end of the file, so 0
        # means the file is new or empty and still needs its header row.
        if csv_file.tell() == 0:
            writer.writerow(header)
        writer.writerow(data)


def _element_text(soup, label, *find_args, **find_kwargs):
    """Return the stripped text of the first element matching the find() arguments."""
    element = soup.find(*find_args, **find_kwargs)
    if element is None:
        # Fail with a readable message instead of "'NoneType' object has no
        # attribute 'get_text'" when the page lacks the expected element
        raise ValueError('Could not find the ' + label + ' on the page')
    return element.get_text().strip()
        


##### Add Timer to run once every 12hr using time library

In [None]:

# Run the scrape on a fixed schedule: once every 12 hours
SCRAPE_INTERVAL_SECONDS = 12 * 60 * 60      # 43200 seconds

try:
    while True:
        scrape_data()
        time.sleep(SCRAPE_INTERVAL_SECONDS)
except KeyboardInterrupt:
    # The loop never ends on its own, so the lines below were unreachable.
    # Catching the kernel interrupt lets the cell finish by showing the data
    # collected so far instead of stopping with a traceback.
    print('Scraping stopped.')

# Raw string keeps the Windows backslashes literal (no invalid-escape warning)
csv_df = pd.read_csv(r'C:\DataAnalytics\AWS_Dataset_Women_Coat.csv')
print(csv_df)