-
Notifications
You must be signed in to change notification settings - Fork 1
/
Amazon_review_scraper.py
95 lines (81 loc) · 3.42 KB
/
Amazon_review_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
# For ignoring SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url=input("Enter Amazon Product Url- ")
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
html = soup.prettify('utf-8')
product_json = {}
# This block of code will help extract the Brand of the item
for divs in soup.findAll('div', attrs={'class': 'a-box-group'}):
try:
product_json['brand'] = divs['data-brand']
break
except:
pass
# This block of code will help extract the Prodcut Title of the item
for spans in soup.findAll('span', attrs={'id': 'productTitle'}):
name_of_product = spans.text.strip()
product_json['name'] = name_of_product
break
# This block of code will help extract the price of the item in dollars
for divs in soup.findAll('div'):
try:
price = str(divs['data-asin-price'])
product_json['price'] = '$' + price
break
except:
pass
# This block of code will help extract the image of the item in dollars
for divs in soup.findAll('div', attrs={'id': 'rwImages_hidden'}):
for img_tag in divs.findAll('img', attrs={'style': 'display:none;'
}):
product_json['img-url'] = img_tag['src']
break
# This block of code will help extract the average star rating of the product
for i_tags in soup.findAll('i',
attrs={'data-hook': 'average-star-rating'}):
for spans in i_tags.findAll('span', attrs={'class': 'a-icon-alt'}):
product_json['star-rating'] = spans.text.strip()
break
# This block of code will help extract the number of customer reviews of the product
for spans in soup.findAll('span', attrs={'id': 'acrCustomerReviewText'
}):
if spans.text:
review_count = spans.text.strip()
product_json['customer-reviews-count'] = review_count
break
# This block of code will help extract top specifications and details of the product
product_json['details'] = []
for ul_tags in soup.findAll('ul',
attrs={'class': 'a-unordered-list a-vertical a-spacing-none'
}):
for li_tags in ul_tags.findAll('li'):
for spans in li_tags.findAll('span',
attrs={'class': 'a-list-item'}, text=True,
recursive=False):
product_json['details'].append(spans.text.strip())
# This block of code will help extract the short reviews of the product
product_json['short-reviews'] = []
for a_tags in soup.findAll('a',
attrs={'class': 'a-size-base a-link-normal review-title a-color-base a-text-bold'
}):
short_review = a_tags.text.strip()
product_json['short-reviews'].append(short_review)
# This block of code will help extract the long reviews of the product
product_json['long-reviews'] = []
for divs in soup.findAll('div', attrs={'data-hook': 'review-collapsed'
}):
long_review = divs.text.strip()
product_json['long-reviews'].append(long_review)
# Saving the scraped data in json format
with open('product.json', 'w') as outfile:
json.dump(product_json, outfile, indent=4)
print ('----------Extraction of data is complete. Check json file.----------')