-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape1.py
28 lines (28 loc) · 1.39 KB
/
scrape1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Every review scraped so far, one dict per review. get_reviews() appends to
# this list and main() writes it to a spreadsheet once paging stops.
reviewlist = []


def get_soup(url, timeout=60):
    """Fetch *url* through the local rendering service and parse the result.

    The page is not requested directly: the URL is handed to the service at
    ``http://localhost:8050/render.html`` together with ``wait=2``, and the
    body of that service's response is parsed.
    NOTE(review): the endpoint and parameters look like the Splash HTTP API;
    confirm that is what is expected to be listening on port 8050.

    Args:
        url: Address of the page to render.
        timeout: Seconds ``requests`` waits for the rendering service before
            giving up. Without a timeout a stalled renderer would block the
            script forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend. The HTTP status code is not checked, so an
        error page is parsed like any other response.

    Raises:
        requests.RequestException: If the rendering service cannot be reached
            or does not answer within *timeout* seconds.
    """
    r = requests.get(
        'http://localhost:8050/render.html',
        params={'url': url, 'wait': 2},
        timeout=timeout,
    )
    return BeautifulSoup(r.text, 'html.parser')


def get_reviews(soup):
    """Append every parseable review found in *soup* to ``reviewlist``.

    Each ``div`` with ``data-hook="review"`` becomes a dict with the keys
    ``product``, ``title``, ``rating`` (a float) and ``body``. A review that
    is missing one of the expected elements, or whose rating text is not a
    number, is skipped on its own; the remaining reviews on the page are
    still collected.

    Args:
        soup: Parsed review page, as returned by ``get_soup``.

    Returns:
        None. Results are accumulated in the module-level ``reviewlist``.
    """
    # The product name comes from the page title and is the same for every
    # review on the page, so work it out once. A page without a <title>
    # yields an empty product name instead of discarding its reviews.
    title_tag = soup.title
    if title_tag is not None:
        product = title_tag.text.replace('Amazon.in:Customer reviews:', '').strip()
    else:
        product = ''

    for item in soup.find_all('div', {'data-hook': 'review'}):
        try:
            review = {
                'product': product,
                'title': item.find('a', {'data-hook': 'review-title'}).text.strip(),
                'rating': float(
                    item.find('i', {'data-hook': 'review-star-rating'})
                    .text.replace('out of 5 stars', '')
                    .strip()
                ),
                'body': item.find('span', {'data-hook': 'review-body'}).text.strip(),
            }
        except (AttributeError, ValueError):
            # AttributeError: find() returned None because an element is
            # missing. ValueError: the rating text could not be converted to
            # a float. Either way only this one review is dropped.
            continue
        reviewlist.append(review)


def main():
    """Scrape the review pages in order and save the results to Excel.

    Pages 1 to 998 are requested one after another. Paging stops early as
    soon as a page contains an ``li`` with class ``a-disabled a-last``
    (presumably the greyed-out "Next" button on the last page). The collected
    reviews are then written to ``Samsung_Galaxy_Z.xlsx``.
    """
    for page in range(1, 999):
        soup = get_soup(f'https://www.amazon.in/Samsung-Galaxy-Storage-Additional-Exchange/product-reviews/B086KFBNV5/ref=cm_cr_getr_d_paging_btm_prev_2?ie=UTF8&reviewerType=all_reviews&pageNumber={page}')
        print(f'Getting page: {page}')
        get_reviews(soup)
        print(len(reviewlist))
        if soup.find('li', {'class': 'a-disabled a-last'}):
            break

    df = pd.DataFrame(reviewlist)
    df.to_excel('Samsung_Galaxy_Z.xlsx', index=False)
    print('Fin.')


# Run the scrape only when executed as a script, so importing this module
# (e.g. to reuse get_reviews) does not start network requests.
if __name__ == '__main__':
    main()