# 1. Get Reviews from Amazon

## 1.1 Set up Beautiful Soup and Requests

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [6]:
# Set up hedder to Scrape Amazon
HEADERS = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/90.0.4430.212 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})

In [7]:
url = r'https://www.amazon.de/M%C3%9CHLE-10er-Rasierklingen-traditionelle-Sicherheitsrasierer/product-reviews/B00TQLHOTO/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'

In [8]:
# Get raw html data with Requests
html = requests.get(url, headers=HEADERS)

In [9]:
# Check html Status code (200 = ok)
html.status_code

200

## 1.2 Convert Text to Soup

In [10]:
# Create Soup object from html Data
soup = BeautifulSoup(html.text, 'html.parser')

In [11]:
# Find user reviews in the data
results = soup.find_all('span', {'data-hook': 'review-body'})
print(results[0].text)


Top Produkt mit einer sehr gute Qualität



In [12]:
len(results)

10

## 1.3 Scraping Multible Pages

In [14]:
# Construct list of links to scrape multiple pages
links = []
for x in range(14):
    links.append(f'https://www.amazon.de/M%C3%9CHLE-10er-Rasierklingen-traditionelle-Sicherheitsrasierer/product-reviews/B00TQLHOTO/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews&pageNumber={x}')

In [15]:
# Scrape all links in the constructed list
reviews = []
for link in links:
    html = requests.get(link, headers=HEADERS)
    if html.status_code == 200:
        # HTML response was sucssesfull
        soup = BeautifulSoup(html.text, 'html.parser')
        results = soup.find_all('span', {'data-hook': 'review-body'})
        for review in results:
            reviews.append(review.text.replace('\n', ''))
    else:
        # HTML response was unsuccsessfull
        print('[BAD HTML RESPONSE] Response Code =', html.status_code)            

In [16]:
len(reviews)

139

# 2. Translate Reviews

## 2.1 Determine Language of Reviews

In [171]:
!pip install detectlanguage pandas

Collecting pandas
  Using cached pandas-1.5.3-cp310-cp310-win_amd64.whl (10.4 MB)
Collecting numpy>=1.21.0
  Using cached numpy-1.24.1-cp310-cp310-win_amd64.whl (14.8 MB)
Collecting pytz>=2020.1
  Using cached pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.24.1 pandas-1.5.3 pytz-2022.7.1


In [17]:
import detectlanguage
import pandas as pd

In [18]:
# Setup the API key
detectlanguage.configuration.api_key = "eafb77af54089e8edcefc285c09ecd46"

In [19]:
# Test API to detect languages
review = reviews[120]
language = detectlanguage.detect(review)
print(review, '|' ,language[0]['language'])

Pacco arrivato per tempo, grazie alla puntualità di amazon. Prodotto valido per un ottimo compromesso di affilatura e durata. Utilizzato sia con un rasoio di sicurezza Merkur Future 700 che con un rasoio closed comb di muhle. Consigliato | it


In [21]:
# Detect languges of all reviews
languages = []
for review in reviews:
    language = detectlanguage.detect(review)
    languages.append(language[0]['language'])

In [22]:
# Construct Data frame containing review and Language
df = pd.DataFrame({'Review': reviews, 'Language': languages})

In [179]:
df['Language'].value_counts()

de    95
it    17
fr    13
es     8
en     6
Name: Language, dtype: int64

In [23]:
# Save df as csv
df.to_csv('Data/Reviews.csv')

## 2.2 Translate reviews

In [31]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.13.0-py3-none-any.whl (37 kB)
Installing collected packages: deepl
Successfully installed deepl-1.13.0


In [67]:
import deepl

In [68]:
# Read csv
df = pd.read_csv('Data/Reviews.csv', index_col=0)

In [69]:
deepl_auth_key = '850a0a6f-bf39-3786-c632-d7e3868e217f:fx'
translator = deepl.Translator(deepl_auth_key)

In [70]:
review = df.iloc[0,0]
language = df.iloc[0,1]
translated = translator.translate_text(review, target_lang='en-gb')
print(review, '|' ,translated)

Top Produkt mit einer sehr gute Qualität | Top product with a very good quality


In [71]:
# Translate reviews
translations = []
for row in df.iterrows():
    review = row[1][0]
    language = row[1][1]
    if language == 'en':
        tranlation = review
    else:
        translation = translator.translate_text(review, target_lang='en-gb', source_lang=language)
        
    translations.append(translation)

In [72]:
# Add Translation to dataframe
df['Translated'] = translations

In [73]:
df.to_csv('Data/Reviews_Translated.csv')

# 3. Analysing the Reviews

## 3.1 Callculating Text Metrics

In [87]:
df = pd.read_csv('Data/Reviews_Translated.csv', index_col=0)

In [88]:
df.head()

Unnamed: 0,Review,Language,Translated
0,Top Produkt mit einer sehr gute Qualität,de,Top product with a very good quality
1,"Hallo,Ich hab das Produkt für meinen Rasierhob...",de,"Hello,I bought this product for my razor. The ..."
2,Da ich mich länger mit Rasiermesser und Rasier...,de,Since I've been shaving with a razor and strai...
3,Klingen laufen gut und sind scharf für 2 Rasur...,de,"Blades run well and are sharp for 2 shaves, th..."
4,Bei mir halten die mindestens 10 Mal wobei die...,de,"For me, they last at least 10 times, whereas t..."


In [92]:
df['Word_Count'] = df['Translated'].apply(lambda x: len(str(x).split()))
df['Char_Count'] = df['Translated'].str.len()

In [107]:
df[df['Word_Count'] == 113]['Translated'][8]

"We are beginners with the razor and are now trying out the blades from various manufacturers. I, a woman, am completely satisfied with the blades. I use them with a little shaving foam for all parts of the body. They are very sharp and don't dull quickly. The packaging is also good, you can always slide out a single blade. My boyfriend also thinks they are great and definitely better than those from the cheap manufacturers, but he is a bit demanding. He gets a bit twitchy when shaving his beard, so he's looking a bit further. All in all, it's great quality, but for the price, just try it out for yourself."

In [111]:
df[df['Word_Count'] == 1]

Unnamed: 0,Review,Language,Translated,Word_Count,Char_Count
59,Perfekt,de,Perfect,1,7
70,Qualität,de,Quality,1,7
71,zufrieden!,de,satisfied!,1,10
110,Perfetto,it,Perfect,1,7


## 3.2 Counting Stopwords

In [112]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leonr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
stop_words = stopwords.words('english')

In [116]:
df['Stop-Words'] = df['Translated'].apply(lambda x: len([x for x in x.split() if x in stop_words]))

In [117]:
df.head()

Unnamed: 0,Review,Language,Translated,Word_Count,Char_Count,Stop-Words
0,Top Produkt mit einer sehr gute Qualität,de,Top product with a very good quality,7,36,3
1,"Hallo,Ich hab das Produkt für meinen Rasierhob...",de,"Hello,I bought this product for my razor. The ...",107,635,44
2,Da ich mich länger mit Rasiermesser und Rasier...,de,Since I've been shaving with a razor and strai...,102,554,45
3,Klingen laufen gut und sind scharf für 2 Rasur...,de,"Blades run well and are sharp for 2 shaves, th...",22,183,9
4,Bei mir halten die mindestens 10 Mal wobei die...,de,"For me, they last at least 10 times, whereas t...",20,98,7
