In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd


In [2]:
# Path to the ChromeDriver executable
driver_path = r"C:\chromedriver-win64\chromedriver-win64\chromedriver.exe"
# Trustpilot page holding the reviews for www.amazon.com
url = "https://ca.trustpilot.com/review/www.amazon.com"


In [9]:
# --- Plain configuration values -------------------------------------------
# ChromeDriver executable handed to the Selenium Service below
driver_path = r'C:\chromedriver-win64\chromedriver-win64\chromedriver.exe'

# Page to scrape (Amazon reviews on Trustpilot)
url = 'https://ca.trustpilot.com/review/www.amazon.com'

# Accumulator for the extracted reviews (one dict per review)
review_data = []

# --- Browser start-up ------------------------------------------------------
# Chrome is started with the --headless argument
chrome_options = Options()
chrome_options.add_argument("--headless")

# Create the WebDriver from the driver executable and the options above
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Navigate to the target page
driver.get(url)

# Explicit-wait helper with a 10 second timeout; it does not block here,
# only when wait.until(...) is called later
wait = WebDriverWait(driver, 10)

# Recursive scroll function
def scroll_page(scroll_count, max_scrolls):
    """Scrape the reviews on the current page, then follow the 'Next' link.

    The function parses the page currently loaded in the module-level
    ``driver``, appends one dict per successfully parsed review to the
    module-level ``review_data`` list, and, while ``scroll_count`` is below
    ``max_scrolls``, clicks the pagination link and calls itself for the
    following page.

    Args:
        scroll_count: Number of 'Next' clicks performed so far.
        max_scrolls: Maximum number of 'Next' clicks to perform.

    Returns:
        None. Errors are reported with ``print`` instead of being raised.
    """
    try:
        # Re-fetch the reviews container on every page to avoid a stale element
        reviews_container = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "styles_reviewsContainer__3_GQw")))

        # Bring the container into view and give the page time to render
        driver.execute_script("arguments[0].scrollIntoView();", reviews_container)
        time.sleep(2)  # Adjust sleep if necessary to allow reviews to load

        # Parse the rendered page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        reviews = soup.find_all('div', {'class': 'styles_reviewCard__9HxJJ'})
        print(f"Number of reviews found: {len(reviews)}")

        for review in reviews:
            try:
                record = _extract_review(review)
                _print_review(record)
                review_data.append(record)
            except Exception as e:
                # A card with an unexpected layout is skipped, not fatal
                print(f"Error extracting review data: {e}")

        # Stop once the requested number of page changes has been made
        if scroll_count >= max_scrolls:
            return

        try:
            next_link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a.button_button__T34Lr.pagination-link_next__SDNU4")))

            # Scroll the link into view before clicking it
            driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
            time.sleep(2)  # Allow the page to settle after the scroll

            try:
                next_link.click()
            except Exception:
                # Fall back to a JavaScript click if the normal click fails.
                # `except Exception` (not a bare except) so Ctrl-C still works.
                driver.execute_script("arguments[0].click();", next_link)

            print("Next page link clicked.")
            time.sleep(3)  # Wait for the next page of reviews to load

            scroll_page(scroll_count + 1, max_scrolls)

        except Exception as e:
            print(f"No more pages or next link not found: {e}")

    except Exception as e:
        print(f"Error while scrolling: {e}")
        return


def _extract_review(review):
    """Build the record dict for one parsed review card.

    Consumer review count and country fall back to 'Not Available' when
    their elements are absent; any other missing element raises (typically
    AttributeError on ``None``) and is handled by the caller.
    """
    consumer_info = review.find('aside', {'class': 'styles_consumerInfoWrapper__KP3Ra'})
    consumer_details = consumer_info.find('div', {'class': 'styles_consumerDetailsWrapper__p2wdr'})

    consumer_name = consumer_details.find('a').find('span', {'class': 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'}).text

    # Look the extra-details wrapper up once; it may be missing entirely
    extra_details = consumer_details.find('div', {'class':'styles_consumerExtraDetails__fxS4S'})

    review_no_element = None
    country_element = None
    if extra_details is not None:
        review_no_element = extra_details.find('span', 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l')
        country_element = extra_details.find('div', 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__Fo_ua')

    consumer_review_no = review_no_element.text if review_no_element is not None else 'Not Available'

    country_span = country_element.find('span') if country_element is not None else None
    consumer_country = country_span.text if country_span is not None else 'Not Available'

    # Rating and upload date both live in the review header
    consumer_review_info = review.find('section', {'class':'styles_reviewContentwrapper__zH_9M'})
    review_header = consumer_review_info.find('div', {'class':'styles_reviewHeader__iU9Px'})

    consumer_rating = review_header.find('div', {'class': 'star-rating_starRating__4rrcf star-rating_medium__iN6Ty'}).find('img')
    rating_text = consumer_rating.get('alt')

    review_uploaded = review_header.find('div', {'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_datesWrapper__RCEKH'}).find('time')
    review_uploaded_info = review_uploaded.get('title')

    # Heading and body text both live in the review content block
    review_content = review.find('div', {'class':'styles_reviewContent__0Q2Tg'})

    review_heading = review_content.find('a', {'class':'link_internal__7XN06 typography_appearance-default__AAY17 typography_color-inherit__TlgPO link_link__IZzHN link_notUnderlined__szqki'}).find('h2', {'class':'typography_heading-s__f7029 typography_appearance-default__AAY17'}).text

    review_para = review_content.find('p', {'class':'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'}).text

    return {
        'consumer_name': consumer_name,
        'consumer_review': consumer_review_no,
        'consumer_country': consumer_country,
        'date': review_uploaded_info,
        'rating' : rating_text,
        'review_topic': review_heading,
        'review_content': review_para
    }


def _print_review(record):
    """Print one review record in the notebook's progress-log layout."""
    print(f"Consumer Name: {record['consumer_name']}")
    print(f"Consumer Review Number: {record['consumer_review']}")
    print(f"Consumer Country: {record['consumer_country']}")
    print("                                           ")
    print(record['date'])
    print(f"{record['rating']}")
    print("                        ")
    print(record['review_topic'])
    print(record['review_content'])
    print("----------------------------------")

# Start the scraping process. The driver is closed in `finally` so the Chrome
# process is released even if the run is interrupted or raises.
try:
    scroll_page(scroll_count=0, max_scrolls=5)
finally:
    # Close the driver after all scrolling and reviews have been processed
    driver.quit()

# Convert the list of reviews into a DataFrame
df = pd.DataFrame(review_data)

# Show the DataFrame
print(df)

# Save the reviews to a CSV file
df.to_csv('amazon_reviews_trustpilot.csv', index=False)

print("Scraping complete. Data saved to 'amazon_reviews_trustpilot.csv'.")


NameError: name 'Options' is not defined

In [None]:
81e5cffba460ad297270a9c1a1cbdc87cdf5b0861349050531603234


In [1]:
from google.cloud import language_v1
from google.oauth2 import service_account
import pandas as pd

# Input locations: the scraped reviews CSV and the service account key file
file_path = 'amazon_reviews_trustpilot.csv'
key_path = r"C:\ML\neural-cirrus-437807-t9-29c137016ba4.json"

# Read the reviews into a DataFrame
df = pd.read_csv(file_path)

# Build credentials from the JSON key file, then create the Google NLP
# client with those credentials
credentials = service_account.Credentials.from_service_account_file(key_path)
client = language_v1.LanguageServiceClient(credentials=credentials)

# Function to analyze sentiment, entities, and other NLP features for a review
def analyze_review(text):
    """Run Google Cloud Natural Language analyses on one review text.

    Uses the module-level ``client`` to request document and sentence
    sentiment, entity sentiment, syntax tokens and (best effort) text
    classification.

    Args:
        text: The review text. A non-string or blank value (for example the
            NaN pandas produces for an empty CSV field) is not sent to the
            API.

    Returns:
        dict with the keys 'sentiment_score', 'sentiment_magnitude',
        'entities', 'entity_sentiments', 'review_class', 'positive_reasons',
        'negative_reasons', 'syntax_tokens', 'detected_language' and
        'classifications'. For a missing/blank ``text`` the scores,
        'review_class' and 'detected_language' are None and the remaining
        values are empty.
    """
    # Nothing to analyze: return an empty result instead of failing on the
    # Document construction / API call.
    if not isinstance(text, str) or not text.strip():
        return {
            'sentiment_score': None,
            'sentiment_magnitude': None,
            'entities': [],
            'entity_sentiments': [],
            'review_class': None,
            'positive_reasons': "",
            'negative_reasons': "",
            'syntax_tokens': [],
            'detected_language': None,
            'classifications': []
        }

    # Prepare document
    document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)

    # Get sentiment analysis
    sentiment_response = client.analyze_sentiment(request={'document': document})
    sentiment = sentiment_response.document_sentiment

    # Split sentences by the sign of their sentiment score
    # (a score of exactly 0 is grouped with the positive sentences)
    positive_sentences = []
    negative_sentences = []
    for sentence in sentiment_response.sentences:
        if sentence.sentiment.score >= 0:
            positive_sentences.append(sentence.text.content)
        else:
            negative_sentences.append(sentence.text.content)

    # Get entity sentiment analysis (entities and their sentiment)
    entity_sentiment_response = client.analyze_entity_sentiment(request={'document': document})
    entity_sentiments = [
        {
            'name': entity.name,
            'type': language_v1.Entity.Type(entity.type_).name,
            'salience': entity.salience,
            'sentiment_score': entity.sentiment.score,
            'sentiment_magnitude': entity.sentiment.magnitude
        }
        for entity in entity_sentiment_response.entities
    ]

    # Get syntax analysis (extracting part-of-speech tokens)
    syntax_response = client.analyze_syntax(request={'document': document})
    syntax_tokens = [
        {
            'text': token.text.content,
            'part_of_speech': language_v1.PartOfSpeech.Tag(token.part_of_speech.tag).name
        }
        for token in syntax_response.tokens
    ]

    # The sentiment response already reports the document language, so no
    # separate analyze_entities request is needed just to read it.
    detected_language = sentiment_response.language

    # Classify text (optional): any failure of the classification request is
    # treated as "no categories" rather than aborting the whole analysis.
    try:
        classify_response = client.classify_text(request={'document': document})
        classifications = [
            {
                'category': category.name,
                'confidence': category.confidence
            }
            for category in classify_response.categories
        ]
    except Exception:
        classifications = []

    # Classify review as positive or negative based on the overall sentiment score
    review_class = 'Positive' if sentiment.score >= 0 else 'Negative'

    return {
        'sentiment_score': sentiment.score,
        'sentiment_magnitude': sentiment.magnitude,
        'entities': [entity['name'] for entity in entity_sentiments],
        'entity_sentiments': entity_sentiments,
        'review_class': review_class,
        'positive_reasons': " ".join(positive_sentences),
        'negative_reasons': " ".join(negative_sentences),
        'syntax_tokens': syntax_tokens,
        'detected_language': detected_language,
        'classifications': classifications
    }

# Result columns, listed in the order they are added to the frame and filled
nlp_columns = [
    'sentiment_score',
    'sentiment_magnitude',
    'entities',
    'entity_sentiments',
    'review_class',
    'positive_reasons',
    'negative_reasons',
    'syntax_tokens',
    'detected_language',
    'classifications',
]

# Create every result column up front, initialised to None
for column in nlp_columns:
    df[column] = None

# Analyze each review and copy its results into the matching row
for i, row in df.iterrows():
    review_text = row['review_content']
    analysis_result = analyze_review(review_text)

    for column in nlp_columns:
        value = analysis_result[column]
        if column == 'entities':
            # Entity names are stored as one comma-separated string
            value = ", ".join(value)
        df.at[i, column] = value

# Save the updated dataset with all the NLP analysis
output_file = 'amazon_reviews_with_full_nlp_analysis.csv'
df.to_csv(output_file, index=False)

# Display the first few rows of the updated dataframe
print(df.head())


   consumer_name consumer_review consumer_country  \
0       Lucy Loo       8 reviews               GB   
1  Jeffrey Bruce       5 reviews               GB   
2           Jdub       7 reviews               US   
3           Glyn       5 reviews               GB   
4      Ninja Ape       9 reviews               DE   

                                          date                  rating  \
0    Friday, October 11, 2024 at 09:57:32 a.m.  Rated 1 out of 5 stars   
1  Thursday, October 10, 2024 at 05:09:02 p.m.  Rated 1 out of 5 stars   
2  Thursday, October 10, 2024 at 05:26:49 p.m.  Rated 1 out of 5 stars   
3  Thursday, October 10, 2024 at 07:22:45 p.m.  Rated 1 out of 5 stars   
4  Wednesday, October 9, 2024 at 05:44:08 p.m.  Rated 1 out of 5 stars   

                               review_topic  \
0                          Driver stealing    
1  Amazon are quick enough to take payment…   
2       Got severely ill of a supplement I…   
3                   NO stars is too many!!!   
4