In [None]:
import os  # Used to read the MongoDB connection string from the environment
import re # Used for regular expressions, particularly for matching class names
import time  # Used to add a delay between requests to be polite to the website
import urllib.parse # Used for parsing URLs, specifically for joining base URL with relative links

import nltk # Natural Language Toolkit, used for text processing like tokenization
import requests # Used to make HTTP requests to fetch web pages
from bs4 import BeautifulSoup # Used for parsing HTML content
from nltk.tokenize import sent_tokenize, word_tokenize # Specific NLTK modules for tokenization
from pymongo import MongoClient # Used to interact with MongoDB database

# --- NLTK Data Download Check ---
# Make sure the 'punkt' tokenizer data used by sent_tokenize/word_tokenize is
# installed, downloading it when it is missing (normally only on the first run).
print("Checking NLTK data...")
try:
    # nltk.data.find raises LookupError when the resource is not installed.
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # LookupError is the only exception handled here. nltk.downloader has no
    # DownloadError attribute, so naming it in an except clause would itself
    # raise AttributeError at the very moment the data turns out to be missing.
    print("Downloading NLTK 'punkt' tokenizer...")
    if not nltk.download('punkt'):
        # nltk.download reports failure through its return value, not an exception.
        print("Warning: NLTK 'punkt' download failed; tokenization may not work.")
# NOTE(review): newer NLTK releases appear to load 'punkt_tab' rather than 'punkt';
# confirm against the installed NLTK version and extend this check if needed.
print("NLTK data check complete.")


# --- MongoDB setup ---
# Sets up the connection to the MongoDB database that stores the scraped reviews.
# The connection string is taken from the MONGO_CONNECTION_STRING environment
# variable when it is set; otherwise the literal below is used.
# NOTE(review): the fallback embeds credentials in the source file. Rotate them and
# drop the fallback once the environment variable is configured where this runs.
MONGO_CONNECTION_STRING = os.environ.get(
    "MONGO_CONNECTION_STRING",
    "mongodb+srv://root:admin@cluster0.lfqdtch.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
)
DB_NAME = 'trustpilot_reviews' # The name of the database to use
COLLECTION_NAME = 'partsofcanada_reviews' # The name of the collection to store reviews in

print(f"\nConnecting to MongoDB: {DB_NAME}.{COLLECTION_NAME}...")
try:
    # NOTE(review): tlsAllowInvalidCertificates=True disables certificate validation
    # and is insecure; it is kept so existing environments keep connecting. Prefer a
    # proper CA bundle and remove the flag when possible.
    client = MongoClient(
        MONGO_CONNECTION_STRING,
        tls=True,
        tlsAllowInvalidCertificates=True
    )
    # Handles to the database and collection used by the rest of the script
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    # MongoClient connects lazily, so force a round trip to surface connection
    # problems here. 'ping' replaces the deprecated 'ismaster' command.
    client.admin.command('ping')
    print("MongoDB connection successful.")
except Exception as e:
    # Nothing below can work without the database: report and stop with a non-zero
    # exit status. SystemExit is raised directly because the exit() helper is only
    # injected by the site module and exits with status 0 when called without args.
    print(f"Error connecting to MongoDB: {e}")
    raise SystemExit(1)


# --- Trustpilot Configuration ---
# Address of the company's review listing on Trustpilot; every page that gets
# scraped is derived from this value.
base_url = "https://www.trustpilot.com/review/partsofcanada.com"

# HTTP headers attached to each request. The User-Agent presents the script as a
# desktop browser, which can be required to avoid being blocked by the website.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# --- Scraping Function ---
# Patterns are compiled once at import time instead of on every review.
# The class names may carry a dynamic suffix (e.g. typography_body-l__abcde), so the
# card and header patterns match on the stable prefix only rather than pinning one
# specific suffix that stops matching as soon as the site is rebuilt.
_REVIEW_CARD_CLASS = re.compile(r'^styles_reviewCard__')
_REVIEW_HEADER_CLASS = re.compile(r'^styles_reviewHeader__')
_REVIEW_TEXT_CLASS = re.compile(r'typography_body-l__.*')
_STAR_RATING_CLASS = re.compile(r'star-rating_starRating.*')
_REVIEWER_NAME_CLASS = re.compile(r'typography_heading-xs__.*|consumer-information__name')
_RATING_ALT = re.compile(r'Rated (\d) out of 5 stars')


def _print_hyperlinks(soup, url, page_number):
    """Print every hyperlink found on the page as an absolute URL (debugging aid)."""
    print(f"\n--- Found Hyperlinks on Page {page_number} ---")
    all_links = soup.find_all('a')
    if not all_links:
        print("No hyperlinks found on this page.")
    for link in all_links:
        href = link.get('href')
        if href:  # Skip anchors without an href or with an empty one
            # Relative links are resolved against the URL of the page itself
            print(urllib.parse.urljoin(url, href))
    print(f"--- End Hyperlinks on Page {page_number} ---\n")


def _extract_rating(review):
    """Return the digit from the 'Rated X out of 5 stars' image of a review, or None."""
    header = review.find('div', class_=_REVIEW_HEADER_CLASS)
    if header is None:
        return None
    rating_tag = header.find('div', class_=_STAR_RATING_CLASS)
    if rating_tag is None:
        return None
    img_tag = rating_tag.find('img', alt=_RATING_ALT)
    if img_tag is None:
        return None
    rating_match = _RATING_ALT.search(img_tag.get('alt', ''))
    return int(rating_match.group(1)) if rating_match else None


def _extract_reviewer_name(review):
    """Return the reviewer's name from a review block, or "Anonymous" if not found."""
    # Prefer the data attribute; fall back to the name/heading typography classes.
    reviewer_tag = review.find('span', attrs={"data-consumer-name-typography": "true"})
    if reviewer_tag is None:
        reviewer_tag = review.find('span', class_=_REVIEWER_NAME_CLASS)
    return reviewer_tag.text.strip() if reviewer_tag is not None else "Anonymous"


def _parse_review(review, url, page_number):
    """Build the document stored for a single review block."""
    review_text_tag = review.find('p', class_=_REVIEW_TEXT_CLASS)
    review_text = review_text_tag.text.strip() if review_text_tag is not None else ""
    # Tokenize only when there is text; both lists stay empty otherwise.
    sentences = sent_tokenize(review_text) if review_text else []
    return {
        "review_text": review_text,
        "sentences": sentences,
        "tokenized_words": [word_tokenize(sentence) for sentence in sentences],
        "rating": _extract_rating(review),
        "reviewer_name": _extract_reviewer_name(review),
        "source_url": url,  # The URL the review was scraped from
        "page_number": page_number,
        "scrape_timestamp": time.time(),  # Seconds since the epoch at scrape time
    }


def scrape_trustpilot_reviews(url, page_number, timeout=30):
    """
    Scrapes reviews from a specific Trustpilot review page URL.

    Args:
        url (str): The full URL of the review page (including page number).
        page_number (int): The current page number being scraped.
        timeout (float): Seconds to wait for the server before giving up on the
            request. Without a timeout a stalled connection would block forever.

    Returns:
        list: A list of dictionaries, where each dictionary represents a review
              (keys: review_text, sentences, tokenized_words, rating,
              reviewer_name, source_url, page_number, scrape_timestamp),
              or None if a 404 error is encountered (indicating end of pages),
              or an empty list if other errors occur but scraping should continue.
    """
    print(f"Fetching URL: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Raise an exception for bad status codes (400s or 500s)
        response.raise_for_status()
        print(f"Parsing HTML for page {page_number}...")

        soup = BeautifulSoup(response.text, "html.parser")
        _print_hyperlinks(soup, url, page_number)

        # Each review is rendered as an <article> carrying the review-card class.
        review_blocks = soup.find_all('article', class_=_REVIEW_CARD_CLASS)
        print(f"Found {len(review_blocks)} review blocks on page {page_number}.")

        return [_parse_review(review, url, page_number) for review in review_blocks]

    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred on page {page_number}: {e}")
        # Read the status from the exception rather than the local `response`,
        # which is unbound if the error was raised before the assignment completed.
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 404:
            # A 404 usually means we have run past the last page of reviews.
            print("Encountered 404 error, likely reached the end of reviews. Stopping.")
            return None  # Signals the caller to stop paging
        return []
    except requests.exceptions.RequestException as e:
        # Network problems, timeouts, etc.: skip this page but keep scraping
        print(f"Error fetching page {page_number}: {e}")
        return []
    except Exception as e:
        # Deliberate best-effort boundary: a parsing/tokenizing failure on one page
        # must not abort the whole run.
        print(f"Error parsing page {page_number}: {e}")
        return []

# --- Main Execution Block ---
def insert_new_reviews(collection, reviews):
    """Insert the reviews that are not yet stored and return how many were inserted.

    A review is treated as already stored when the collection holds a document
    with the same "review_text", "reviewer_name" and "source_url". This keeps
    re-runs over already scraped pages from inserting the same review again.
    Note: a unique review ID, if available, would make a more robust key.

    Args:
        collection: Object exposing find_one(query) and insert_many(documents),
            such as a pymongo collection.
        reviews (list): Review dictionaries as returned by the scraping function.

    Returns:
        int: Number of documents inserted (0 when every review already existed).
    """
    new_reviews_to_insert = [
        review
        for review in reviews
        if not collection.find_one({
            "review_text": review["review_text"],
            "reviewer_name": review["reviewer_name"],
            "source_url": review["source_url"],
        })
    ]
    if not new_reviews_to_insert:
        return 0
    result = collection.insert_many(new_reviews_to_insert)
    return len(result.inserted_ids)


# This block runs only when the script is executed directly (not imported as a module).
if __name__ == "__main__":
    total_reviews_inserted = 0 # Counter for the total number of new reviews inserted
    num_pages_to_scrape = 9 # Set the maximum number of pages to attempt to scrape. Adjust as needed.

    try:
        for page_number in range(1, num_pages_to_scrape + 1):
            # Construct the full URL for the current page
            url = f"{base_url}?page={page_number}"
            reviews = scrape_trustpilot_reviews(url, page_number)

            if reviews is None:
                # None means a 404 was hit: there are no further pages to fetch
                print(f"Stopping scraping as page {page_number} could not be fetched or indicates end of reviews.")
                break

            if reviews:
                try:
                    inserted_count = insert_new_reviews(collection, reviews)
                except Exception as e:
                    # A database failure on one page should not abort the run
                    print(f"Error inserting reviews from page {page_number} into MongoDB: {e}")
                else:
                    if inserted_count:
                        total_reviews_inserted += inserted_count
                        print(f"Inserted {inserted_count} new reviews from page {page_number} into MongoDB.")
                    else:
                        # Every review scraped from this page was already stored
                        print(f"No new reviews to insert from page {page_number}.")

            # --- Polite Scraping Delay ---
            # Pause between requests to reduce load on the website's server.
            # There is nothing left to fetch after the final page, so skip it there.
            if page_number < num_pages_to_scrape:
                time.sleep(3)

        # --- Final Summary ---
        print(f"\n✅ Finished processing. {total_reviews_inserted} total new reviews inserted into the database.")
    finally:
        # --- Close MongoDB Connection ---
        # Runs even if the loop is interrupted or raises, so the connection is
        # always released.
        client.close()
        print("MongoDB connection closed.")


Checking NLTK data...
NLTK data check complete.

Connecting to MongoDB: trustpilot_reviews.partsofcanada_reviews...
MongoDB connection successful.
Fetching URL: https://www.trustpilot.com/review/partsofcanada.com?page=1
Parsing HTML for page 1...

--- Found Hyperlinks on Page 1 ---
https://www.trustpilot.com/
https://www.trustpilot.com/review/partselect.ca
https://www.trustpilot.com/review/pilot.io
https://www.trustpilot.com/review/neilgilbert.ca
https://www.trustpilot.com/categories
https://www.trustpilot.com/blog
https://www.trustpilot.com/users/connect?redirect=&source_cta=header
https://business.trustpilot.com
https://business.trustpilot.com
https://www.trustpilot.com/users/connect?redirect=&source_cta=header
https://www.trustpilot.com/categories
https://www.trustpilot.com/blog
https://www.trustpilot.com/categories/electronics_technology
https://www.trustpilot.com/categories/appliances_electronics
https://www.trustpilot.com/categories/appliance_store
http://partsofcanada.com
https: