### Trustpilot Reviews Extraction

In [1]:
import requests
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import json
import re
import pandas as pd

class TrustpilotReviewScraper:
    """Scrape the reviews embedded in a Trustpilot (UK) company review page.

    The page at ``url`` is downloaded, the text content of one embedded
    script-like element is pulled out, and that text is scanned for the
    review fields (author, author URL, date, body, rating).  The result is
    exposed as a ``pandas.DataFrame`` with one row per review.

    NOTE(review): the keys searched for (``datePublished``, ``reviewBody``,
    ``ratingValue``) look like a schema.org JSON-LD payload; parsing it as
    JSON would be sturdier than text scanning, but the document layout is
    not visible from this file -- confirm against a live page first.

    Args:
        url: Address of the review page, expected to start with
            ``https://uk.trustpilot.com/review/``.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    # Index, among the elements matched in parse_script_data(), of the one
    # carrying the review payload.  NOTE(review): presumably determined by
    # inspecting a live page; a page layout change can invalidate it.
    SCRIPT_INDEX = 6

    # An author name immediately followed by that author's profile URL.
    # Capturing both in one match keeps the "User" and "UserURL" columns
    # paired, whatever the length of the name.  The name group accepts
    # backslash-escaped characters so an escaped quote does not end it early.
    _AUTHOR_PATTERN = re.compile(
        r'"name":"((?:[^"\\]|\\.)*)","url":"'
        r'(https://uk\.trustpilot\.com/users/[0-9a-z/]*)'
    )

    # Output column names.  The "Hausing" spelling is kept as-is because
    # existing consumers may select columns by name.
    _COLUMNS = [
        "HausingAssociation_web",
        "DataSource",
        "User",
        "UserURL",
        "DatePublished",
        "ReviewBody",
        "RatingValue",
    ]

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout
        self.soup = None          # BeautifulSoup tree, set by fetch_page()
        self.raw_data = None      # {"TheData": <script text>}, set by parse_script_data()
        self.reviews_data = None  # DataFrame, set by extract_reviews_data()

    class MyHTMLParser(HTMLParser):
        """Collect every text chunk reported while parsing, in order."""

        def __init__(self):
            super().__init__()
            self.data = []

        def handle_data(self, data):
            self.data.append(data)

    def fetch_page(self):
        """Download ``self.url`` and store the parsed document in ``self.soup``.

        Raises:
            Exception: If the server answers with a status other than 200.
                Errors raised by ``requests`` itself (including a timeout)
                propagate unchanged.
        """
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(self.url, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch the page: {response.status_code}")
        self.soup = BeautifulSoup(response.text, 'html.parser')

    def parse_script_data(self):
        """Store the text of the payload element in ``self.raw_data``.

        ``self.raw_data`` becomes ``{"TheData": <text>}``.

        Raises:
            Exception: If the page has not been fetched yet, if fewer than
                ``SCRIPT_INDEX + 1`` candidate elements exist, or if the
                selected element contains no text.
        """
        if self.soup is None:
            raise Exception("Page has not been fetched")

        # The compiled pattern is searched against tag names (not
        # full-matched), so tags such as <noscript> are counted as well;
        # SCRIPT_INDEX relies on that count, hence the filter is kept as-is.
        script_elements = self.soup.find_all(re.compile('script'))
        if len(script_elements) <= self.SCRIPT_INDEX:
            raise Exception("Expected script data not found")

        parser = self.MyHTMLParser()
        parser.feed(str(script_elements[self.SCRIPT_INDEX]))

        # HTMLParser may deliver script text in several chunks (for example
        # around an embedded "</..." sequence); keeping only the first chunk
        # would truncate the payload, so all chunks are joined.
        text = "".join(parser.data)
        if not text:
            raise Exception("Expected script data not found")
        self.raw_data = {"TheData": text}

    def extract_reviews_data(self):
        """Scan the raw payload and build ``self.reviews_data``.

        The DataFrame has one row per review and the columns listed in
        ``_COLUMNS``.  All values are strings taken verbatim from the
        payload (escape sequences are not decoded).  Dates, bodies and
        ratings are matched to authors by position; rows are cut at the
        shortest of the extracted lists.

        Raises:
            Exception: If ``self.raw_data`` has not been populated.
        """
        if not self.raw_data:
            raise Exception("Raw data is not available for extraction")

        string_data = self.raw_data["TheData"]

        # Author name and profile URL come from the same match so they can
        # never be attributed to different reviews.
        authors = self._AUTHOR_PATTERN.findall(string_data)
        users = [name for name, _ in authors]
        user_urls = [profile_url for _, profile_url in authors]

        # Skip the 3 characters after the key (presumably '":"') and take a
        # fixed 24-character value, e.g. '2023-01-01T10:00:00.000Z'.
        dates = [
            string_data[found.end() + 3:found.end() + 27]
            for found in re.finditer("datePublished", string_data)
        ]

        # Everything between a "reviewBody" key and the next "reviewRating"
        # key.  NOTE(review): with a compact '"reviewBody":"...","reviewRating"'
        # layout these offsets keep the surrounding double quotes; left
        # unchanged to preserve existing output -- confirm whether intended.
        review_bodies = [
            string_data[body.end() + 2:rating.start() - 2]
            for body, rating in zip(
                re.finditer("reviewBody", string_data),
                re.finditer("reviewRating", string_data),
            )
        ]

        # One character per rating.  The first occurrence is discarded --
        # presumably the page-level aggregate rating rather than a review.
        ratings = [
            string_data[found.end() + 3:found.end() + 4]
            for found in re.finditer("ratingValue", string_data)
        ][1:]

        # Reviewed site: the URL part after the review prefix, minus any
        # query string.
        site = self.url.partition("https://uk.trustpilot.com/review/")[2].partition("?")[0]
        row_count = len(user_urls)

        self.reviews_data = pd.DataFrame(
            list(
                zip(
                    [site] * row_count,
                    [self.url] * row_count,
                    users,
                    user_urls,
                    dates,
                    review_bodies,
                    ratings,
                )
            ),
            columns=self._COLUMNS,
        )

    def get_reviews(self):
        """Run fetch, parse and extract in order and return the DataFrame."""
        self.fetch_page()
        self.parse_script_data()
        self.extract_reviews_data()
        return self.reviews_data

# Usage: scrape one company's review page and print the resulting table.
if __name__ == "__main__":
    # Review page to scrape; the scraper derives the reviewed site from the
    # part of this URL after "https://uk.trustpilot.com/review/".
    URL = 'https://uk.trustpilot.com/review/placesforpeople.co.uk'
    scraper = TrustpilotReviewScraper(URL)
    # get_reviews() performs the HTTP request, parses the page and returns
    # the reviews as a pandas DataFrame.
    reviews_df = scraper.get_reviews()
    print(reviews_df)

   HausingAssociation_web                                         DataSource  \
0   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
1   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
2   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
3   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
4   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
5   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
6   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
7   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
8   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
9   placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
10  placesforpeople.co.uk  https://uk.trustpilot.com/review/placesforpeop...   
11  placesforpeople.co.uk  https://uk.tr