### Trustpilot_review_extraction

"""
Trustpilot Review Scraper

This script allows you to scrape Trustpilot reviews for a given Housing Association or company. 
It can handle URLs in various formats, including those with or without the 'page' parameter. 
The scraper fetches review data from multiple pages, if available, and stores the results in a CSV file.

Usage:
1. Set the base URL for the company or Housing Association you want to scrape.
2. If the URL includes a 'page' parameter, the script generates URLs for pages 1 through that page
   number (so pass the last page you want); without a 'page' parameter only a single page is scraped.
3. Reviews are extracted, including user names, user links, publication dates, review bodies, and ratings.
4. The scraped data is saved to a CSV file for easy analysis.

Example:
- For a URL with 'page' and 'sort' parameters:
  url = 'https://uk.trustpilot.com/review/www.jigsawhomes.org.uk?page=8&sort=recency'

- For a URL without the 'page' parameter (only one page):
  url = 'https://uk.trustpilot.com/review/placesforpeople.co.uk'

- For a URL with 'sort' only (but no 'page'):
  url = 'https://uk.trustpilot.com/review/www.myclarionhousing.com?sort=recency'

To save the results to a CSV file, use the `save_reviews_to_csv` method with the desired file path.

Example:
  output_path = "YOURpath/yourhousinggroup.csv"
  scraper.save_reviews_to_csv(output_path)

Requirements:
- `requests`
- `beautifulsoup4`
- `pandas`

Ensure that these libraries are installed before running the script. You can install them using:
  pip install requests beautifulsoup4 pandas
"""

# In [1]:
import json
import re
from html.parser import HTMLParser
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup


class TrustpilotScraper:
    """Scrape the reviews of one company from Trustpilot review pages.

    The starting URL is expanded into a list of page URLs; each page is
    downloaded, the text of the embedded script holding the review data is
    extracted, and the individual fields are sliced out of that text.
    """

    # 0-based position of the script tag the review data has been read from
    # so far; kept as the preferred location for backward compatibility.
    _REVIEW_SCRIPT_INDEX = 6
    # Substring that identifies a script tag carrying review data.
    _REVIEW_MARKER = "reviewBody"
    # Seconds to wait for the server; requests applies no timeout by default.
    _REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Store the starting URL and pre-compute the page URLs to scrape."""
        self.url = url
        self.full_url_list = self.generate_url_list()

    def generate_url_list(self):
        """Build the list of page URLs to scrape from ``self.url``.

        If the query string has a ``page=N`` parameter, one URL is returned
        for every page from 1 to N, with ``page`` first and all other query
        parameters preserved in their original order. Without a ``page``
        parameter the original URL is returned unchanged as the only entry.

        Returns:
            list[str]: URLs to fetch, in page order.

        Raises:
            ValueError: if the ``page`` value is not an integer.
        """
        parts = urlsplit(self.url)
        params = parse_qsl(parts.query, keep_blank_values=True)
        page_values = [value for key, value in params if key == "page"]

        if not page_values:
            # Nothing to expand: scrape exactly the page that was given.
            return [self.url]

        try:
            last_page = int(page_values[-1])
        except ValueError as err:
            raise ValueError(
                f"Invalid 'page' value in URL: {self.url!r}"
            ) from err

        other_params = [(key, value) for key, value in params if key != "page"]
        return [
            urlunsplit(
                parts._replace(query=urlencode([("page", page)] + other_params))
            )
            for page in range(1, last_page + 1)
        ]

    def fetch_reviews(self, url):
        """Download ``url`` and return the review script text wrapped in a dict.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            IndexError: if no script tag with review data can be located.
        """
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # The regex matches every tag whose name contains "script"; this is
        # kept as-is so the positional index keeps its original meaning.
        script_tags = soup.find_all(re.compile('script'))
        return self.parse_json_data(str(self._select_review_script(script_tags)))

    def _select_review_script(self, script_tags):
        """Pick the tag that holds the review data from ``script_tags``.

        The tag at ``_REVIEW_SCRIPT_INDEX`` wins when it contains the review
        marker; otherwise the first tag containing the marker is used; if no
        tag contains it, the positional tag is returned as a last resort.

        Raises:
            IndexError: if no tag contains the marker and there is no tag at
                the positional index either.
        """
        index = self._REVIEW_SCRIPT_INDEX
        positional = script_tags[index] if len(script_tags) > index else None
        if positional is not None and self._REVIEW_MARKER in str(positional):
            return positional

        for tag in script_tags:
            if self._REVIEW_MARKER in str(tag):
                return tag

        if positional is None:
            raise IndexError(
                f"No script tag containing {self._REVIEW_MARKER!r} found and "
                f"the page has only {len(script_tags)} script tags"
            )
        return positional

    def parse_json_data(self, html_string):
        """Strip the surrounding tag from ``html_string`` and wrap its text.

        Args:
            html_string: markup of a single script element.

        Returns:
            dict: ``{"TheData": <text content>}``; the text is an empty
            string when the element has no content.
        """
        class MyHTMLParser(HTMLParser):
            def __init__(self):
                super().__init__()
                self.data = []

            def handle_data(self, data):
                self.data.append(data)

        parser = MyHTMLParser()
        parser.feed(html_string)
        parser.close()
        # The parser may deliver the content in several chunks (for example
        # around a "</" sequence), so all of them are joined back together.
        return {"TheData": "".join(parser.data)}

    @staticmethod
    def _value_until_quote(text, start):
        """Return ``text`` from ``start`` up to the next double quote.

        If there is no further double quote, the rest of ``text`` is returned.
        """
        end = text.find('"', start)
        return text[start:] if end == -1 else text[start:end]

    def _values_after(self, key, reviews_data):
        """Collect the value following each occurrence of ``key``.

        Each value is assumed to start three characters after the key (the
        ``":"`` of compact JSON) and to run up to the next double quote.
        """
        return [
            self._value_until_quote(reviews_data, match.end() + 3)
            for match in re.finditer(key, reviews_data)
        ]

    def extract_review_data(self, json_data):
        """Extract all review fields from the dict made by ``parse_json_data``.

        Returns:
            tuple: ``(user_links, user_names, dates, review_bodies, ratings)``,
            each a list of strings. The lists are extracted independently of
            each other and are not guaranteed to have equal length.
        """
        # The dict has a single entry whose value is the raw script text.
        reviews_data = next(iter(json_data.values()))

        user_links = re.findall(
            r'https://uk\.trustpilot\.com/users/[0-9a-z/]*', reviews_data
        )
        user_names = self.extract_user_names(reviews_data)
        dates = self.extract_dates(reviews_data)
        review_bodies = self.extract_review_bodies(reviews_data)
        ratings = self.extract_ratings(reviews_data)

        return user_links, user_names, dates, review_bodies, ratings

    def extract_user_names(self, reviews_data):
        """Return the value following every ``name`` occurrence, in full."""
        return self._values_after("name", reviews_data)

    def extract_dates(self, reviews_data):
        """Return the value following every ``datePublished`` occurrence."""
        return self._values_after("datePublished", reviews_data)

    def extract_review_bodies(self, reviews_data):
        """Return the text between each ``reviewBody`` and its ``reviewRating``.

        Each ``reviewBody`` is paired with the first ``reviewRating`` that
        comes after it, so an entry missing one of the two markers does not
        shift the pairing of the others. For compact JSON of the form
        ``"reviewBody":"...","reviewRating"`` the returned slice includes the
        double quotes surrounding the body text.
        """
        bodies = []
        for body_match in re.finditer("reviewBody", reviews_data):
            rating_start = reviews_data.find("reviewRating", body_match.end())
            if rating_start == -1:
                # No closing marker left; nothing more can be delimited.
                break
            bodies.append(reviews_data[body_match.end() + 2:rating_start - 2])
        return bodies

    def extract_ratings(self, reviews_data):
        """Return the value following every ``ratingValue`` occurrence."""
        return self._values_after("ratingValue", reviews_data)

    def extract_housing_association(self, url):
        """Derive a short organisation name from a Trustpilot review URL.

        The path segment after ``/review/`` is taken as a domain name; its
        first label is returned, skipping a leading ``www``. An empty string
        is returned when the URL has no ``/review/`` segment.
        """
        domain = urlsplit(url).path.partition("/review/")[2].split("/")[0]
        labels = domain.split(".")
        if labels[0] == 'www' and len(labels) > 1:
            return labels[1]
        return labels[0]

    def scrape_reviews(self):
        """Scrape every URL in ``self.full_url_list`` into one DataFrame.

        Returns:
            pandas.DataFrame: one row per review with the columns
            HousingAssociation, DataSource, User, UserURL, DatePublished,
            ReviewBody and RatingValue.
        """
        all_reviews = []

        for url in self.full_url_list:
            json_data = self.fetch_reviews(url)
            user_links, user_names, dates, review_bodies, ratings = self.extract_review_data(json_data)
            housing_association = [self.extract_housing_association(url)] * len(user_links)
            data_source = [url] * len(user_links)

            # zip stops at the shortest list, so surplus entries in any
            # single field are dropped rather than producing partial rows.
            all_reviews.extend(zip(housing_association, data_source, user_names, user_links, dates, review_bodies, ratings))

        return pd.DataFrame(
            all_reviews,
            columns=['HousingAssociation', 'DataSource', 'User', 'UserURL', 'DatePublished', 'ReviewBody', 'RatingValue'],
        )

    def save_reviews_to_csv(self, output_path):
        """Scrape all pages and write the result to ``output_path`` as CSV."""
        df = self.scrape_reviews()
        df.to_csv(output_path, index=False)


# Example usage: each entry pairs a Trustpilot URL with the CSV file its
# reviews are written to; the entries are processed in the order listed.
for url, output_path in (
    # URL without 'page' parameter
    ('https://uk.trustpilot.com/review/placesforpeople.co.uk', "YOURpath/placesforpeople.csv"),
    # URL with 'sort' but no 'page'
    ('https://uk.trustpilot.com/review/www.myclarionhousing.com?sort=recency', "YOURpath/myclarionhousing.csv"),
    # URL with 'page' and 'sort'
    ('https://uk.trustpilot.com/review/www.jigsawhomes.org.uk?page=8&sort=recency', "YOURpath/jigsawhomes.csv"),
):
    scraper = TrustpilotScraper(url)
    scraper.save_reviews_to_csv(output_path)