# Reviews Text Extractor
This notebook runs downstream of the scraping step: it extracts the title, text, and date of each review from the scraped `html` bodies and saves them to a CSV file.

In [172]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

In [173]:
def clean_reviews(html: str) -> list[dict[str, str]]:
    """Extract the title, body text and date of every review in an HTML page.

    Titles, contents and dates are collected in three independent passes over
    the document and then paired up by position.

    Args:
        html: HTML markup to parse (parsed with the built-in ``html.parser``).

    Returns:
        A list with one dict per review, each holding the keys ``'title'``,
        ``'content'`` and ``'date'``. A title or content whose expected nested
        ``<span>`` is missing is returned as an empty string. The date is the
        text following the last ``'on '`` in the date element; if the element
        contains no ``'on '`` its whole text is returned unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    titles = []
    contents = []
    dates = []

    # Extract the titles: the title text is taken from the third <span>
    # nested inside the title link.
    for a in soup.find_all("a", class_="review-title-content"):
        span_tags = a.find_all("span")
        titles.append(span_tags[2].get_text(strip=True)
                      if len(span_tags) >= 3 else '')

    # Extract the content: the body text is taken from the first nested <span>.
    for span in soup.find_all("span", class_="review-text-content"):
        span_tags = span.find_all("span")
        contents.append(span_tags[0].get_text(
            strip=True) if len(span_tags) >= 1 else '')

    # Extract the date. Take everything after the LAST 'on ' so that a word
    # ending in "on" before the date (e.g. a place name) cannot swallow it,
    # and so that text without 'on ' no longer raises IndexError.
    for span in soup.find_all("span", class_="review-date"):
        date_text = span.get_text(strip=True)
        dates.append(date_text.rpartition('on ')[2])

    # NOTE(review): the three lists are built independently, and zip() stops at
    # the shortest one. If a review lacks one of the three elements, later
    # reviews may be paired with the wrong fields -- verify against the page
    # structure.
    return [{'title': t, 'content': c, 'date': d} for t, c, d in zip(titles, contents, dates)]

In [174]:
# Load the scraped pages. The file is decoded explicitly as UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('reviews_raw.json', encoding='utf-8') as f:
    reviews_raw = json.load(f)

In [175]:
# Flatten every page into one row per review. The part of the key before the
# first ':' is stored in the 'stars' column.
review_rows = []
for key, html in reviews_raw.items():
    stars = key.partition(':')[0]
    for review in clean_reviews(html):
        review_rows.append({
            'stars': stars,
            'timestamp': review['date'],
            'title': review['title'],
            'content': review['content'],
        })

df = pd.DataFrame(review_rows)
# Convert the extracted date strings into pandas datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [176]:
# Preview the first five rows of the cleaned reviews.
df.head(n=5)

Unnamed: 0,stars,timestamp,title,content
0,1,2024-01-08,Terrible,I've had my printer for a year. It stopped wor...
1,1,2024-12-19,Instant ink is a JOKE,I will never purchase another HP product again...
2,1,2024-09-10,Avoid,The printer itself is decent. HP promotes thei...
3,1,2024-12-30,a bit suspicious of the 3 month ink supply,I've had a series of errors with the printer w...
4,1,2024-11-05,"If you want to waste time, buy this one!","This printer frequently ""resets"" itself and no..."


In [177]:
# Persist the cleaned reviews; the row index is written as the first column.
df.to_csv(path_or_buf='reviews_clean.csv', index=True)