### Extract the important content from the reviews and perform sentiment analysis

In [45]:
import re
import ast
import ollama
import pandas as pd

In [46]:
## Load the scraped review data from CSV and preview the first rows

scraped_csv_path = "info/carwow_scraped_data_full.csv"
df = pd.read_csv(scraped_csv_path)
df.head()

Unnamed: 0,url,title,price,rating,tag,review
0,https://www.carwow.co.uk/mg/mg-4,MG4 EV REVIEW & PRICES,"€37,142",8/10,new,Is the MG4 EV a good car? Is the MG4 EV a good...
1,https://www.carwow.co.uk/mg/5,MG5 EV REVIEW & PRICES,"€37,738",6/10,new,Is the MG5 EV a good car? Is the MG5 EV a good...
2,https://www.carwow.co.uk/mg/zs-ev,MG ZS EV Review & Prices,"€38,616",7/10,new,Is the MG ZS EV a good car? Is the MG ZS EV a ...
3,https://www.carwow.co.uk/mg/mg3,MG3 REVIEW & PRICES,"€22,224",9/10,new,Is the MG3 a good car? Is the MG3 a good car? ...
4,https://www.carwow.co.uk/mg/gs,MG GS Review and Prices,"€21,561",5/10,new,The MG GS is a medium-sized family SUV that’s ...


In [47]:
## Inspect a single record: all fields of the first row
first_row = df.iloc[0]
first_row

url                        https://www.carwow.co.uk/mg/mg-4
title                                MG4 EV REVIEW & PRICES
price                                               €37,142
rating                                                 8/10
tag                                                     new
review    Is the MG4 EV a good car? Is the MG4 EV a good...
Name: 0, dtype: object

In [48]:
## Load the system information text that is prepended to every prompt

# Explicit UTF-8: without it open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII prompt text on some systems.
with open("info/system_information.txt", "r", encoding="utf-8") as f:
    system_information = f.read()

In [49]:
## Build a dict of review records, keyed by DataFrame index, to iterate over later

reviews = {
    row_idx: {
        'url': record['url'],
        'title': record['title'],
        # The price is stringified and every comma is swapped for a dot
        'price': str(record['price']).replace(',', '.'),
        'rating': record['rating'],
        'tag': record['tag'],
        'review': record['review'],
    }
    for row_idx, record in df.iterrows()
}

In [50]:
## Candidate models for the sentiment analysis; the entry at index 0 is the one used

model_llm = [
    "llama3.2:3b-instruct-q8_0",
    "deepseek-r1:7b",
    "mistral:7b-instruct",
    "deepseek-r1:8b",
    "llama3.1:8b-instruct-q5_K_M",
    "gemma3:4b",
    "gemma3:4b-it-q8_0",
    "llama3.1:8b-instruct-q2_K",
    "llama3.2:3b-instruct-q5_K_M",
    "gemma3:1b",
    "llama3.1:8b-instruct-fp16",
]
model_llm[0]

'llama3.2:3b-instruct-q8_0'

In [51]:
def extract_insights_ollama(review_dict, model=None):
    """Send one classification request to an Ollama chat model and return its reply.

    The prompt is built from the global ``system_information`` text, a fixed
    reminder to output only the dictionary inside triple backticks, and
    ``str(review_dict)``.

    Args:
        review_dict: The content to classify. It is passed through ``str()``,
            so both a review dict and an already-built prompt string work.
        model: Name of the model to query. When omitted, ``model_llm[0]`` is
            used (looked up at call time), which is the original behaviour.

    Returns:
        The text content of the reply message, with surrounding whitespace
        stripped.
    """
    if model is None:
        model = model_llm[0]

    prompt = (
        system_information
        + "\n\n"
        + "Please classify the following review according to the instructions above. Remember: ONLY output the dictionary in the exact format, enclosed in triple backticks.\n\n"
        + str(review_dict)
    )
    response = ollama.chat(
        model=model,
        messages=[
            {'role': 'user', 'content': prompt}
        ],
    )
    return response['message']['content'].strip()

In [52]:
## Function to extract and clean the response

def clean_response(response):
    """Normalise a model reply so it can be parsed as a Python dict literal.

    Three rewrites are applied, in this order:

    1. ``'Price': <currency><number>`` becomes ``'Price': <number>``: an
       optional leading ``$``, ``£`` or ``€`` is dropped and thousands-separator
       commas are removed from the number.
    2. ``'Overall': N/10`` becomes ``'Overall': N``.
    3. A comma directly before a closing brace (optionally separated by
       whitespace) is removed.

    Only single-quoted ``'Price'`` / ``'Overall'`` keys are rewritten.

    Args:
        response: The reply text to clean.

    Returns:
        The cleaned text.
    """
    # Remove <think>...</think> blocks (including multiline) -- currently disabled
    # response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

    # The euro sign is accepted alongside $ and £: the scraped prices use it,
    # and an unstripped currency symbol makes the dict literal unparsable.
    response = re.sub(
        r"'Price':\s*[$£€]?([0-9]{1,3}(?:,[0-9]{3})*(?:\.\d+)?|\d+\.?\d*)",
        lambda m: f"'Price': {m.group(1).replace(',', '')}",
        response
    )
    # Keep only the numerator of an "N/10" rating
    response = re.sub(r"'Overall':\s*([0-9]+(?:\.\d+)?)\s*/\s*10", r"'Overall': \1", response)
    # Drop a trailing comma before a closing brace
    response = re.sub(r',\s*}', '}', response)
    return response
    
def extract_backtick_block(s):
    """Return the content of the first triple-backtick fenced block in ``s``.

    A fence language tag (``json``, ``python`` or ``py``, any case) directly
    after the opening backticks is dropped, so a reply fenced as ```` ```json ````
    yields just the payload instead of text starting with ``json``.

    Args:
        s: The raw reply text.

    Returns:
        The stripped content of the first complete fenced block, or the whole
        of ``s`` stripped when it contains no complete fenced block.
    """
    m = re.search(
        # \b stops the tag from eating the start of a longer word (e.g. "jsonify")
        r'```(?:(?:json|python|py)\b\s*)?(.*?)```',
        s,
        flags=re.DOTALL | re.IGNORECASE,
    )
    return m.group(1).strip() if m else s.strip()

In [53]:
MAX_RETRIES = 10      # retries allowed after the first attempt
EXPECTED_FIELDS = 17  # number of keys a valid insights dict must contain

# Correction text sent on every retry. extract_insights_ollama() already
# prepends system_information, so it is deliberately not repeated here.
RETRY_INSTRUCTIONS = (
    "You did NOT return the dictionary in the exact format. ONLY output the dictionary below, enclosed in triple backticks. If you do not, your answer will be discarded.\n"
    + """```{
                'Brand': brand name, 
                'Model': model name, 
                'Drive': sentiment value, 
                'Quality of interior': sentiment value, 
                'Infotainment system': sentiment value, 
                'Comfort': sentiment value, 
                'Performance': sentiment value, 
                'Handling': sentiment value, 
                'Practicality': sentiment value, 
                'Reliability': sentiment value, 
                'Safety': sentiment value, 
                'Quality of construction': sentiment value, 
                'Noise': sentiment value, 
                'Engine': sentiment value, 
                'Price': price value, 
                'Tag': tag value,
                'Overall': overall value
                }```\n"""
)

results = []
failed_reviews = []  # indices of reviews that never produced a valid dict

for i, review in reviews.items():
    print(f"Processing {i+1}/{len(reviews)}…")
    print(f"Review: {review}")

    for retries in range(MAX_RETRIES + 1):
        if retries == 0:
            raw = extract_insights_ollama(review)
        else:
            # On retry, send the strong instructions and the format example
            raw = extract_insights_ollama(
                RETRY_INSTRUCTIONS + f"\nClassify the following review: {review}"
            )

        # Apply the same extraction and normalisation on every attempt, not
        # only the first one.
        response = clean_response(extract_backtick_block(raw))
        print(f"Raw response for review {i} (retry {retries}): {response}")
        try:
            insights = ast.literal_eval(response)
        except Exception as e:
            print(f"Error processing review {i} (retry {retries}): {e}")
            insights = {}

        # literal_eval can return any literal (number, list, ...); treat
        # anything that is not a dict as empty so len() below cannot fail.
        if not isinstance(insights, dict):
            insights = {}

        if len(insights) == EXPECTED_FIELDS:
            results.append(insights)
            break
        print(f"Invalid or incomplete insights for review {i} (fields: {len(insights)}). Retrying…")
    else:
        # Loop ended without a break: every attempt failed
        failed_reviews.append(i)
        print(f"Failed to process review {i} after {MAX_RETRIES} retries. Skipping.")


Processing 1/206…
Raw response for review 0 (retry 0): {
  "make": "MG",
  "model": "MG4",
  "year": "2022",
  "transmission": "Automatic",
  "engine": {
    "type": "Electric",
    "power": [
      {"type": "SE", "power": 170, "battery": "51kWh"},
      {"type": "SE Long Range", "power": 203, "battery": "64kWh"},
      {"type": "Trophy Long Range", "power": 241, "battery": "77kWh"}
    ]
  },
  "fuel economy": {
    "city": {
      "SE": "200 miles per charge",
      "SE Long Range": "281 miles per charge",
      "Trophy Long Range": "270 miles per charge"
    },
    "highway": {
      "SE": "230 miles per charge",
      "SE Long Range": "300 miles per charge",
      "Trophy Long Range": "310 miles per charge"
    }
  },
  "seats": 5,
  "payload capacity": 800 kg,
  "warranty": "7 years/80,000 miles",
  "price": {
    "SE": "£25,995",
    "SE Long Range": "£27,495",
    "Trophy Long Range": "£31,495"
  },
  "safety features": [
    "MG Pilot (standard)",
    "Adaptive cruise control (

KeyboardInterrupt: 

In [None]:
import json

## For each review we ask the model for its analysis, retrying at most
## MAX_RETRIES times until the reply parses into a dict with EXPECTED_FIELDS keys.

MAX_RETRIES = 10
EXPECTED_FIELDS = 17

# Format example appended to every retry prompt
RETRY_FORMAT_EXAMPLE = """```json
                {
                "Brand": brand name, 
                "Model": model name, 
                "Drive": sentiment value, 
                "Quality of interior": sentiment value, 
                "Infotainment system": sentiment value, 
                "Comfort": sentiment value, 
                "Performance": sentiment value, 
                "Handling": sentiment value, 
                "Practicality": sentiment value, 
                "Reliability": sentiment value, 
                "Safety": sentiment value, 
                "Quality of construction": sentiment value, 
                "Noise": sentiment value, 
                "Engine": sentiment value, 
                "Price": price value,
                "tag": "new" or "used", 
                "Overall": overall value
                }
                ```"""


def _parse_json_response(response):
    """Parse a model reply as a JSON object.

    A leading ``json`` tag (left over from a fenced block) is dropped first.

    Raises:
        ValueError: if the text is not valid JSON, or if it is valid JSON but
            not an object (e.g. a list, number or null), so callers can always
            take ``len()`` of the result.
    """
    if response.strip().startswith("json"):
        response = response.split("json", 1)[-1].strip()
    parsed = json.loads(response)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


results = []

for i, review in reviews.items():
    print(f"Processing {i+1}/{len(reviews)}…")
    print(f"Review: {review}")
    response = extract_backtick_block(extract_insights_ollama(review))

    print(f"Response: {response}")

    result_dict = {}
    try:
        result_dict = _parse_json_response(response)
        print(f"Result dict: {result_dict}")
    except Exception as e:
        print(f"Before retry: Errore nella conversione: {e}")

    retries = 0
    # Strict "<" so that exactly MAX_RETRIES retries are made
    while len(result_dict) != EXPECTED_FIELDS and retries < MAX_RETRIES:
        print(f"Invalid format: {len(result_dict)}. Retrying…")
        retry_prompt = (
            f"You returned {len(result_dict)} fields instead of 17. Please return the output in the exact format:\n"
            + RETRY_FORMAT_EXAMPLE
            + f"\nPerform the analysis again for the following review: {review}"
        )

        response = extract_backtick_block(extract_insights_ollama(retry_prompt))

        try:
            result_dict = _parse_json_response(response)
        except Exception as e:
            # Keep the previous result_dict when this reply cannot be parsed
            print(f"After retry: Errore nella conversione: {e}")

        retries += 1
        print(f"Retries: {retries}")

    # NOTE: "Price" and "Overall" are stored exactly as the model returned
    # them; no type conversion is applied here.
    print(result_dict)

    # Appended even when still invalid, so results has one entry per review
    results.append(result_dict)
    print("\n")

In [None]:
## Reference example of one result record; its keys are reused later as the DataFrame columns

# Sentiment label for each reviewed aspect
_aspect_sentiments = {
    "Drive": "Medium",
    "Quality of interior": "Positive",
    "Infotainment system": "Positive",
    "Comfort": "Negative",
    "Performance": "Medium",
    "Handling": "Not mentioned",
    "Practicality": "Positive",
    "Reliability": "Not mentioned",
    "Safety": "Negative",
    "Quality of construction": "Medium",
    "Noise": "Negative",
    "Engine": "Medium",
}

# Identification first, then the aspect sentiments, then price / tag / overall
test_dict = {
    "Brand": "MG",
    "Model": "ZS",
    **_aspect_sentiments,
    "Price": 18.605,
    "tag": "new",
    "Overall": 6,
}

len(test_dict)

In [None]:
## We use the test_dict keys as the columns of the DataFrame and build it in one step.
# Constructing from the whole list of dicts avoids growing the frame row by row.
# Keys missing from a result become NaN; keys that are not in test_dict are dropped.
df_new = pd.DataFrame(results, columns=list(test_dict.keys()))

df_new

In [None]:
## Write the results DataFrame to CSV, leaving out the index column.

output_csv_path = "info/car_review_new.csv"
df_new.to_csv(output_csv_path, index=False)