### Extract the important content from each review and perform sentiment analysis

In [1]:
import re
import ast
import ollama
import pandas as pd

In [2]:
## Load the scraped review data from CSV and preview the first rows

scraped_csv_path = "info/carwow_scraped_data_full.csv"
df = pd.read_csv(scraped_csv_path)
df.head()

Unnamed: 0,url,title,price,rating,tag,review
0,https://www.carwow.co.uk/mg/mg-4,MG4 EV REVIEW & PRICES,"€37,142",8/10,new,Is the MG4 EV a good car? Is the MG4 EV a good...
1,https://www.carwow.co.uk/mg/5,MG5 EV REVIEW & PRICES,"€37,738",6/10,new,Is the MG5 EV a good car? Is the MG5 EV a good...
2,https://www.carwow.co.uk/mg/zs-ev,MG ZS EV Review & Prices,"€38,616",7/10,new,Is the MG ZS EV a good car? Is the MG ZS EV a ...
3,https://www.carwow.co.uk/mg/mg3,MG3 REVIEW & PRICES,"€22,224",9/10,new,Is the MG3 a good car? Is the MG3 a good car? ...
4,https://www.carwow.co.uk/mg/gs,MG GS Review and Prices,"€21,561",5/10,new,The MG GS is a medium-sized family SUV that’s ...


In [3]:
## Show the first row as an example of a single record
df.iloc[0, :]

url                        https://www.carwow.co.uk/mg/mg-4
title                                MG4 EV REVIEW & PRICES
price                                               €37,142
rating                                                 8/10
tag                                                     new
review    Is the MG4 EV a good car? Is the MG4 EV a good...
Name: 0, dtype: object

In [4]:
## Load the system prompt text that is passed to the model as the "system" message

with open("info/system_information.txt") as prompt_file:
    system_information = prompt_file.read()

In [5]:
## Build a dict of review records, keyed by DataFrame index, to iterate over later

review_fields = ['url', 'title', 'price', 'rating', 'tag', 'review']

reviews = {}
for idx, row in df.iterrows():
    record = {field: row[field] for field in review_fields}
    # The price is stored as text with ',' swapped for '.'
    record['price'] = str(record['price']).replace(',', '.')
    reviews[idx] = record

In [6]:
## Candidate models for the sentiment analysis; the first entry is the one used

model_llm = [
    "llama3.1:8b",
    "qwen3:4b",
    "llama3.2:3b-instruct-q8_0",
    "deepseek-r1:7b",
    "mistral:7b-instruct",
    "deepseek-r1:8b",
    "llama3.1:8b-instruct-q5_K_M",
    "gemma3:4b",
    "gemma3:4b-it-q8_0",
    "llama3.1:8b-instruct-q2_K",
    "llama3.2:3b-instruct-q5_K_M",
    "gemma3:1b",
    "llama3.1:8b-instruct-fp16",
]
model_llm[0]

'llama3.1:8b'

In [7]:
import json

def extract_insights_ollama(review_dict):
    """Send one review (or a ready-made prompt) to the chat model and return its reply text.

    Args:
        review_dict: Either a JSON-serialisable review record, which is
            serialised to pretty-printed JSON before sending, or a prompt
            string, which is sent verbatim.

    Returns:
        The ``content`` of the message in the model's response.
    """
    if isinstance(review_dict, str):
        # A prompt that is already text must not go through json.dumps: that
        # would wrap it in quotes and escape every newline as a literal "\n".
        prompt_text = review_dict
    else:
        prompt_text = json.dumps(review_dict, ensure_ascii=False, indent=2)

    response = ollama.chat(
        model=model_llm[0],
        messages=[
            {"role": "system", "content": system_information},
            {"role": "user", "content": prompt_text}
        ],
    )

    return response['message']['content']



In [8]:
## Function to extract and clean the response

def clean_response(response):
    """Normalise a model reply so it can be parsed as a Python dict literal.

    Three textual fixes are applied, in order:
      1. ``'Price': <currency><number>`` becomes a bare number: an optional
         leading ``$``, ``£`` or ``€`` is dropped and thousands-separator
         commas are removed.
      2. ``'Overall': N/10`` becomes ``'Overall': N``.
      3. A comma directly before a closing ``}`` is removed.

    Args:
        response: The reply text to clean.

    Returns:
        The cleaned text.
    """
    # Stripping <think>...</think> blocks (including multiline) is currently disabled:
    # response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    response = re.sub(
        # '€' is accepted as well as '$' and '£': the scraped prices are in euros.
        r"'Price':\s*[$£€]?([0-9]{1,3}(?:,[0-9]{3})*(?:\.\d+)?|\d+\.?\d*)",
        lambda m: f"'Price': {m.group(1).replace(',', '')}",
        response
    )
    response = re.sub(r"'Overall':\s*([0-9]+(?:\.\d+)?)\s*/\s*10", r"'Overall': \1", response)
    response = re.sub(r',\s*}', '}', response)
    return response
    
def extract_backtick_block(s):
    """Return the contents of the first triple-backtick block in ``s``.

    An optional language tag on the opening fence line (for example
    ```` ```python ````) is skipped so that only the block's payload is
    returned. When ``s`` contains no fenced block, the whole string is used.

    Args:
        s: Text that may contain a triple-backtick fenced block.

    Returns:
        The block contents (or all of ``s``), with surrounding whitespace stripped.
    """
    m = re.search(r'```(?:[A-Za-z0-9_+-]*[ \t]*\r?\n)?(.*?)```', s, flags=re.DOTALL)
    return m.group(1).strip() if m else s.strip()

In [None]:
import ast

## Run the analysis for every review, re-prompting when the reply is not a valid dictionary

EXPECTED_KEY_COUNT = 17  # number of keys in the dictionary requested from the model
MAX_RETRIES = 10         # re-prompt attempts per review before giving up

# Static part of the retry prompt; it never changes, so it is built once here
# instead of on every retry.
RETRY_FORMAT_REMINDER = (
    "You did NOT return the dictionary in the exact format. ONLY output the dictionary below, enclosed in triple backticks. If you do not, your answer will be discarded.\n"
    + """```{
                'Brand': brand name, 
                'Model': model name, 
                'Drive': sentiment value, 
                'Quality of interior': sentiment value, 
                'Infotainment system': sentiment value, 
                'Comfort': sentiment value, 
                'Performance': sentiment value, 
                'Handling': sentiment value, 
                'Practicality': sentiment value, 
                'Reliability': sentiment value, 
                'Safety': sentiment value, 
                'Quality of construction': sentiment value, 
                'Noise': sentiment value, 
                'Engine': sentiment value, 
                'Price': price value, 
                'Tag': tag value,
                'Overall': overall value
                }```\n"""
)


def parse_model_reply(raw, error_label="Parsing error"):
    """Turn a raw model reply into a dict, or return {} when that is not possible.

    The reply is reduced to its backtick block, cleaned, and parsed with
    ast.literal_eval. Parse failures and non-dict results are reported with a
    printed message prefixed by ``error_label``.
    """
    candidate = clean_response(extract_backtick_block(raw))
    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"{error_label}: {e}")
        return {}
    if not isinstance(parsed, dict):
        # A valid non-dict literal (e.g. a number) would break the len() check below.
        print(f"{error_label}: expected a dict, got {type(parsed).__name__}")
        return {}
    return parsed


results = []

# enumerate() drives the progress counter so it does not depend on the keys of
# `reviews` being consecutive integers starting at 0.
for position, (review_id, review) in enumerate(reviews.items(), start=1):
    print(f"Processing {position}/{len(reviews)}…")
    result_dict = parse_model_reply(extract_insights_ollama(review))

    retries = 0
    while len(result_dict) != EXPECTED_KEY_COUNT and retries < MAX_RETRIES:
        print(f"Invalid format ({len(result_dict)} keys). Retrying…")
        retry_prompt = (
            system_information
            + "\n\n"
            + RETRY_FORMAT_REMINDER
            + f"\nPerform the analysis again for the following review: {review}"
        )
        result_dict = parse_model_reply(
            extract_insights_ollama(retry_prompt),
            error_label="Retry parsing error",
        )
        retries += 1

    if len(result_dict) != EXPECTED_KEY_COUNT:
        print(f"Giving up on review {review_id} after {retries} retries; storing the last result as-is.")

    # Always append, so results stays aligned one-to-one with reviews.
    results.append(result_dict)

Processing 1/206…
Parsing error: unterminated string literal (detected at line 1) (<unknown>, line 1)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: unterminated triple-quoted string literal (detected at line 16) (<unknown>, line 5)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retrying…
Retry parsing error: invalid syntax (<unknown>, line 2)
Invalid format (0 keys). Retry

In [None]:
## Example of the expected result dictionary (its keys are reused below as the DataFrame columns)

example_items = [
    ('Brand', 'MG'),
    ('Model', 'ZS'),
    ('Drive', 'Medium'),
    ('Quality of interior', 'Positive'),
    ('Infotainment system', 'Positive'),
    ('Comfort', 'Negative'),
    ('Performance', 'Medium'),
    ('Handling', 'Not mentioned'),
    ('Practicality', 'Positive'),
    ('Reliability', 'Not mentioned'),
    ('Safety', 'Negative'),
    ('Quality of construction', 'Medium'),
    ('Noise', 'Negative'),
    ('Engine', 'Medium'),
    ('Price', 18.605),
    ('tag', 'new'),
    ('Overall', 6),
]
test_dict = dict(example_items)

len(test_dict)

In [None]:
## Build the results DataFrame in one step, using the test_dict keys as the column names.
column_names = list(test_dict.keys())

# Match result keys to columns case-insensitively, so that e.g. a 'Tag' key in a
# result still lands in the 'tag' column instead of being dropped.
canonical_name = {name.lower(): name for name in column_names}

records = []
for result_dict in results:
    if not isinstance(result_dict, dict):
        # Anything that is not a dict becomes an empty (all-NaN) row.
        result_dict = {}
    records.append({
        canonical_name.get(str(key).lower(), key): value
        for key, value in result_dict.items()
    })

# Building from a list of records avoids growing the frame row by row; keys
# missing from a record become NaN and keys that are not columns are ignored.
df_new = pd.DataFrame(records, columns=column_names)

df_new

In [None]:
## Write the results DataFrame to CSV, without the index column.

output_csv_path = "info/car_review_new.csv"
df_new.to_csv(output_csv_path, index=False)