### Extract the important content and perform sentiment analysis

In [62]:
import re
import ast
import ollama
import pandas as pd

In [63]:
## Load the scraped Carwow reviews from CSV and preview the first rows

scraped_csv_path = "/home/guidojobinformatica/sviluppo/rossato_job/info/carwow_scraped_data.csv"
df = pd.read_csv(scraped_csv_path)
df.head()

Unnamed: 0,url,title,price,rating,review
0,https://www.carwow.co.uk/mg/mg3,MG3 REVIEW & PRICES,"£16,995",9/10,The MG3 used to be a bit of a joke - the old o...
1,https://www.carwow.co.uk/mg/hs,MG HS Review & Prices,"£25,995",8/10,The MG HS has at points been the best-selling ...
2,https://www.carwow.co.uk/mg/cyberster,MG Cyberster Review & Prices,"£54,995",7/10,"The MG Cyberster is, arguably, the first drop-..."
3,https://www.carwow.co.uk/mg/zs/2018,MG ZS Review & Prices,"£18,605",5/10,If you’re looking for a small family SUV that’...
4,https://www.carwow.co.uk/mg/mg-4,MG4 EV REVIEW & PRICES,"£26,995",8/10,"MG pulled an absolute blinder with the MG4, an..."


In [64]:
## Example of a single row: the first record of `df`, displayed as a pandas Series
df.iloc[0]

url                         https://www.carwow.co.uk/mg/mg3
title                                   MG3 REVIEW & PRICES
price                                               £16,995
rating                                                 9/10
review    The MG3 used to be a bit of a joke - the old o...
Name: 0, dtype: object

In [65]:
## Load the text that is passed to the model as its system message

system_information_path = "/home/guidojobinformatica/sviluppo/rossato_job/info/system_information.txt"
with open(system_information_path, "r") as prompt_file:
    system_information = prompt_file.read()

In [66]:
## Flatten every row of `df` into one comma-separated string, so the
## reviews can be iterated over as plain text later on

REVIEW_COLUMNS = ['url', 'title', 'price', 'rating', 'review']

reviews = []
for _, row in df.iterrows():
    # `review` stays bound after the loop and then holds the last row's string.
    review = "{}, {}, {}, {}, {}".format(*(row[column] for column in REVIEW_COLUMNS))
    reviews.append(review)

reviews[:5]

['https://www.carwow.co.uk/mg/mg3, MG3 REVIEW & PRICES, £16,995, 9/10, The MG3 used to be a bit of a joke - the old one was the lowest-ranked car on Carwow - but the new model aims to change all that. Now, instead of being antiquated, it’s right up there with the best small cars for performance, efficiency and equipment. Like an air fryer from the middle of Lidl, the MG3’s feature set is as good as any of its alternatives and it’s not immediately clear where the money’s been saved to bring it in at such a low price. Now that it’s a full self-charging hybrid car , the MG3 no longer targets the very cheapest on the market like the Dacia Sandero or Citroen C3 Origin . Instead, it’s more like a cut-price alternative to the Renault Clio E-Tech or Toyota Yaris - a halfway house for people who want the most efficient small car possible, but either don’t want or can’t have a fully electric model. The MG3 won’t win awards for its practicality - it’s not cramped, but it’s no better than average 

In [67]:
## Candidate Ollama models for the sentiment analysis; the last line of the
## cell displays the entry at index 4

model_llm = [
    "deepseek-r1:8b",
    "llama3.1:8b-instruct-q5_K_M",
    "gemma3:4b",
    "gemma3:4b-it-q8_0",
    "llama3.1:8b-instruct-q2_K",
    "llama3.2:3b-instruct-q5_K_M",
    "gemma3:1b",
]
model_llm[4]

'llama3.1:8b-instruct-q2_K'

In [68]:
## Set the function to call the model

def extract_insights_ollama(text):
    """Send `text` to the selected Ollama model and return its answer.

    The module-level `system_information` is sent as the system message and
    `text` as the user message; the model used is `model_llm[4]`.

    Returns the content of the reply message with surrounding whitespace
    stripped.
    """
    conversation = [
        {'role': 'system', 'content': system_information},
        {'role': 'user', 'content': text},
    ]
    reply = ollama.chat(model=model_llm[4], messages=conversation)
    return reply['message']['content'].strip()

In [69]:
## Just for example: a hand-written result with the 16 fields of one analysis

# Identification of the car.
test_dict = {'Brand': 'MG', 'Model': 'ZS'}

# One sentiment value per reviewed aspect (insertion order is the column order).
test_dict.update({
    'Drive': 'Medium',
    'Quality of interior': 'Positive',
    'Infotainment system': 'Positive',
    'Comfort': 'Negative',
    'Performance': 'Medium',
    'Handling': 'Not mentioned',
    'Practicality': 'Positive',
    'Reliability': 'Not mentioned',
    'Safety': 'Negative',
    'Quality of construction': 'Medium',
    'Noise': 'Negative',
    'Engine': 'Medium',
})

# Numeric fields.
# NOTE(review): 18.605 presumably stands for a price of 18,605 - confirm.
test_dict.update({'Price': 18.605, 'Overall': 6})

len(test_dict)

16

In [72]:
## Function to extract and clean the response

def clean_response(response):
    """Normalise the model's dict-like text so it can be parsed as a literal.

    Three rewrites are applied, in this order:
      1. the number after 'Price': loses an optional leading $ or £ and the
         commas used as thousands separators;
      2. an 'Overall': value written as "N/10" is reduced to "N";
      3. a comma directly before a closing brace is removed.

    Returns the rewritten string.
    """
    price_pattern = r"'Price':\s*[$£]?([0-9]{1,3}(?:,[0-9]{3})*(?:\.\d+)?|\d+\.?\d*)"
    overall_pattern = r"'Overall':\s*([0-9]+(?:\.\d+)?)\s*/\s*10"
    trailing_comma_pattern = r',\s*}'

    def _bare_price(match):
        # Keep only the digits (and decimal part) of the captured amount.
        amount = match.group(1).replace(',', '')
        return f"'Price': {amount}"

    cleaned = re.sub(price_pattern, _bare_price, response)
    cleaned = re.sub(overall_pattern, r"'Overall': \1", cleaned)
    return re.sub(trailing_comma_pattern, '}', cleaned)
    
def extract_backtick_block(s):
    """Return the contents of the first triple-backtick block found in `s`.

    If the opening fence is immediately followed by a language tag on its own
    line (for example ```python or ```json), the tag is skipped so that only
    the payload is returned; otherwise the tag would end up in the text handed
    to the parser and make it fail.

    Args:
        s: raw text returned by the model.

    Returns:
        The stripped contents of the first fenced block, or the whole of `s`
        stripped when it contains no complete fenced block.
    """
    # The optional group consumes "<tag><newline>" right after the fence; a
    # fence followed directly by the payload (e.g. ```{...}```) is unaffected.
    m = re.search(
        r'```(?:[A-Za-z0-9_+-]+[ \t]*\r?\n)?(.*?)```',
        s,
        flags=re.DOTALL,
    )
    return m.group(1).strip() if m else s.strip()

In [73]:
## For each review we ask the model for its analysis, retrying up to
## MAX_RETRIES times while the answer cannot be parsed or has the wrong
## number of fields.

EXPECTED_FIELDS = 16  # number of keys listed in RETRY_FORMAT below
MAX_RETRIES = 10      # extra attempts allowed per review after the first one

# Output format repeated to the model whenever a retry is needed.
RETRY_FORMAT = """```{
                'Brand': brand name, 
                'Model': model name, 
                'Drive': sentiment value, 
                'Quality of interior': sentiment value, 
                'Infotainment system': sentiment value, 
                'Comfort': sentiment value, 
                'Performance': sentiment value, 
                'Handling': sentiment value, 
                'Practicality': sentiment value, 
                'Reliability': sentiment value, 
                'Safety': sentiment value, 
                'Quality of construction': sentiment value, 
                'Noise': sentiment value, 
                'Engine': sentiment value, 
                'Price': price value, 
                'Overall': overall value
                }```"""


def _query_model(prompt):
    """Send `prompt` to the model and parse its answer.

    Returns the parsed dict, or None when the answer is not a valid Python
    literal or the literal is not a dict.
    """
    raw = extract_insights_ollama(prompt)
    candidate = clean_response(extract_backtick_block(raw))
    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        print(f"Errore nella conversione: {e}")
        # Show the text that failed to parse (not a value left over from a
        # previous review).
        print(f"Risultato: {candidate}")
        return None
    if not isinstance(parsed, dict):
        print(f"Errore nella conversione: expected a dict, got {type(parsed).__name__}")
        return None
    return parsed


results = []
failed_reviews = []  # indices of the reviews for which nothing could be parsed

for i, review_text in enumerate(reviews):
    print(f"Processing {i+1}/{len(reviews)}…")

    attempt = _query_model(review_text)
    # Last dict parsed for THIS review; never carried over from another one.
    result_dict = attempt

    retries = 0
    while (attempt is None or len(attempt) != EXPECTED_FIELDS) and retries < MAX_RETRIES:
        if attempt is None:
            print("Invalid format: answer could not be parsed. Retrying…")
            problem = "Your previous answer could not be parsed as a Python dictionary."
        else:
            print(f"Invalid format: {len(attempt)}. Retrying…")
            problem = f"You returned {len(attempt)} fields instead of {EXPECTED_FIELDS}."

        retry_prompt = (
            system_information.strip()
            + "\n\n"
            + f"{problem} Please return the output in this exact format:\n"
            + RETRY_FORMAT
            + "\n\nPerform the analysis again for the following review:\n"
            # The review being processed, not the last one of the dataset.
            + review_text
        )

        attempt = _query_model(retry_prompt)
        if attempt is not None:
            result_dict = attempt
        retries += 1

    if result_dict is None:
        # Nothing usable was returned: record the index and move on rather
        # than appending a result that belongs to a different review.
        print(f"Skipping review {i+1}: no parseable answer after {retries} retries.")
        failed_reviews.append(i)
        print("\n")
        continue

    if len(result_dict) != EXPECTED_FIELDS:
        print(f"Warning: keeping a result with {len(result_dict)} fields after {retries} retries.")

    # Ensure correct types for 'Price' and 'Overall'

    print(result_dict)

    try:
        price_str = str(result_dict['Price']).replace('£', '').replace('$', '').replace(',', '').strip()
        result_dict['Price'] = float(price_str)
    except Exception as e:
        # Best effort: the value is left as returned by the model.
        print(f"Exception on converting the price: {e}")

    try:
        result_dict['Overall'] = float(result_dict['Overall'])
    except Exception as e:
        # Best effort: the value is left as returned by the model.
        print(f"Exception on converting the overall: {e}")

    results.append(result_dict)
    print("\n")

Processing 1/205…
{'Brand': 'MG', 'Model': 'MG3', 'Drive': 'Positive', 'Quality of interior': 'Medium', 'Infotainment system': 'Positive', 'Comfort': 'Medium', 'Performance': 'Positive', 'Handling': 'Positive', 'Practicality': 'Negative', 'Reliability': 'Not mentioned', 'Safety': 'Not mentioned', 'Quality of construction': 'Not mentioned', 'Noise': 'Positive', 'Engine': 'Positive', 'Price': 16995, 'Overall': 8}


Processing 2/205…
{'Brand': 'MG', 'Model': 'HS', 'Drive': 'Medium', 'Quality of interior': 'Negative', 'Infotainment system': 'Not mentioned', 'Comfort': 'Positive', 'Performance': 'Medium', 'Handling': 'Medium', 'Practicality': 'Positive', 'Reliability': 'Not mentioned', 'Safety': 'Not mentioned', 'Quality of construction': 'Negative', 'Noise': 'Not mentioned', 'Engine': 'Positive', 'Price': 25995, 'Overall': 6}


Processing 3/205…
{'Brand': 'MG', 'Model': 'Cyberster', 'Drive': 'Positive', 'Quality of interior': 'Medium', 'Infotainment system': 'Negative', 'Comfort': 'Positiv

KeyboardInterrupt: 

In [50]:
## Build the DataFrame in one step from the collected result dicts.
## The keys of test_dict fix the columns and their order: keys missing from a
## result become NaN and keys that are not in test_dict are ignored.
df_new = pd.DataFrame(results, columns=list(test_dict.keys()))

df_new

Unnamed: 0,Brand,Model,Drive,Quality of interior,Infotainment system,Comfort,Performance,Handling,Practicality,Reliability,Safety,Quality of construction,Noise,Engine,Price,Overall
0,MG,ZS,Positive,Medium,Negative,Negative,Negative,Not mentioned,Positive,Negative,Negative,Not mentioned,Negative,Negative,18605.0,5.0
1,MG,GS,Negative,Negative,Medium,Negative,Positive,Not mentioned,Positive,Not mentioned,Negative,Not mentioned,Not mentioned,Positive,15430.0,5.0
2,MG,ZS EV,Medium,Negative,Positive,Medium,Medium,Medium,Positive,Not mentioned,Not mentioned,Negative,Negative,Medium,30505.0,7.0
3,MG,MG3,Positive,Medium,Positive,Not mentioned,Positive,Positive,Negative,Not mentioned,Not mentioned,Medium,Positive,Positive,16995.0,7.0
4,MG,MG5 EV,Positive,Medium,Positive,Positive,Not mentioned,Not mentioned,Positive,Positive,Positive,Good,Not mentioned,Not mentioned,31005.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,Haval,Jolion Pro,Medium,Positive,Negative,Positive,Negative,Not mentioned,Negative,Medium,Positive,Medium,Not mentioned,Negative,24275.0,6.0
405,Jaecoo,7,Positive,Medium,Positive,Negative,Medium,Not mentioned,Negative,Not mentioned,Positive,Medium,Not mentioned,Positive,30115.0,7.0
406,Leapmotor,T03,Positive,Positive,Negative,Medium,Positive,Negative,Negative,Not mentioned,Not mentioned,Not mentioned,Not mentioned,Positive,15995.0,7.0
407,Leapmotor,C10,Medium,Positive,Not mentioned,Positive,Medium,Not mentioned,Positive,Medium,Positive,Medium,Negative,Positive,36500.0,6.0


In [51]:
## Write the DataFrame to a CSV file, leaving out the index column.

car_review_csv_path = "/home/guidojobinformatica/sviluppo/rossato_job/info/car_review.csv"
df_new.to_csv(car_review_csv_path, index=False)