### Perform removing important content and perform sentiment analysis

In [77]:
import re
import ollama
import pandas as pd

In [78]:
def extract_important_content(text):
    # Rimuovi frasi introduttive generiche
    text = re.sub(r'\bWhat Car\? says\.{0,3}', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\bIs the [^?]+\? ?', '', text, flags=re.IGNORECASE)

    # Rimuovi riferimenti a video e inviti alla lettura
    text = re.sub(r'\b[\w\s]*video review\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Read on to find out…', '', text, flags=re.IGNORECASE)

    # Rimuovi solo le frasi promozionali, senza mangiare testo prima/dopo
    text = re.sub(r'Sound interesting\?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Check here for our best [^.]+\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'look for a used [^.]+\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'You could also check out other used [^.]+\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Want to sell your car online\? Carwow can help with that, too\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Carwow can even help you sell your car online too\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'head on over to our .*? deals page to see how much money .*? can save you\.', '', text, flags=re.IGNORECASE)

    # Rimuovi firme
    text = re.sub(r'Mat Watson Expert Car Reviewer', '', text, flags=re.IGNORECASE)

    # Rimuovi blocchi con solo listini/offerte (ma non specifiche tecniche)
    text = re.sub(r'(New car deals|Leasing deals).*?(From £[0-9,]+)?', '', text, flags=re.IGNORECASE)

    # Rimuovi intestazioni vuote (es. "Overview", "Performance & drive", se isolate)
    text = re.sub(r'\b(Overview|Our Pick|Performance & drive|Strengths|Weaknesses|Verdict|Specification)\b\s*', '', text)

    # Rimuovi caratteri superflui: - \ | ? ! $ % &
    text = re.sub(r'[-\\|?!$%&]', '', text)

    # Pulisci spazi extra
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [79]:
df = pd.read_csv("/home/guidojobinformatica/sviluppo/rossato_job/info/carwow_scraped_data.csv")
df.head()

Unnamed: 0,url,title,price,rating,review
0,https://www.carwow.co.uk/mg/zs/2018,MG ZS Review & Prices,"£18,605",5/10,If you’re looking for a small family SUV that’...
1,https://www.carwow.co.uk/mg/gs,MG GS Review and Prices,"£15,430",5/10,The MG GS is a medium-sized family SUV that’s ...
2,https://www.carwow.co.uk/mg/zs-ev,MG ZS EV Review & Prices,"£30,505",7/10,The MG ZS EV is an electric SUV that goes up a...
3,https://www.carwow.co.uk/mg/mg3,MG3 REVIEW & PRICES,"£16,995",9/10,The MG3 used to be a bit of a joke - the old o...
4,https://www.carwow.co.uk/mg/5,MG5 EV REVIEW & PRICES,"£31,005",6/10,"This is the MG5 EV, and it’s one of very few e..."


In [80]:
print(df.iloc[0])

url                     https://www.carwow.co.uk/mg/zs/2018
title                                 MG ZS Review & Prices
price                                               £18,605
rating                                                 5/10
review    If you’re looking for a small family SUV that’...
Name: 0, dtype: object


In [81]:
with open("/home/guidojobinformatica/sviluppo/rossato_job/info/system_information.txt", "r") as f:
    system_information = f.read()
print(f"{system_information.strip()}")

You are a sentiment analyzer for car reviews. Your task is to classify the sentiment of a review according to specific categories.

**INPUT FORMAT**:
  You will receive a review structured as follows:
  url, title, price, rating, review

**TASK**:
  You must analyze the full review and classify the sentiment for each of the following 16 categories:

  1. Brand  
  2. Model   
  3. Drive  
  4. Quality of interior  
  5. Infotainment system  
  6. Comfort  
  7. Performance  
  8. Handling  
  9. Practicality  
  10. Reliability  
  11. Safety  
  12. Quality of construction  
  13. Noise  
  14. Engine  
  15. Price  
  16. Overall  

**INSTRUCTIONS**:
  - For Brand and Model categories, assign the Brand and Model in the review (e.g., MG GS).
  - For Drive, Quality of interior, Infotainment system, Comfort, Performance, Handling, Practicality, Reliability, Safety, Quality of construction, Noise and Engine assign one of the following sentiments: Positive, Medium, Negative, or Not mentio

In [82]:
reviews = []
for idx, row in df.iterrows():
    url = row['url']
    title = row['title']
    price = row['price']
    rating: str = row['rating']
    source = row['review']
    review = f"{url}, {title}, {price}, {rating}, {source}"
    reviews.append(review)

reviews[:5]

["https://www.carwow.co.uk/mg/zs/2018, MG ZS Review & Prices, £18,605, 5/10, If you’re looking for a small family SUV that’s practical, affordable and reasonably well-equipped, the MG ZS is well worth considering. Okay, it might not feel quite as upmarket as the likes of the Mazda CX-3 but it’s significantly cheaper to buy and much roomier inside. The standard fabric seats come with enough adjustment to get comfortable and the buttons you’ll regularly use are easy to reach and laid out in a logical, uncluttered fashion. Both the Excite and Exclusive trims come with a 10.1-inch colour touchscreen as standard for the infotainment system.  You get Apple CarPlay and Android Auto, and the Exclusive also has integrated sat nav. The MG’s system is easy to use and responds quickly to your finger on the screen, but some of the menus can be tricky to go back a page, so it’s not quite as intuitive to use as that in a Ford Puma. The back seats don’t get so many nice touches. Although you’ll find t

In [83]:
# Creation of the dataframe
df_new = pd.DataFrame(
    columns = [
        'Brand', 
        'Model', 
        'Drive', 
        'Quality of interior', 
        'Infotainment system', 
        'Comfort', 
        'Performance',
        'Handling', 
        'Practicality',  
        'Reliability', 
        'Safety', 
        'Quality of construction', 
        'Noise', 
        'Engine', 
        'Price',
        'Overall',
    ]
)

In [84]:
model_llm = ["deepseek-r1:8b", "llama3.1:8b-instruct-q5_K_M", "gemma3:4b", "gemma3:4b-it-q8_0", "llama3.1:8b-instruct-q2_K", "llama3.2:3b-instruct-q5_K_M", "gemma3:1b"]
model_llm[4]

'llama3.1:8b-instruct-q2_K'

In [85]:
def extract_insights_ollama(text):
    response = ollama.chat(
        model=model_llm[4],
        messages=[
            {'role': 'system', 'content': system_information},
            {'role': 'user', 'content': text}
        ],
    )    
    return response['message']['content'].strip()

def extract_backtick_block(s):
    m = re.search(r'```(.*?)```', s, flags=re.DOTALL)
    return m.group(1).strip() if m else s.strip()

In [86]:
test_dict = {
    'Brand': 'MG',
    'Model': 'ZS',
    'Drive': 'Medium', 
    'Quality of interior': 'Positive', 
    'Infotainment system': 'Positive', 
    'Comfort': 'Negative', 
    'Performance': 'Medium', 
    'Handling': 'Not mentioned', 
    'Practicality': 'Positive', 
    'Reliability': 'Not mentioned', 
    'Safety': 'Negative', 
    'Quality of construction': 'Medium', 
    'Noise': 'Negative', 
    'Engine': 'Medium', 
    'Price': 18.605,
    'Overall': 6
}

len(test_dict)

16

In [91]:
import ast
# ...existing code...

error = 0
results = []


for i in range(10):
    print(f"Processing {i+1}/{len(reviews)}…")
    raw = extract_insights_ollama(reviews[i])
    cleaned = extract_backtick_block(raw)
    cleaned = re.sub(
        r"'Price':\s*£?([0-9]{1,3}(?:,[0-9]{3})*(?:\.\d+)?|\d+\.?\d*)",
        lambda m: f"'Price': {m.group(1).replace(',', '')}",
        cleaned
    )
    cleaned = re.sub(r"'Overall':\s*([0-9]+)\s*/\s*10", r"'Overall': \1", cleaned)
    cleaned = re.sub(r',\s*}', '}', cleaned)
    print(cleaned)
    
    try:
        result_dict = ast.literal_eval(cleaned)
    except Exception as e:
        print(f"Errore nella conversione: {e}")
        error += 1
        # Retry with ollama, but skip the while loop for this iteration
        continue

    print(result_dict)
    
    retries = 0
    while len(result_dict) != 16 and retries <= 10:
        print(f"Invalid format: {len(result_dict)}. Retrying…")
        retry_prompt = (
            system_information.strip()
            + "\n\n"
            + f"You returned {len(result_dict)} fields instead of 16. Please return the output in this exact format:\n"
            +   """```{
                'Brand': brand name, 
                'Model': model name, 
                'Drive': sentiment value, 
                'Quality of interior': sentiment value, 
                'Infotainment system': sentiment value, 
                'Comfort': sentiment value, 
                'Performance': sentiment value, 
                'Handling': sentiment value, 
                'Practicality': sentiment value, 
                'Reliability': sentiment value, 
                'Safety': sentiment value, 
                'Quality of construction': sentiment value, 
                'Noise': sentiment value, 
                'Engine': sentiment value, 
                'Price': price value, 
                'Overall': overall value
                }```"""
            + "Perform the analysis again for the following review:\n"
            + review
        )
        
        raw = extract_insights_ollama(retry_prompt)
        cleaned = extract_backtick_block(raw)
        cleaned = re.sub(
            r"'Price':\s*£?([0-9]{1,3}(?:,[0-9]{3})*(?:\.\d+)?|\d+\.?\d*)",
            lambda m: f"'Price': {m.group(1).replace(',', '')}",
            cleaned
        )
        cleaned = re.sub(r"'Overall':\s*([0-9]+)\s*/\s*10", r"'Overall': \1", cleaned)
        cleaned = re.sub(r',\s*}', '}', cleaned)
        
        try:
            result_dict = ast.literal_eval(cleaned)
        except Exception as e:
            print(f"Errore nella conversione: {e}")
            print(f"Retring...")

        retries += 1

    # Ensure correct types for 'Price' and 'Overall'
    try:
        result_dict['Price'] = float(str(result_dict['Price']).replace(',', '.').strip())
    except Exception:
        pass
    try:
        result_dict['Overall'] = float(result_dict['Overall'])
    except Exception:
        pass

    results.append(result_dict)

Processing 1/409…
{
    'Brand': 'MG',
    'Model': 'ZS',
    'Drive': 'Positive',
    'Quality of interior': 'Medium',
    'Infotainment system': 'Negative',
    'Comfort': 'Medium',
    'Performance': 'Negative',
    'Handling': 'Not mentioned',
    'Practicality': 'Positive',
    'Reliability': 'Medium',
    'Safety': 'Negative',
    'Quality of construction': 'Not mentioned',
    'Noise': 'Negative',
    'Engine': 'Negative',
    'Price': 18605,
    'Overall': 6
}
{'Brand': 'MG', 'Model': 'ZS', 'Drive': 'Positive', 'Quality of interior': 'Medium', 'Infotainment system': 'Negative', 'Comfort': 'Medium', 'Performance': 'Negative', 'Handling': 'Not mentioned', 'Practicality': 'Positive', 'Reliability': 'Medium', 'Safety': 'Negative', 'Quality of construction': 'Not mentioned', 'Noise': 'Negative', 'Engine': 'Negative', 'Price': 18605, 'Overall': 6}
Processing 2/409…
{
    'Brand': 'MG', 
    'Model': 'GS', 
    'Drive': 'Negative', 
    'Quality of interior': 'Negative', 
    'Infotai

In [92]:
print(results)

[{'Brand': 'MG', 'Model': 'ZS', 'Drive': 'Positive', 'Quality of interior': 'Medium', 'Infotainment system': 'Negative', 'Comfort': 'Medium', 'Performance': 'Negative', 'Handling': 'Not mentioned', 'Practicality': 'Positive', 'Reliability': 'Medium', 'Safety': 'Negative', 'Quality of construction': 'Not mentioned', 'Noise': 'Negative', 'Engine': 'Negative', 'Price': 18605.0, 'Overall': 6.0}, {'Brand': 'MG', 'Model': 'GS', 'Drive': 'Negative', 'Quality of interior': 'Negative', 'Infotainment system': 'Medium', 'Comfort': 'Not mentioned', 'Performance': 'Positive', 'Handling': 'Negative', 'Practicality': 'Positive', 'Reliability': 'Not mentioned', 'Safety': 'Negative', 'Quality of construction': 'Negative', 'Noise': 'Negative', 'Engine': 'Positive', 'Price': 15330.0, 'Overall': 4.0}, {'Brand': 'MG', 'Model': 'ZS EV', 'Drive': 'Medium', 'Quality of interior': 'Negative', 'Infotainment system': 'Positive', 'Comfort': 'Medium', 'Performance': 'Not mentioned', 'Handling': 'Positive', 'Practi

In [97]:
df_new = pd.DataFrame(columns=test_dict.keys())

for result_dict in results:
    df_new.loc[len(df_new)] = result_dict
    
df_new

Unnamed: 0,Brand,Model,Drive,Quality of interior,Infotainment system,Comfort,Performance,Handling,Practicality,Reliability,Safety,Quality of construction,Noise,Engine,Price,Overall
0,MG,ZS,Positive,Medium,Negative,Medium,Negative,Not mentioned,Positive,Medium,Negative,Not mentioned,Negative,Negative,18605.0,6.0
1,MG,GS,Negative,Negative,Medium,Not mentioned,Positive,Negative,Positive,Not mentioned,Negative,Negative,Negative,Positive,15330.0,4.0
2,MG,ZS EV,Medium,Negative,Positive,Medium,Not mentioned,Positive,Positive,Not mentioned,Not mentioned,Negative,Medium,Negative,30505.0,6.0
3,MG,MG3,Positive,Medium,Positive,Positive,Positive,Positive,Negative,Not mentioned,Not mentioned,Medium,Positive,Positive,"£16,995",8.0
4,MG,MG5 EV,Positive,Medium,Positive,Positive,Not mentioned,Not mentioned,Medium,Not mentioned,Positive,Medium,Not mentioned,Positive,31005.0,6.0
5,MG,S5 EV,Positive,Medium,Positive,Positive,Negative,Not mentioned,Positive,Positive,Not mentioned,Medium,Positive,Positive,28995.0,8.0
6,MG,HS,Negative,Medium,Medium,Positive,Positive,Negative,Positive,Not mentioned,Not mentioned,Negative,Medium,Positive,25995.0,7.0
7,MG,MG4,Positive,Medium,Negative,Positive,Positive,Positive,Medium,Not mentioned,Positive,Positive,Negative,Positive,26995.0,8.0
8,MG,ZS,Positive,Medium,Negative,Positive,Positive,Negative,Positive,Not mentioned,Positive,Medium,Negative,Positive,19995.0,7.0
9,MG,4 XPower,Positive,Medium,Positive,Positive,Positive,Negative,Not mentioned,Positive,Positive,Medium,Negative,Positive,36495.0,8.0


In [98]:
df_new.to_csv("/home/guidojobinformatica/sviluppo/rossato_job/info/car_review.csv", index=False)