### Perform removing important content and perform sentiment analysis

In [72]:
import re
import ollama
import pandas as pd
import json
import re
from typing import List

In [73]:
## Read and set the csv of scraped data

df = pd.read_csv("info/carwow_scraped_data_full.csv")
df.head()

Unnamed: 0,url,title,price,rating,tag,review
0,https://www.carwow.co.uk/mg/mg-4,MG4 EV REVIEW & PRICES,"€37,142",8/10,new,Is the MG4 EV a good car? Is the MG4 EV a good...
1,https://www.carwow.co.uk/mg/5,MG5 EV REVIEW & PRICES,"€37,738",6/10,new,Is the MG5 EV a good car? Is the MG5 EV a good...
2,https://www.carwow.co.uk/mg/zs-ev,MG ZS EV Review & Prices,"€38,616",7/10,new,Is the MG ZS EV a good car? Is the MG ZS EV a ...
3,https://www.carwow.co.uk/mg/mg3,MG3 REVIEW & PRICES,"€22,224",9/10,new,Is the MG3 a good car? Is the MG3 a good car? ...
4,https://www.carwow.co.uk/mg/gs,MG GS Review and Prices,"€21,561",5/10,new,The MG GS is a medium-sized family SUV that’s ...


In [74]:
## Example of line
df.iloc[0]

url                        https://www.carwow.co.uk/mg/mg-4
title                                MG4 EV REVIEW & PRICES
price                                               €37,142
rating                                                 8/10
tag                                                     new
review    Is the MG4 EV a good car? Is the MG4 EV a good...
Name: 0, dtype: object

In [75]:
## Creation of a list of reviews to help in the iteration later

reviews = {}
for idx, row in df.iterrows():
    # Remove 'Review & Prices' or similar suffixes from the title to get the model name
    title = row['title']
    model = title.replace('Review & Prices', '').replace('REVIEW & PRICES', '').replace('Review and Prices', '').strip()
    reviews[idx] = {
        'url': row['url'],
        'brand': row['url'].split('/')[3].upper(),
        'model': model,
        'price': str(row['price']).replace(',', '.').replace('€', ''),
        'rating': row['rating'],
        'condition': row['tag'],
        'review': row['review']
    }

In [76]:
reviews[6]['url'].split('/')

['https:', '', 'www.carwow.co.uk', 'mg', 'zs', '2018']

In [77]:
def clean_review(text):
    if pd.isnull(text):
        return text

    # Rimuovi domande introduttive generiche
    text = re.sub(r'(Is the [A-Za-z0-9\s\-]+ (a|an) good car\?\s*)+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(Should I buy the [A-Za-z0-9\s\-]+\?\s*)+', '', text, flags=re.IGNORECASE)

    # Rimuovi call to action e promozionali
    text = re.sub(r'(Check out our .*?deals.*?\.|You can also check out .*?\.|You can even sell your current car through .*?\.|Want to sell your car online.*?\.|Find out how much you can save.*?\.|You can browse used .*?\.|Remember that .*? can even help you sell your car online too\.)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(Sound interesting\? Check here for our best .*? deals.*?\.|You could also check out other used .*? here.*?\.|And remember that .*? can even help you sell your car online too\.)', '', text, flags=re.IGNORECASE)

    # Rimuovi FAQ e titoli ripetuti (ma NON i paragrafi che seguono)
    text = re.sub(r'([A-Za-z0-9\s\-]+ FAQs|FAQs|How much is the [A-Za-z0-9\s\-]+\?)\s*', '', text, flags=re.IGNORECASE)

    # Rimuovi specifiche tecniche pure (solo se sono su una riga separata)
    text = re.sub(r'^(Range|Efficiency|Battery size|Boot|Power outputs|Charge time|CO2 emissions|MPG, emissions and tax|0-60mph time|Top speed|Charging|Boot space|Running costs|Company car|Tax|Insurance|Warranty|Recalls):.*?(\.|\n)', '', text, flags=re.IGNORECASE | re.MULTILINE)

    # Rimuovi riferimenti a premi, survey, rating, ecc.
    text = re.sub(r'(Carwow Car of the Year Awards.*?\.|Driver Power reliability survey.*?\.|Euro NCAP safety testing.*?\.|Euro NCAP safety tests.*?\.|Euro NCAP safety rating.*?\.|Euro NCAP.*?\.|Benefit-in-Kind tax.*?\.|Vehicle Excise Duty.*?\.|Warranty.*?\.|recall.*?\.|survey.*?\.|reviewed separately.*?\.|FAQs.*?\.|FAQs)', '', text, flags=re.IGNORECASE)

    # Rimuovi prezzi e valute isolate
    text = re.sub(r'£[0-9,]+', '', text)
    text = re.sub(r'€[0-9,]+', '', text)
    text = re.sub(r'\$[0-9,]+', '', text)

    # Rimuovi link, email, numeri di telefono
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\+?\d[\d\-\s]{7,}\d', '', text)

    # Rimuovi frasi tipo "Read more", "See also", "For more information"
    text = re.sub(r'(Read more.*?\.|See also.*?\.|For more information.*?\.|Find out more.*?\.)', '', text, flags=re.IGNORECASE)

    # Rimuovi disclaimer e note legali
    text = re.sub(r'(All information correct at time of publication.*?\.|Terms and conditions apply.*?\.|Subject to status.*?\.|Representative example.*?\.|Your mileage may vary.*?\.)', '', text, flags=re.IGNORECASE)

    # Rimuovi frasi di confronto generiche
    text = re.sub(r'(compared to the [A-Za-z0-9\s\-]+|versus the [A-Za-z0-9\s\-]+|alternative to the [A-Za-z0-9\s\-]+)', '', text, flags=re.IGNORECASE)

    # Rimuovi parentesi quadre e il loro contenuto
    text = re.sub(r'\[.*?\]', '', text)

    # Rimuovi righe che iniziano con titoli/sezioni non utili
    text = re.sub(r'^(Verdict|Summary|Should I buy.*|What’s it like.*|Pros and cons|Key features|Running costs|Company car|Tax|Insurance|Warranty|Recalls|Boot space|Charging|Range|Final thoughts|Our verdict|In summary|To sum up|Pros:|Cons:|Conclusion|Overview|Introduction|At a glance|Highlights|Quick facts|Performance and drive comfort|Space and practicality|Interior style, infotainment and accessories|Safety and reliability|Alternatives to the .*)\s*$', '', text, flags=re.IGNORECASE | re.MULTILINE)

    # Rimuovi righe molto brevi (meno di 3 parole)
    text = '\n'.join([line for line in text.split('\n') if len(line.strip().split()) > 2])

    # Rimuovi caratteri speciali ripetuti (es: ----, ====, ###)
    text = re.sub(r'[-=_#]{3,}', '', text)

    # Rimuovi tutti i simboli particolari tranne . , : ;
    text = re.sub(r"[^\w\s\.\,\:\;]", "", text)

    # Rimuovi simboli e spazi ripetuti
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()

    return text

In [78]:
for i, review in reviews.items():
    print("Length before: ", len(review['review']))
    review['review'] = clean_review(review['review'])
    print("Length after: ", len(review['review']))

Length before:  15127
Length after:  13841
Length before:  11707
Length after:  11292
Length before:  11305
Length after:  10742
Length before:  12742
Length after:  11491
Length before:  3004
Length after:  2897
Length before:  13334
Length after:  12824
Length before:  14177
Length after:  13571
Length before:  14333
Length after:  13669
Length before:  12554
Length after:  11746
Length before:  12158
Length after:  11756
Length before:  13914
Length after:  13029
Length before:  13247
Length after:  12579
Length before:  12471
Length after:  11956
Length before:  12765
Length after:  12175
Length before:  15364
Length after:  14719
Length before:  13782
Length after:  13094
Length before:  13658
Length after:  13059
Length before:  13878
Length after:  13034
Length before:  2043
Length after:  1840
Length before:  4518
Length after:  4267
Length before:  15408
Length after:  14768
Length before:  14354
Length after:  13583
Length before:  7595
Length after:  7336
Length before:  152

In [79]:
reviews[2]

{'url': 'https://www.carwow.co.uk/mg/zs-ev',
 'brand': 'MG',
 'model': 'MG ZS EV',
 'price': '38.616',
 'rating': '7/10',
 'condition': 'new',
 'review': 'The MG ZS EV is an electric SUV that goes up against the likes of the Hyundai Kona Electric and Kia Niro EV both of which are now wellestablished options at the affordable end of the electric car market. Its a bit like going for a pair of cheap Bluetooth headphones. It might not be from the obvious choice of brands, but it promises to do a similar job at a lower cost. Since its revival as a massproduction manufacturer, MGs models havent been the most exciting to look at. The new ZS EV is in a similar vein, but this updated version now comes with a blankedout grille and LED headlights as part of a tweaked front end. There is also a new rear bumper with altered lights. MG has also improved the interior design, with the main addition being a new 10.1inch infotainment touchscreen that has updated software to make it much smoother than be

In [80]:
## Selection of the model to use in the sentiment analysis

model_llm = {
    0: "llama3.1:8b",
    1: "qwen3:4b",
    2: "llama3.2:3b-instruct-q8_0",
    3: "deepseek-r1:7b",
    4: "mistral:7b-instruct",
    5: "deepseek-r1:8b",
    6: "llama3.1:8b-instruct-q5_K_M",
    7: "gemma3:4b",
    8: "gemma3:4b-it-q8_0",
    9: "llama3.1:8b-instruct-q2_K",
    10: "llama3.2:3b-instruct-q5_K_M",
    11: "gemma3:1b",
    12: "llama3.1:8b-instruct-fp16",
    13: "llama3.2:1b-instruct-q8_0"
}
model_used = model_llm[9]

In [81]:
## Read and set the system information

with open("info/system_information.txt", "r") as f:
    system_information = f.read()

In [82]:
## Read and set the system information

with open("info/summarize.txt", "r") as f:
    system_information_summary = f.read()

In [83]:
def extract_insights_ollama_summarize(review_dict):
    prompt_text = json.dumps(review_dict, ensure_ascii=False)
    
    response = ollama.chat(
        model=model_used,
        messages=[
            {"role": "system", "content": system_information_summary},
            {"role": "user", "content": prompt_text}
        ],
    )
    
    return response['message']['content']

In [84]:
def extract_insights_ollama(review_dict):
    prompt_text = json.dumps(review_dict, ensure_ascii=False)
    
    response = ollama.chat(
        model=model_used,
        messages=[
            {"role": "system", "content": system_information},
            {"role": "user", "content": prompt_text}
        ],
    )
    
    return response['message']['content']


In [85]:
## Function to extract and clean the response

def clean_response(response):
    # Remove <think>...</think> blocks (including multiline)
    # response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    response = re.sub(
        r"'Price':\s*[$£]?([0-9]{1,3}(?:,[0-9]{3})*(?:\.\d+)?|\d+\.?\d*)",
        lambda m: f"'Price': {m.group(1).replace(',', '')}",
        response
    )
    response = re.sub(r"'Overall':\s*([0-9]+(?:\.\d+)?)\s*/\s*10", r"'Overall': \1", response)
    response = re.sub(r',\s*}', '}', response) 
    return response
    
def extract_backtick_block(s):
    m = re.search(r'```(.*?)```', s, flags=re.DOTALL)
    return m.group(1).strip() if m else s.strip()

In [86]:
import unicodedata

def remove_unicode(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

results_summarize = []

for i, review in reviews.items():
    print(f"Processing {i+1}/{len(reviews)}…")
    review['review'] = remove_unicode(review['review'])
    raw = extract_insights_ollama_summarize(review)
    response = extract_backtick_block(raw)
    proc_response = clean_response(response)

    print(f"Process response: {proc_response}")
    result_dict = {}
    try:
        result_dict = proc_response
    except Exception as e:
        print(f"Result: {result_dict}")
        print(f"Parsing error: {e}")
        result_dict = {}

    results_summarize.append(result_dict)

Processing 1/206…
Process response: The information provided appears to be about an electric vehicle (EV) called the MG4. Here are some key points extracted from the text:

**Performance**

* The MG4 comes in three power outputs: 170hp, 204hp, or 245hp.
* It's comfortable and surprisingly nimble in bends, but there's a bit of wind noise on the motorway.
* The light steering is less welcome at high speeds, but you can increase the weight a bit in the settings.

**Space and Practicality**

* There's decent space for passengers, with adjustable seats and ample headroom for tall individuals.
* However, rear seat space is limited, especially with three adults.
* The boot capacity is not terrible (363 liters), but it lags behind other EVs like the ID3 or Leaf.

**Charging**

* Charging speeds vary depending on the battery size:
	+ Small battery: 10 to 80% in 39 minutes at up to 117kW, or in 35 minutes at 135kW.
	+ Large battery: 39 minutes at up to 150kW.

**Tax and Reliability**

* The MG4 

In [87]:
results_summarize[:5]

 'json\n{\n    "Brand": "MG",\n    "Model": "MG5 EV",\n    "Drive": "Comfortable ride, quiet and serene inside",\n    "Interior quality": "Upgraded interior with modern look, but still some cheap materials",\n    "Infotainment system": "10.25-inch touchscreen display with Android Auto and Apple CarPlay as standard",\n    "Comfort": "Good knee and headroom in the back seats, armrest folds down for passengers",\n    "Performance": "156hp motor provides surprising punch from 50mph to 70mph",\n    "Handling": "Surprisingly capable in corners, but can be overwhelming with front tyres",\n    "Practicality": "580litre boot is big and useful, with space for large suitcases or weekly shop",\n    "Reliability": "Difficult to score reliability as no major issues have cropped up yet",\n    "Safety": "MG Pilot safety system includes adaptive cruise control, automatic emergency braking, and more",\n    "Quality of construction": "Build quality is really good in the upgraded MG5",\n    "Noise": "Wind

In [88]:
with open("results_summarize.txt", "w", encoding="utf-8") as f:
    for item in results_summarize:
        f.write(str(item) + "\n")

In [56]:
import ast
import unicodedata

def remove_unicode(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

results = []

for i, review in reviews.items():
    print(f"Processing {i+1}/{len(reviews)}…")
    review['review'] = remove_unicode(review['review'])
    raw = extract_insights_ollama(review)
    response = extract_backtick_block(raw)
    proc_response = clean_response(response)

    print(f"Process response: {proc_response}")
    result_dict = {}
    try:
        result_dict = ast.literal_eval(proc_response)
    except Exception as e:
        print(f"Result: {result_dict}")
        print(f"Parsing error: {e}")
        result_dict = {}

    retries = 0
    while len(result_dict) != 17 and retries < 3:
        print(f"Invalid format ({len(result_dict)} keys). Retrying…")
        retry_system_message = (
            system_information +
            "\nYou did NOT return the dictionary in the exact format. ONLY output the dictionary below, enclosed in triple backticks. "
            "DO NOT write any explanation, summary, or bullet points. ONLY the dictionary. "
            "If you do not follow the output format, your answer will be discarded."
        )
        # print("SYSTEM PROMPT:", retry_system_message)
        # print("USER PROMPT:", json.dumps(review, ensure_ascii=False))
        raw = ollama.chat(
            model=model_used,
            messages=[
                {"role": "system", "content": retry_system_message},
                {"role": "user", "content": json.dumps(review, ensure_ascii=False)}
            ],
        )['message']['content']
        response = extract_backtick_block(raw)
        proc_response = clean_response(response)

        print(f"Process response: {proc_response}")

        try:
            result_dict = ast.literal_eval(proc_response)
        except Exception as e:
            print(f"Result: {result_dict}")
            print(f"Retry parsing error: {e}")
            result_dict = {}
        retries += 1

    results.append(result_dict)

Processing 1/206…
Process response: I'll summarize the information and provide an analysis based on the given text.

**Car Specifications**

* Brand: MG
* Model: MG4 EV
* Price: £37,142
* Rating: 8/10

**Design Features**

* The MG4 has a unique design with pointy details, sharp creases, and funky shapes.
* It comes in three trim levels: SE (entry-level), Trophy (mid-range), and Trophy Extended Range (top-of-the-line).
* The Trophy models have a bold orange paint finish with a two-tone black roof.

**Performance**

* The MG4 has two battery options, offering 218 claimed miles of range in its most basic trim through to 323 miles for the Trophy Extended Range model.
* It comes with two motor outputs: 170hp and 204hp (in the SE Long Range).
* The XPower is a high-performance variant with a separate review.

**Features**

* The MG4 has automatic LED headlights, tinted windows, and a leather steering wheel as standard on Trophy models.
* The infotainment system is basic, with too many menus

KeyboardInterrupt: 

In [None]:
## Just for example

test_dict = {
    'Brand': 'MG',
    'Model': 'ZS',
    'Drive': 'Medium', 
    'Quality of interior': 'Positive', 
    'Infotainment system': 'Positive', 
    'Comfort': 'Negative', 
    'Performance': 'Medium', 
    'Handling': 'Not mentioned', 
    'Practicality': 'Positive', 
    'Reliability': 'Not mentioned', 
    'Safety': 'Negative', 
    'Quality of construction': 'Medium', 
    'Noise': 'Negative', 
    'Engine': 'Medium', 
    'Price': 18.605,
    'tag': 'new',
    'Overall': 6
}

len(test_dict)

In [None]:
## We use the test_dict keys to create the columns of the Dataframe, then we create it.
df_new = pd.DataFrame(columns = test_dict.keys())

for result_dict in results:
    df_new.loc[len(df_new)] = result_dict
    
df_new

In [None]:
## Saving the Dataframe as a csv.

df_new.to_csv("info/car_review_new_test.csv", index = False)