### Remove boilerplate from the reviews (keeping the important content) and perform sentiment analysis

In [None]:
import re
import ollama
import pandas as pd

In [None]:
# Boilerplate patterns removed from the scraped review text, applied in order.
# Each entry is (regex, flags); every match is replaced by the empty string.
_BOILERPLATE_PATTERNS = tuple(
    re.compile(pattern, flags)
    for pattern, flags in (
        # Generic introductory phrases
        (r'\bWhat Car\? says\.{0,3}', re.IGNORECASE),
        (r'\bIs the [^?]+\? ?', re.IGNORECASE),

        # References to videos and "keep reading" invitations
        (r'\b[\w\s]*video review\b', re.IGNORECASE),
        (r'Read on to find out…', re.IGNORECASE),

        # Promotional sentences
        (r'Sound interesting\?', re.IGNORECASE),
        (r'Check here for our best [^.]+\.', re.IGNORECASE),
        (r'look for a used [^.]+\.', re.IGNORECASE),
        (r'You could also check out other used [^.]+\.', re.IGNORECASE),
        (r'Want to sell your car online\? Carwow can help with that, too\.', re.IGNORECASE),
        (r'Carwow can even help you sell your car online too\.', re.IGNORECASE),
        (r'head on over to our .*? deals page to see how much money .*? can save you\.', re.IGNORECASE),

        # Reviewer signature
        (r'Mat Watson Expert Car Reviewer', re.IGNORECASE),

        # Price-list / offer headings.
        # NOTE(review): because `.*?` is lazy and the price group is optional,
        # this effectively removes only the heading phrase itself unless the
        # "From £..." price follows it immediately — confirm that is intended.
        (r'(New car deals|Leasing deals).*?(From £[0-9,]+)?', re.IGNORECASE),

        # Section headings (case-sensitive, removed wherever they appear)
        (r'\b(Overview|Our Pick|Performance & drive|Strengths|Weaknesses|Verdict|Specification)\b\s*', 0),

        # Superfluous characters: - \ | ? ! $ % &
        # Must run last: several patterns above rely on '?' still being present.
        (r'[-\\|?!$%&]', 0),
    )
)


def extract_important_content(text):
    """Strip site boilerplate from a scraped car review and normalise whitespace.

    Args:
        text: Raw review text. Non-string values (``None``, or the float NaN
            that pandas uses for empty CSV cells) are treated as missing.

    Returns:
        The cleaned text with boilerplate phrases and the characters
        ``- \\ | ? ! $ % &`` removed, runs of whitespace collapsed to a single
        space and leading/trailing whitespace trimmed. Returns ``""`` for
        non-string input instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return ''

    for pattern in _BOILERPLATE_PATTERNS:
        text = pattern.sub('', text)

    # Collapse the gaps left behind by the removals.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Load the raw (not yet cleaned) reviews; later cells read the 'brand',
# 'model', 'text' and 'source' columns of this frame.
df = pd.read_csv("file_txt/auto_review_no_processed.csv")
df.head()

In [None]:
# Clean every review in one vectorised pass instead of writing the frame
# cell-by-cell with .loc (slow, and ambiguous if index labels repeat).
df['text'] = df['text'].map(extract_important_content)

In [None]:
# Preview the first rows after the 'text' column has been cleaned.
df.head()

In [None]:
def combine_reviews(group):
    """Merge the reviews of one (brand, model) group into a single record.

    Args:
        group: DataFrame holding the rows of one group; must contain the
            'brand', 'model', 'text' and 'source' columns and at least one row.

    Returns:
        A ``pd.Series`` with keys 'brand', 'model', 'text' and 'source'.
        With exactly two rows the texts are joined as
        ``"CarWow Review: <first>\\nWhatcar Review: <second>"`` and the sources
        as ``"<first> && <second>"``. For any other group size only the first
        row is used, labelled with its own source.
    """
    first = group.iloc[0]

    if len(group) == 2:
        # NOTE(review): the labels are hard-coded, so this assumes the first
        # row is the CarWow review and the second the Whatcar one — confirm
        # the row order of the input file.
        second = group.iloc[1]
        # The separator used to be "\ " (an invalid escape sequence that put a
        # stray backslash into the prompt); a newline separates the two reviews.
        combined_text = "CarWow Review: " + first['text'] + "\nWhatcar Review: " + second['text']
        combined_source = first['source'] + " && " + second['source']
    else:
        # Single review (or an unexpected group size): keep the first row only.
        combined_text = first['source'] + " Review: " + first['text']
        combined_source = first['source']

    return pd.Series({
        'brand': first['brand'],
        'model': first['model'],
        'text': combined_text,
        'source': combined_source,
    })

In [None]:
# Collapse the per-source rows into one row per (brand, model) pair.
df_combined = df.groupby(['brand', 'model'], as_index=False, group_keys=False).apply(combine_reviews)
df_combined = df_combined.reset_index(drop=True)

# Number of combined rows, followed by a preview.
print(df_combined.shape[0])
df_combined.head(10)

In [None]:
# Read the system prompt that is sent with every LLM request, then show it.
with open("file_txt/system_information.txt", mode="r") as f:
    system_information = f.read()
print(system_information.strip())

In [None]:
# One prompt string per combined row: "<brand>, <model>, <text>, <source>".
reviews = [
    f"{row['brand']}, {row['model']}, {row['text'].strip()}, {row['source']}"
    for idx, row in df_combined.iterrows()
]

reviews[:5]

In [None]:
# Empty results table: three identifying columns followed by the 14 sentiment
# aspects; one row is filled in per processed review.
df_new = pd.DataFrame(columns=[
    'Brand', 'Model', 'Source',
    'Drive', 'Quality of interior', 'Infotainment system', 'Comfort',
    'Performance', 'Handling', 'Practicality', 'Reliability', 'Safety',
    'Quality of construction', 'Noise', 'Engine', 'Price', 'Overall',
])

In [None]:
# Candidate Ollama model tags; index 4 is the one used for the LLM calls.
model_llm = [
    "deepseek-r1:8b",
    "llama3.1:8b-instruct-q5_K_M",
    "gemma3:4b",
    "gemma3:4b-it-q8_0",
    "llama3.1:8b-instruct-q2_K",
    "llama3.2:3b-instruct-q5_K_M",
    "gemma3:1b",
]
model_llm[4]

In [None]:
def extract_insights_ollama(text):
    """Ask the selected Ollama model to analyse ``text``.

    The request carries ``system_information`` as the system message and
    ``text`` as the user message, and is sent to ``model_llm[4]``.

    Returns:
        The content of the reply message with surrounding whitespace removed.
    """
    conversation = [
        {'role': 'system', 'content': system_information},
        {'role': 'user', 'content': text},
    ]
    reply = ollama.chat(model=model_llm[4], messages=conversation)
    answer = reply['message']['content']
    return answer.strip()

def extract_backtick_block(s):
    """Return the text inside the first triple-backtick fence of ``s``.

    The fence may span several lines. If ``s`` contains no complete fence,
    the whole string is returned. The result is stripped in both cases.
    """
    fenced = re.search(r'```(.*?)```', s, flags=re.DOTALL)
    if fenced is None:
        return s.strip()
    return fenced.group(1).strip()

def normalize_pipes(s):
    """Rewrite every ``||`` separator in ``s`` so it has exactly one space on each side."""
    separator = re.compile(r'\s*\|\|\s*')
    return separator.sub(' || ', s)

In [None]:
# Reminder / re-ask prompt snippets, displayed below for inspection.
fmt_prompt = [
    "Remember to return the Brand, the model and the source with the other 14 sentiment you have to output!",
    "Perform again the sentiment analysis.",
]
fmt_prompt

In [None]:
# Run the sentiment extraction for every review and store one 17-field row per
# review in df_new. `error` counts the reviews that never produced a valid row.
error = 0
for i, review in enumerate(reviews):

    print(f"Processing {i+1}/{len(reviews)}…")
    raw = extract_insights_ollama(review)
    parts = normalize_pipes(extract_backtick_block(raw)).split(' || ')

    # Re-ask up to 10 times while the answer does not split into 17 fields.
    for attempt in range(10):
        if len(parts) == 17:
            break
        print(f"Invalid format: {len(parts)}. Retrying…")
        print("Last malformed response:\n", raw)
        retry_prompt = (
            f"{system_information.strip()}\n\n"
            f"You returned {len(parts)} fields instead of 17. Please return the output in this exact format:\n"
            "```Brand || Model || Source || Drive || Quality of interior || Infotainment system || Comfort || Performance || Handling || Practicality || Reliability || Safety || Quality of construction || Noise || Engine || Price || Overall```\n\n"
            "Perform the analysis again for the following review:\n"
            f"{review}"
        )

        raw = extract_insights_ollama(retry_prompt)
        parts = normalize_pipes(extract_backtick_block(raw)).split(' || ')

    # Still malformed after all retries: record an empty row and count it.
    if len(parts) != 17:
        print("Error: ", len(parts))
        parts = [''] * 17
        error += 1

    df_new.loc[i] = [field.strip() for field in parts]

print(error)

In [None]:
# Persist the sentiment table to CSV without writing the row index.
df_new.to_csv("file_txt/auto_review_sentiment.csv", index=False)