In [56]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import re

In [57]:
# Base HTTP headers sent with every request (desktop Chrome user agent string).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0 Safari/537.36"
    )
}

# Selected brands.
# NOTE(review): "nio" is listed here but has no entry in BRAND_MODELS below,
# and the scraping functions iterate BRAND_MODELS rather than this list.
BRANDS = [
    "mg", "byd", "nio", "volkswagen", "renault",
    "peugeot", "skoda", "fiat", "bmw", "mercedes",
]

# Brand slug -> list of model slugs. Both are interpolated into request URLs,
# and the dict's insertion order is the order in which brands are scraped.
BRAND_MODELS = {
    "mg": [
        "mg3", "zs", "zs-ev", "hs", "mg-4", "s5-ev", "4-xpower", "gs", "5",
    ],
    "byd": [
        "dolphin", "seal-u", "atto-3", "seal", "sealion-7",
    ],
    "volkswagen": [
        "touareg", "id3", "id4", "id5", "id7", "id7-tourer",
        "tiguan", "passat", "golf", "golf-r", "golf-gti",
        "t-roc", "t-cross", "polo", "polo-gti",
    ],
    "renault": [
        "clio", "5-e-tech", "symbioz", "megane-etech-electric", "austral",
        "captur", "scenic-e-tech", "rafale", "twingo", "megane",
        "kadjar", "scenic", "arkana", "koleos", "zoe",
    ],
    "peugeot": [
        "208", "e-208", "2008", "e-2008",
        "308", "e-308", "308-sw", "e-308-sw",
        "408", "e-408", "3008", "e-3008", "5008", "e-5008",
        "e-rifter", "e-traveller", "508",
    ],
    "skoda": [
        "karoq", "kamiq", "kodiaq", "enyaq", "elroq", "superb",
    ],
    "fiat": [
        "500", "500-electric", "500x", "600", "500l",
        "panda", "panda-cross", "panda-4x4",
        "600e", "tipo", "qubo", "500c",
    ],
    "bmw": [
        "1-series", "2-series-active-tourer", "3-series", "4-series",
        "5-series", "5-series-touring", "7-series",
        "8-series", "8-series-gran-coupe",
        "m5", "m8-gran-coupe",
        "x1", "ix1", "x2", "ix2", "x3", "ix3", "x4", "x5", "x6", "x7",
        "ix", "i4", "i5", "i5-touring", "i7", "xm",
    ],
    "mercedes": [
        "a-class", "b-class", "c-class", "e-class", "s-class",
        "cla", "cls",
        "gla", "glb", "glc", "gle", "gls", "g-class",
        "eqa", "eqb", "eqc", "eqe", "eqe-suv", "eqs", "eqs-suv",
        "amg-gt",
    ],
}


def clean_text(text):
    """Normalise whitespace in ``text``.

    Leading and trailing whitespace is stripped, then every run of one or
    more whitespace characters is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed


In [58]:
def scrape_carwow(brand_models=None, timeout=10, delay=1):
    """Scrape the Carwow review text for every brand/model pair.

    For each pair the page ``https://www.carwow.co.uk/<brand>/<model>`` is
    fetched and the text of its first
    ``<section class="product-article-section">`` is cleaned with
    ``clean_text``. When the page has no such section the text is ``"N/A"``.

    Args:
        brand_models: Mapping of brand slug to a list of model slugs.
            Defaults to the module-level ``BRAND_MODELS``.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after every request, successful or not.

    Returns:
        A list of dicts with the keys ``brand``, ``model``, ``text``,
        ``rating`` and ``source``. Pages that fail to download (network
        error, timeout or HTTP error status) are reported on stdout and
        produce no row.
    """
    if brand_models is None:
        brand_models = BRAND_MODELS

    results = []
    base_url = "https://www.carwow.co.uk/"
    for brand, models in brand_models.items():
        for model in models:
            url = f"{base_url}{brand}/{model}"
            print(f"[Carwow] Scraping: {url}")
            try:
                # Without a timeout a stalled connection blocks the whole run.
                res = requests.get(url, headers=HEADERS, timeout=timeout)
                # Do not parse 4xx/5xx error pages as if they were reviews.
                res.raise_for_status()
                soup = BeautifulSoup(res.content, "html.parser")

                # Review body
                review_section = soup.find("section", class_="product-article-section")
                summary = clean_text(review_section.text) if review_section else "N/A"

                results.append({
                    "brand": brand,
                    # Record the scraped model slug so rows stay attributable.
                    "model": model,
                    "text": summary,
                    # Rating is not scraped; same placeholder as scrape_edmunds.
                    "rating": "N/A",
                    "source": "Carwow"
                })
            except Exception as e:
                # Best effort: report the failure and move on to the next page.
                print(f"Error scraping {url}: {e}")
            finally:
                # Throttle after failures too, so errors do not cause a burst.
                time.sleep(delay)
    return results

In [59]:
def scrape_edmunds(brand_models=None, year=2024, max_reviews=3, timeout=10, delay=1):
    """Scrape Edmunds review cards for every brand/model pair.

    For each pair the page ``https://www.edmunds.com/<brand>/<model>/<year>``
    is fetched and up to ``max_reviews`` ``<div class="review-card">``
    elements are read. The text of each card's first ``<p>`` is cleaned with
    ``clean_text``; a card without a ``<p>`` yields ``"N/A"``.

    Args:
        brand_models: Mapping of brand slug to a list of model slugs.
            Defaults to the module-level ``BRAND_MODELS``.
        year: Model year used as the last URL path segment.
        max_reviews: Maximum number of review cards kept per page.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after every request, successful or not.

    Returns:
        A list of dicts with the keys ``brand``, ``model``, ``text``,
        ``rating`` and ``source``. Pages that fail to download (network
        error, timeout or HTTP error status) are reported on stdout and
        produce no rows.
    """
    if brand_models is None:
        brand_models = BRAND_MODELS

    results = []
    for brand, models in brand_models.items():
        for model in models:
            url = f"https://www.edmunds.com/{brand}/{model}/{year}"
            print(f"[Edmunds] Scraping: {url}")
            try:
                # Without a timeout a stalled connection blocks the whole run.
                res = requests.get(url, headers=HEADERS, timeout=timeout)
                # Do not parse 4xx/5xx error pages as if they were reviews.
                res.raise_for_status()
                soup = BeautifulSoup(res.content, "html.parser")
                blocks = soup.find_all("div", class_="review-card")
                for block in blocks[:max_reviews]:
                    # Look the paragraph up once instead of twice.
                    paragraph = block.find("p")
                    text = paragraph.text if paragraph else "N/A"
                    results.append({
                        "brand": brand,
                        # Record the scraped model slug so rows stay attributable.
                        "model": model,
                        "text": clean_text(text),
                        "rating": "N/A",
                        "source": "Edmunds"
                    })
            except Exception as e:
                # Best effort: report the failure and move on to the next page.
                print(f"Error scraping {url}: {e}")
            finally:
                # Throttle after failures too, so errors do not cause a burst.
                time.sleep(delay)
    return results

In [60]:
def main(scrapers=None, output_path="recensioni_auto.csv"):
    """Run the scrapers and write all collected rows to a CSV file.

    NOTE: a later cell in this notebook defines another function named
    ``main``; once that cell has run, the name refers to that definition.

    Args:
        scrapers: Iterable of zero-argument callables, each returning a list
            of row dicts. Defaults to ``(scrape_carwow,)``.
        output_path: Destination CSV path; the file is overwritten.

    Returns:
        None. The rows are written to ``output_path`` and a summary line
        with the row count is printed.
    """
    if scrapers is None:
        scrapers = (scrape_carwow,)

    all_data = []
    for scraper in scrapers:
        all_data.extend(scraper())

    fieldnames = ["brand", "model", "text", "rating", "source"]
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        # Keys missing from a row (e.g. "rating") are written as empty cells.
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

    print(f"\n✅ Dataset completato con {len(all_data)} recensioni.")

In [61]:
# Run the Carwow scrape and write the rows to recensioni_auto.csv.
main()

[Carwow] Scraping: https://www.carwow.co.uk/mg/mg3
[Carwow] Scraping: https://www.carwow.co.uk/mg/zs
[Carwow] Scraping: https://www.carwow.co.uk/mg/zs-ev
[Carwow] Scraping: https://www.carwow.co.uk/mg/hs
[Carwow] Scraping: https://www.carwow.co.uk/mg/mg-4
[Carwow] Scraping: https://www.carwow.co.uk/mg/s5-ev
[Carwow] Scraping: https://www.carwow.co.uk/mg/4-xpower
[Carwow] Scraping: https://www.carwow.co.uk/mg/gs
[Carwow] Scraping: https://www.carwow.co.uk/mg/5
[Carwow] Scraping: https://www.carwow.co.uk/byd/dolphin
[Carwow] Scraping: https://www.carwow.co.uk/byd/seal-u
[Carwow] Scraping: https://www.carwow.co.uk/byd/atto-3
[Carwow] Scraping: https://www.carwow.co.uk/byd/seal
[Carwow] Scraping: https://www.carwow.co.uk/byd/sealion-7
[Carwow] Scraping: https://www.carwow.co.uk/volkswagen/touareg
[Carwow] Scraping: https://www.carwow.co.uk/volkswagen/id3
[Carwow] Scraping: https://www.carwow.co.uk/volkswagen/id4
[Carwow] Scraping: https://www.carwow.co.uk/volkswagen/id5
[Carwow] Scraping: 

In [62]:
def main(scrapers=None, output_path="recensioni_auto_edmunds.csv"):
    """Run the scrapers and write all collected rows to a CSV file.

    NOTE: an earlier cell in this notebook also defines a function named
    ``main``; running this cell replaces that definition.

    Args:
        scrapers: Iterable of zero-argument callables, each returning a list
            of row dicts. Defaults to ``(scrape_edmunds,)``.
        output_path: Destination CSV path; the file is overwritten.

    Returns:
        None. The rows are written to ``output_path`` and a summary line
        with the row count is printed.
    """
    if scrapers is None:
        scrapers = (scrape_edmunds,)

    all_data = []
    for scraper in scrapers:
        all_data.extend(scraper())

    fieldnames = ["brand", "model", "text", "rating", "source"]
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        # Keys missing from a row are written as empty cells.
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

    print(f"\n✅ Dataset completato con {len(all_data)} recensioni.")

In [None]:
# Run the Edmunds scrape (main as redefined in the preceding cell) and write
# the rows to recensioni_auto_edmunds.csv.
main()

[Edmunds] Scraping: https://www.edmunds.com/mg/mg3/2024
