### Scraping da carwow.co.uk di auto cinesi ed europee

In [129]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [130]:
def get_model_urls(brand_url, timeout=30):
    """Collect the model-page URLs linked from a brand page.

    Args:
        brand_url: Address of the brand page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of unique URLs with the query string removed, in the order
        in which they first appear on the page. Site-relative links are
        prefixed with ``https://www.carwow.co.uk``.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the timeout expires.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(brand_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.content, 'lxml')

    full_urls = []
    for link in soup.select('.card-compact-review'):
        href = link.get('href')
        if not href:
            continue
        target = href.split('?')[0]  # drop the query string
        if href.startswith('/'):
            target = 'https://www.carwow.co.uk' + target
        full_urls.append(target)

    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # repeated runs visit the models in the same sequence (a set would not).
    return list(dict.fromkeys(full_urls))

In [131]:
def scrape_model_page(url, gbp_to_eur=1.17, timeout=30):
    """Scrape one model page and return its key fields.

    Args:
        url: Address of the model page to download.
        gbp_to_eur: Multiplier applied to the pound price to obtain euros.
            The default, 1.17, is an indicative exchange rate.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with the keys ``url``, ``title``, ``price``, ``rating``,
        ``tag`` and ``review``. ``title``, ``price``, ``rating`` and
        ``review`` hold ``'Not found'`` when the matching element is missing;
        ``tag`` is ``'new'``, ``'used'`` or ``'Info not available'``.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the timeout expires.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.content, 'lxml')

    title_elem = soup.select_one('h1')
    title = title_elem.get_text(strip=True) if title_elem else 'Not found'

    # ---- PRICE AND TAG ----
    price = 'Not found'
    tag = 'Info not available'

    # Only the first two price elements contribute to the average.
    prices = []
    for elem in soup.select('.price--no-wrap')[:2]:
        text = elem.get_text(strip=True)
        num = ''.join(c for c in text if c.isdigit() or c == '.')
        if not num:
            continue
        try:
            prices.append(float(num))
        except ValueError:
            # A lone "." or several dots is not a number: skip this element
            # rather than abort the whole scrape.
            print(f"Unparseable price text skipped: {text!r}")

    if prices:
        avg_price = sum(prices) / len(prices)
        price = f"£{avg_price:,.0f}"
        tag = 'new'
        print(f"Average price calculated: {price}")
    else:
        summary_price_elem = soup.select_one('.summary-list__item dd')
        if summary_price_elem:
            price = summary_price_elem.get_text(strip=True)
            tag = 'used'
            print(f"Price from summary list: {price}")

    if price != 'Not found':
        try:
            price_num = float(price.replace('£', '').replace(',', '').strip())
        except ValueError as e:
            # The scraped text is kept unchanged when it is not a plain amount.
            print(f"Errore conversione prezzo: {e}")
        else:
            price = f"€{price_num * gbp_to_eur:,.0f}"

    # ---- RATING ----
    rating_elem = soup.select_one('.review-overview__wowscore .wowscore-pill')
    rating = rating_elem.get_text(strip=True) if rating_elem else 'Not found'

    # ---- REVIEW ----
    # Walk #slice-zone-0, #slice-zone-1, ... and stop at the first index that
    # yields nothing; zones located after such a gap are not collected.
    review_elems = []
    i = 0
    while True:
        elems = soup.select(f'#slice-zone-{i} .content-block p, #slice-zone-{i} .product-article-section__head')
        if not elems:
            break
        review_elems.extend(elems)
        i += 1

    if not review_elems:
        review_elems = soup.select('#verdict p')

    if review_elems:
        joined = " ".join(elem.get_text(separator=" ", strip=True) for elem in review_elems)
        # split() with no argument already breaks on newlines, tabs and runs
        # of spaces, so this collapses all whitespace to single spaces.
        review = " ".join(joined.split())
    else:
        review = 'Not found'

    return {
        'url': url,
        'title': title,
        'price': price,
        'rating': rating,
        'tag': tag,
        'review': review
    }


In [132]:
# Brand pages to crawl, one URL per brand slug, in crawl order.
brand_url = [
    f'https://www.carwow.co.uk/{slug}'
    for slug in (
        'mg',
        'omoda',
        'volkswagen',
        'dacia',
        'fiat',
        'renault',
        'citroen',
        'bmw',
        'peugeot',
        'byd',
        'gwm',
        'jaecoo',
        'leapmotor',
        'xpeng',
    )
]

In [133]:
# Crawl every brand page, then every model page it links to, collecting one
# record per model in all_data.
all_data = []  # explicit initialisation: re-running the cell starts from empty

for brand in brand_url:
    print(f"\n=== Analyzing brand: {brand.split('/')[-1]} ===")
    try:
        model_urls = get_model_urls(brand)
    except requests.RequestException as e:
        # One unreachable brand page must not abort the whole crawl.
        print(f"Brand skipped, request failed: {e}")
        continue

    for i, url in enumerate(model_urls):
        print(f"\n--- Model: {url.split('/')[-1]} ({i+1}/{len(model_urls)}) ---")
        try:
            data = scrape_model_page(url)
        except requests.RequestException as e:
            # Keep the records gathered so far and move on to the next model.
            print(f"Model skipped, request failed: {e}")
            continue
        not_found_fields = [k for k, v in data.items() if v == 'Not found']
        if not_found_fields:
            print(f"Fields not found: {not_found_fields}")
        print(f"Data extracted: {data['tag']}")
        all_data.append(data)


=== Analyzing brand: mg ===

--- Model: mg-4 (1/11) ---
Average price calculated: £31,745
Data extracted: new

--- Model: 5 (2/11) ---
Average price calculated: £32,255
Data extracted: new

--- Model: zs-ev (3/11) ---
Average price calculated: £33,005
Data extracted: new

--- Model: mg3 (4/11) ---
Average price calculated: £18,995
Data extracted: new

--- Model: gs (5/11) ---
Average price calculated: £18,428
Data extracted: new

--- Model: zs (6/11) ---
Average price calculated: £22,495
Data extracted: new

--- Model: 2018 (7/11) ---
Average price calculated: £21,338
Data extracted: new

--- Model: hs (8/11) ---
Average price calculated: £30,245
Data extracted: new

--- Model: s5-ev (9/11) ---
Average price calculated: £30,995
Data extracted: new

--- Model: 4-xpower (10/11) ---
Average price calculated: £36,495
Data extracted: new

--- Model: cyberster (11/11) ---
Average price calculated: £57,495
Data extracted: new

=== Analyzing brand: omoda ===

--- Model: omoda-9 (1/3) ---
Aver

In [134]:
# Number of model records collected in all_data.
len(all_data)

206

In [135]:
# After the loop: turn the collected records into a DataFrame and write it
# out as a CSV file without the index column.
df = pd.DataFrame(data=all_data)
df.to_csv(
    path_or_buf='/home/gvarnier/sviluppo/rossato_job/info/carwow_scraped_data_full.csv',
    index=False,
)
print("Dati salvati in carwow_scraped_data_full.csv")

Dati salvati in carwow_scraped_data_full.csv
