### Scraping da carwow.co.uk di auto cinesi ed europee

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def get_model_urls(brand_url):
    """Return the unique model review URLs linked from a brand page.

    Fetches ``brand_url``, selects every element carrying the
    ``card-compact-review`` class and reads its ``href`` attribute.
    Hrefs starting with ``/`` are prefixed with the carwow.co.uk origin;
    other hrefs are kept as they are. Any query string is dropped.

    The HTTP status code is not checked: an error page simply yields
    whatever matching links it contains (usually none).

    Args:
        brand_url: URL of the brand listing page to fetch.

    Returns:
        A list of unique URLs, in the order they first appear on the page.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # requests has no default timeout; without one a stalled connection
    # would block the whole crawl indefinitely.
    res = requests.get(brand_url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.content, 'lxml')

    full_urls = []
    for link in soup.select('.card-compact-review'):
        href = link.get('href')
        if not href:
            continue
        target = href.split('?')[0]  # drop the query string
        if href.startswith('/'):
            target = 'https://www.carwow.co.uk' + target
        full_urls.append(target)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # crawl order is reproducible (a set would return an arbitrary order).
    return list(dict.fromkeys(full_urls))

In [3]:
def scrape_model_page(url):
    """Scrape title, price, rating and review text from a model page.

    Every field falls back to the string ``'Not found'`` when its element
    is missing from the page, so callers can detect gaps uniformly.

    Args:
        url: URL of the model review page to fetch.

    Returns:
        A dict with the keys ``'url'``, ``'title'``, ``'price'``,
        ``'rating'`` and ``'review'``; all values are strings.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # requests has no default timeout; without one a stalled connection
    # would block the whole crawl indefinitely.
    res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.content, 'lxml')

    # select_one returns None when nothing matches, so guard the title the
    # same way as the other fields instead of raising AttributeError.
    title_elem = soup.select_one('h1')
    title = title_elem.get_text(strip=True) if title_elem else 'Not found'

    # Price: preferred selector first, then the first summary-list value.
    price_elem = soup.select_one('.price--no-wrap')
    if price_elem is None:
        price_elem = soup.select_one('.summary-list__item dd')
    price = price_elem.get_text(strip=True) if price_elem else 'Not found'

    rating_elem = soup.select_one('.review-overview__wowscore .wowscore-pill')
    rating = rating_elem.get_text(strip=True) if rating_elem else 'Not found'

    # Review: paragraphs of the first slice zone, else of the #verdict block.
    review_elems = soup.select('#slice-zone-0 .content-block p')
    if not review_elems:
        review_elems = soup.select('#verdict p')
    if review_elems:
        review = " ".join(
            elem.get_text(separator=" ", strip=True) for elem in review_elems
        )
    else:
        review = 'Not found'

    return {
        'url': url,
        'title': title,
        'price': price,
        'rating': rating,
        'review': review
    }


In [4]:
# Brand listing pages to crawl (Chinese and European makes).
brand_url = [
    'https://www.carwow.co.uk/mg',
    'https://www.carwow.co.uk/omoda',
    'https://www.carwow.co.uk/volkswagen',
    'https://www.carwow.co.uk/dacia',
    'https://www.carwow.co.uk/fiat',
    'https://www.carwow.co.uk/renault',
    'https://www.carwow.co.uk/citroen',
    'https://www.carwow.co.uk/bmw',
    'https://www.carwow.co.uk/peugeot',
    'https://www.carwow.co.uk/byd',
    'https://www.carwow.co.uk/gwm',
    'https://www.carwow.co.uk/jaecoo',
    'https://www.carwow.co.uk/leapmotor',
    'https://www.carwow.co.uk/xpeng',
]

# Start from an empty list on every run: re-executing this cell must not
# append a second copy of every record to the previous run's results.
all_data = []

for brand_idx, brand in enumerate(brand_url, start=1):
    print(f"\nAnalyzing: {brand.split('/')[-1]}\nNumber: {brand_idx}/{len(brand_url)}")
    try:
        model_urls = get_model_urls(brand)
    except requests.RequestException as exc:
        # One unreachable brand page should not abort the whole crawl.
        print(f"Request failed for {brand}: {exc}")
        continue

    # Separate index name so the inner loop does not shadow the brand index.
    for model_idx, url in enumerate(model_urls, start=1):
        print(f"\nAnalyzing: {url.split('/')[-1]}\nNumber: {model_idx}/{len(model_urls)}")
        try:
            data = scrape_model_page(url)
        except requests.RequestException as exc:
            # Skip this model but keep everything collected so far.
            print(f"Request failed for {url}: {exc}")
            continue

        # Report which fields fell back to the 'Not found' placeholder.
        not_found_fields = [k for k, v in data.items() if v == 'Not found']
        if not_found_fields:
            print(f"Fields not found: {not_found_fields}")

        all_data.append(data)



Analyzing: mg
Number: 1/14

Analyzing: mg3
Number: 1/11

Analyzing: hs
Number: 2/11

Analyzing: cyberster
Number: 3/11

Analyzing: 2018
Number: 4/11

Analyzing: mg-4
Number: 5/11

Analyzing: 4-xpower
Number: 6/11

Analyzing: 5
Number: 7/11

Analyzing: gs
Number: 8/11

Analyzing: s5-ev
Number: 9/11

Analyzing: zs
Number: 10/11

Analyzing: zs-ev
Number: 11/11

Analyzing: omoda
Number: 2/14

Analyzing: omoda-5
Number: 1/3

Analyzing: omoda-9
Number: 2/3
Fields not found: ['rating']

Analyzing: omoda-e5
Number: 3/3

Analyzing: volkswagen
Number: 3/14

Analyzing: t-roc
Number: 1/24

Analyzing: polo-gti
Number: 2/24

Analyzing: golf
Number: 3/24

Analyzing: polo
Number: 4/24

Analyzing: multivan
Number: 5/24

Analyzing: id3
Number: 6/24

Analyzing: t-cross
Number: 7/24

Analyzing: tiguan
Number: 8/24

Analyzing: id7
Number: 9/24

Analyzing: t-roc-cabriolet
Number: 10/24

Analyzing: taigo
Number: 11/24

Analyzing: id5
Number: 12/24

Analyzing: touran
Number: 13/24

Analyzing: golf-gti
Number

In [5]:
# Sanity check: number of records accumulated in all_data by the crawl loop.
len(all_data)

205

In [8]:
# After the crawl loop: build a DataFrame from the collected records and
# write it to disk as CSV (without the index column).
csv_path = '/home/guidojobinformatica/sviluppo/rossato_job/info/carwow_scraped_data.csv'

df = pd.DataFrame(data=all_data)
df.to_csv(path_or_buf=csv_path, index=False)
print("Dati salvati in carwow_scraped_data.csv")

Dati salvati in carwow_scraped_data.csv
