In [None]:
# --- Standard library -------------------------------------------------------
import asyncio
import logging
import os
import re
import time

# Chromium build that pyppeteer (used by requests_html to render JavaScript)
# should download and launch.
PYPPETEER_CHROMIUM_REVISION = '1263111'

# IMPORTANT: export the revision BEFORE importing requests_html. requests_html
# imports pyppeteer, and pyppeteer resolves PYPPETEER_CHROMIUM_REVISION when it
# is imported; setting the variable afterwards has no effect on a fresh kernel.
# If pyppeteer was already imported in this kernel, restart the kernel for the
# pin to take effect.
os.environ['PYPPETEER_CHROMIUM_REVISION'] = PYPPETEER_CHROMIUM_REVISION

# --- Third-party ------------------------------------------------------------
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession

# --- Configuration ----------------------------------------------------------
# Text file with one already-scraped model URL per line (used to resume runs).
scraped_FILE = "scraped_urls.txt"
# CSV file where the scraped motorbike data is accumulated.
CSV_FILE = "Bikez-All-Years.csv"

# Column order of the output CSV:
column_order = [
    # General information
    "Motorcycle name", "Model year", "Category", "Price as new",
    # Engine and transmission
    "Engine size", "Type of engine", "Power output", "Torque", "Transmission type", "Clutch", "Fuel consumption",
    # Chassis, suspension, brakes and wheels
    "Front tire", "Rear tire", "Front brakes", "Rear brakes",
    # Physical measures and capacities
    "Weight incl. oil, gas, etc", "Dry weight", "Seat height", "Overall height", "Overall length", "Fuel capacity", "Oil capacity"
]

# Deactivate websockets and urllib3 logs (only warnings and above are shown):
logging.getLogger("websockets").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)


In [None]:
# Function to load scraped URLS from file:
def load_scraped_urls(filename):
    """Load already scraped URLs from a text file.

    Args:
        filename: Path of a text file holding one URL per line.

    Returns:
        A set with the non-empty, whitespace-stripped lines of the file.
        An empty set is returned when the file does not exist.
    """
    if not os.path.exists(filename):
        return set()

    with open(filename, "r") as f:
        # Blank lines (e.g. a stray trailing newline) are skipped so that the
        # empty string is never treated as an "already scraped" URL.
        return {entry for entry in (line.strip() for line in f) if entry}

# Function to save URL in text file:
def append_to_file(filename, url):
    """Append ``url`` as one new line at the end of ``filename``.

    The file is opened in append mode, so existing content is preserved and
    the file is created when it does not exist yet.
    """
    with open(filename, mode="a") as sink:
        sink.writelines((url + "\n",))

# Function to save final DataFrame:
def save_final_data(data_list, csv_file, scraped_urls):
    """Merge newly scraped rows into the CSV file and record their URLs.

    Relies on the module-level ``column_order`` (output columns),
    ``scraped_FILE`` (URL log) and ``append_to_file``.

    Args:
        data_list: List of dicts, one per scraped motorbike.
        csv_file: Path of the CSV file to create or update.
        scraped_urls: Iterable with the URLs the rows in ``data_list`` came
            from; each one is appended to ``scraped_FILE`` after the CSV has
            been written.

    Returns:
        None. Nothing is written when ``data_list`` is empty.
    """
    if not data_list:
        print("No new data to save.")
        return

    # Create DataFrame from list:
    df_new = pd.DataFrame(data_list)

    df_final = df_new
    if os.path.exists(csv_file):
        try:
            # Read every cell back as plain text. With pandas' default type
            # inference the stored rows come back as ints/floats/NaN, which
            # never compare equal to the freshly scraped strings (so duplicates
            # survive) and can be rewritten in a different form (e.g. "2024.0").
            df_existing = pd.read_csv(csv_file, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # A zero-byte file holds no previous rows.
            df_existing = None
        if df_existing is not None:
            df_final = pd.concat([df_existing, df_new], ignore_index=True)

    # Keep exactly the expected columns in the expected order (a missing column
    # becomes empty instead of raising KeyError), normalise everything to text
    # and only then drop duplicated rows.
    df_final = (
        df_final
        .reindex(columns=column_order, fill_value="")
        .fillna("")
        .astype(str)
        .drop_duplicates()
    )

    # Save CSV file:
    df_final.to_csv(csv_file, index=False)
    print(f"New {len(df_new)} motorbikes have been saved in {csv_file}")

    # Update URLs text file only after the CSV was written successfully:
    for url in scraped_urls:
        append_to_file(scraped_FILE, url)

In [None]:

# Function to obtain the list of available years in Bikez.com
def obtain_years():
    """Obtain the list of model years available in Bikez.com.

    Returns:
        Sorted list of unique year strings, taken from the first word of each
        year link's text. An empty list is returned (after printing a message)
        when the expected table is not present in the page.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    url = "https://bikez.com/years/index.php"
    # The timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    body = soup.find("table", class_="zebra")
    if body is None:
        # Layout change or error page: report it instead of failing with an
        # AttributeError on None.
        print(f"Could not find the table of years in {url}.")
        return []

    # The dot is escaped so that it only matches a literal ".php".
    year_links = body.find_all("a", href=re.compile(r"-motorcycle-models\.php"))

    years = set()
    for link in year_links:
        words = link.text.strip().split()
        if words:  # Skip links without text (e.g. image-only links)
            years.add(words[0])

    return sorted(years)

"""
# TEST:
year_list = obtain_years()  
years_URLs = []
for year in year_list:
    years_URLs.append("https://bikez.com/year/" + year + "-motorcycle-models.php")
print(years_URLs)
"""

In [None]:

# Function to obtain list of motorbikes from a certain year in Bikez.com
def obtain_models(year):
    """Obtain the not-yet-scraped motorbike models of a given year in Bikez.com.

    Args:
        year: Model year (int or str) used to build the listing URL.

    Returns:
        Sorted list of model identifiers (the file name of each model page
        without the ".php" extension), excluding the models already recorded in
        ``scraped_FILE``. An empty list is returned (after printing a message)
        when the expected table is not present in the page.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    url = "https://bikez.com/year/" + str(year) + "-motorcycle-models.php"
    # The timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    body = soup.find("table", class_="zebra")
    if body is None:
        print(f"Could not find the table of models for year {year}.")
        return []

    # re.escape: match "<year>.php" literally instead of treating "." as
    # "any character".
    model_links = body.find_all("a", href=re.compile(re.escape(str(year) + ".php")))

    models = set()
    scraped_models = load_scraped_urls(scraped_FILE)

    for link in model_links:
        href = link.get("href")
        # Extract model name after "/motorcycles/" and before ".php"
        model = href.split("/")[-1].split(".php")[0]

        # The log file stores full URLs, so the bare model name alone would
        # never match; compare against the URL form as well.
        model_url = "https://bikez.com/motorcycles/" + model + ".php"
        if model in scraped_models or model_url in scraped_models:
            continue

        models.add(model)

    return sorted(models)

"""
# TEST:
models_2025 = obtain_models(2025)
print(models_2025)
"""

In [None]:
# Function to obtain the information from a certain motorbike model in Bikez.com

def _parse_model_specs(html):
    """Extract the wanted specification fields from a rendered model page.

    Args:
        html: HTML source of the model page (after JavaScript rendering).

    Returns:
        Dict with one entry per wanted field ("" when the page does not provide
        it), or None when no "General information" table is found.
    """
    # Data we want to get from Bikez.com (a tuple keeps a stable key order):
    table_fields = (
        # General information:
        "Motorcycle name", "Model year", "Category", "Price as new",
        # Engine and transmission:
        "Engine size", "Type of engine", "Power output", "Torque", "Transmission type", "Clutch", "Fuel consumption",
        # Chassis, suspension, brakes and wheels:
        "Front tire", "Rear tire", "Front brakes", "Rear brakes",
        # Physical measures and capacities:
        "Weight incl. oil, gas, etc", "Dry weight", "Seat height", "Overall height", "Overall length", "Fuel capacity", "Oil capacity",
    )

    # Name mapping (found different names for same parameters through the website):
    field_mapping = {
        "Engine size": ["Displacement"],
        "Motorcycle name": ["Model", "Model name"],
        "Model year": ["Year model", "Year of manufacture", "Year"],
        "Category": ["Type"],
        "Type of engine": ["Engine type"],
        "Power output": ["Effect", "Output", "Power"],
        "Price as new": ["Price"],
    }
    # Reverse lookup: alternative name -> standard field name.
    standard_name = {
        alias: standard_field
        for standard_field, aliases in field_mapping.items()
        for alias in aliases
    }

    soup = BeautifulSoup(html, 'html.parser')

    def _is_hidden(style):
        # Whitespace and letter case are ignored so that "display: none" (the
        # form browsers produce for styles set from JavaScript) is matched as
        # well as the literal "display:none".
        return bool(style) and "display:none" in "".join(style.split()).lower()

    # Remove hidden elements so their text does not pollute the useful information:
    for hidden in soup.find_all(style=_is_hidden):
        hidden.extract()

    # Select the general information table among the tables with class "Grid":
    selected_table = None
    for table in soup.find_all("table", class_="Grid"):
        table_text = table.get_text()
        if "General information" in table_text or "General moped information" in table_text:
            selected_table = table
            break

    if selected_table is None:
        return None

    data = {}
    for row in selected_table.find_all("tr"):
        columns = row.find_all("td")
        if len(columns) < 2:  # Only "name | value" rows carry information
            continue
        clave = columns[0].get_text(strip=True)
        value = columns[1].get_text(strip=True)
        # Store under the standard name when the page uses an alternative one.
        data[standard_name.get(clave, clave)] = value

    # Keep only the wanted fields; missing ones become "".
    return {key: data.get(key, "") for key in table_fields}


async def obtain_model_info_asyncio(url):
    """Obtain the information of a certain motorbike model in Bikez.com.

    The page is fetched and its JavaScript rendered with a fresh
    AsyncHTMLSession, which is always closed afterwards.

    Args:
        url: URL of the model page.

    Returns:
        Dict with the wanted specification fields ("" for missing ones), or an
        empty dict when the page could not be fetched/rendered/parsed or has no
        "General information" table.
    """
    asession = AsyncHTMLSession()

    try:
        r = await asession.get(url, timeout=30)

        # Render JavaScript so that dynamic content is loaded.
        await r.html.arender(timeout=30)

        clean_data = _parse_model_specs(r.html.html)
        if clean_data is None:
            print("Could not find 'General information' table.")
            return {}

        # Show info about scraping process in console:
        print("Year", clean_data["Model year"], "motorbike:", clean_data["Motorcycle name"], "scraped.")
        return clean_data

    except Exception as e:
        # Best effort: a failing page is reported and skipped by the caller.
        print(f"Error rendering {url}: {e}")
        return {}

    finally:
        # Close the session to free up resources (the script used to crash
        # after a while because Chromium instances were not closing properly).
        await asession.close()

"""
# TEST:
url = 'https://bikez.com/motorcycles/aprilia_tuono_v4_2024.php'
loop = asyncio.get_event_loop()
model_info = await obtain_model_info_asyncio(url)
print(model_info)
"""

In [None]:
# MAIN Bikez.com:

start_time = time.perf_counter()

data_list = []  # List to save data temporarily
scraped_urls = load_scraped_urls(scraped_FILE)
scraped_urls_new  = []

years = obtain_years()

try:
    for year in years:
        models = obtain_models(year)
      
        for model in models:
            url = "https://bikez.com/motorcycles/" + model + ".php"
            if url in scraped_urls:    
                print(f"Motorbike {model} has already been scraped.")
                continue
                
            # Get model information:
            data_model = await obtain_model_info_asyncio(url)
            if data_model:
                data_list.append(data_model)
                scraped_urls_new.append(url) 

            # Deal with any unexpected crash saving data every 50 iterations:
            if not len(data_list) % 50:
                save_final_data(data_list, CSV_FILE, scraped_urls_new)
                print("Safety save every 50 iterations.")
                # Reset variables:
                data_list = []
                scraped_urls_new = []

    # Save CSV file at the end of scraping:
    save_final_data(data_list, CSV_FILE, scraped_urls_new)
        
except Exception as e:
    print(f"Critical error: {e}")
    # Save CSV file in case of error happening:
    save_final_data(data_list, CSV_FILE, scraped_urls_new)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Total execution time of: {elapsed_time:.2f} seconds")