In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd



In [None]:
def scrape_aircraft_data(url, timeout=30):
    """
    Scrapes aircraft data from a webpage containing an HTML table.

    The table is located by its CSS class ``data-grid``. Every body row
    becomes one DataFrame row; the first cell is split into the aircraft
    name and the target of its hyperlink, which is stored in an extra
    'Aircraft Link' column placed directly after the first column.
    Empty cells and missing links are represented by the string 'N/A'.

    Args:
        url (str): The URL of the webpage to scrape.
        timeout (float | tuple): Seconds to wait for the server before
            giving up, passed straight to ``requests.get``. Defaults to 30
            so a stalled server cannot hang the notebook forever.

    Returns:
        pd.DataFrame: A DataFrame containing the extracted aircraft data.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        ValueError: If the page has no table with class 'data-grid'.
    """
    # Send a request to the website; without a timeout requests waits indefinitely
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad responses

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the table
    table = soup.find('table', class_='data-grid')
    if table is None:
        raise ValueError("No table found with class 'data-grid'")

    # Extract table headers and make room for the link column
    headers = [header.text.strip() for header in table.find_all('th')]
    headers.insert(1, 'Aircraft Link')

    # Extract table rows
    data = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        if not cols:
            # Rows made only of <th> cells (or empty spacer rows) carry no
            # data and would otherwise fail on cols[0] below.
            continue

        row_data = [col.text.strip() or 'N/A' for col in cols]

        # Extract aircraft model name and link from the first cell
        link_tag = cols[0].find('a')
        if link_tag:
            # Fall back to the whole cell text (or 'N/A') when the anchor
            # itself has no text, so the placeholder is never overwritten
            # with an empty string.
            aircraft_name = link_tag.text.strip() or row_data[0]
            # .get() tolerates anchors that have no href attribute
            aircraft_link = link_tag.get('href', 'N/A')
        else:
            aircraft_name = row_data[0]
            aircraft_link = 'N/A'

        row_data[0] = aircraft_name  # Update first column with extracted name
        row_data.insert(1, aircraft_link)  # Insert link as second column
        data.append(row_data)

    # Create a DataFrame
    return pd.DataFrame(data, columns=headers)

if __name__ == "__main__":
    # Page that hosts the aircraft pricing table
    url = "http://www.axonaviation.com/commercial-aircraft/aircraft-data/aircraft-pricing"

    # Scrape, persist and report in one guarded step: a failure anywhere in
    # the chain is reduced to a single "Error: ..." line instead of a traceback.
    try:
        aircraft_df = scrape_aircraft_data(url)
        # index=False keeps the positional index out of the CSV
        aircraft_df.to_csv("aircraft_data.csv", index=False)
        print("Data successfully scraped and saved to 'aircraft_data.csv'")
    except Exception as err:
        print(f"Error: {err}")

Data successfully scraped and saved to 'aircraft_data.csv'


In [3]:
# Fixed USD -> GBP conversion factor used for the pound-denominated column
usd_to_gbp = 0.78

# Read the scraped table back from disk; the path is relative, so the CSV is
# looked up in the notebook's working directory
file_path = "aircraft_data.csv"
df = pd.read_csv(file_path)

# Function to clean and convert "New Price"
def clean_price(price):
    """
    Convert a price string such as "$99.7M" into a number of dollars.

    The '$' sign, the 'M' (millions) marker and thousands separators are
    removed, and the remaining number is multiplied by one million.

    Args:
        price: Raw cell value; anything that is not a string containing 'M'
            (NaN, None, 'N/A', numbers, ...) is treated as missing.

    Returns:
        float | None: The price in dollars, or None when the value is
        missing or cannot be parsed as a number.
    """
    if not isinstance(price, str) or 'M' not in price:
        return None  # Return None for missing or invalid values

    number_text = price.replace('$', '').replace('M', '').replace(',', '')
    try:
        return float(number_text) * 1_000_000
    except ValueError:
        # Text that merely contains an 'M' (e.g. a remark instead of a price)
        # is invalid input, not a reason to abort the whole column.
        return None

# Build the two numeric price columns and retire the raw text column:
#   "New Price ($)" - parsed dollar amount from clean_price
#   "New Price (£)" - the dollar amount scaled by usd_to_gbp
df = (
    df.assign(**{"New Price ($)": df["New Price"].apply(clean_price)})
      .assign(**{"New Price (£)": lambda frame: frame["New Price ($)"] * usd_to_gbp})
      .drop(columns=["New Price"])
)

# Write the cleaned table next to the original, without the index column
cleaned_file_path = "aircraft_data_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)

# Confirm where the result went
print(f"Cleaned dataset saved as '{cleaned_file_path}'")

# Preview the first rows as the cell's output
df.head()


Cleaned dataset saved as 'aircraft_data_cleaned.csv'


Unnamed: 0,Aircraft model,Aircraft Link,Engine,Range,Typical Configuration,Additional Comments,New Price ($),New Price (£)
0,BOEING 737 MAX 7,http://www.axonaviation.com/commercial-aircraf...,,,172.0,,99700000.0,77766000.0
1,Airbus A318,http://www.axonaviation.com/commercial-aircraf...,"GE, Pratt & Whitney","3,100 NM",107.0,,75100000.0,58578000.0
2,Airbus A319,http://www.axonaviation.com/commercial-aircraf...,"GE, International Aero Engines","3,700 nm",124.0,,89600000.0,69888000.0
3,Airbus A320,http://www.axonaviation.com/commercial-aircraf...,"GE, International Aero Engines","3,300 NM",150.0,,98000000.0,76440000.0
4,Airbus A321,http://www.axonaviation.com/commercial-aircraf...,"GE, International Aero Engines","3,200 NM",185.0,,114900000.0,89622000.0
