# In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the cars dataset from a CSV file located in the working directory.
# The file is decoded as latin1 rather than pandas' default UTF-8
# (presumably because it contains bytes that are not valid UTF-8 — TODO confirm).
df = pd.read_csv('The Ultimate Cars Dataset 2024.csv', encoding='latin1')

# Clean the price data
def clean_price(price_str):
    """Convert a raw price cell into a single float.

    Dollar signs, thousands separators and surrounding whitespace are
    removed. A two-part range written with ``-`` or ``/`` (for example
    ``"$12,000-$15,000"``) is collapsed to the mean of its two ends.

    Args:
        price_str: The raw cell value. Usually a string, but any object is
            accepted (e.g. the float NaN pandas uses for an empty cell).

    Returns:
        The parsed price as a float, or ``np.nan`` when the value cannot be
        interpreted as a number or a two-part numeric range.
    """
    try:
        # Coerce to str first: calling .replace() on a non-string cell such
        # as a float NaN raises AttributeError, which is not caught below.
        text = str(price_str).replace('$', '').replace(',', '').strip()

        # '-' is checked before '/', so a value containing both is treated
        # as a '-' range.
        for separator in ('-', '/'):
            if separator in text:
                # Placeholders such as "N/A" fail float() and fall through
                # to the except clause; so does a range with more or fewer
                # than two parts (unpacking raises ValueError).
                low, high = (float(part) for part in text.split(separator))
                return (low + high) / 2

        return float(text)
    except (ValueError, TypeError):
        return np.nan

# Clean horsepower data
def clean_horsepower(hp_str):
    """Convert a raw horsepower cell such as ``"300 hp"`` into a float.

    The ``hp`` unit suffix, thousands separators and surrounding whitespace
    are removed before parsing.

    Args:
        hp_str: The raw cell value; it is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        The horsepower as a float, or ``np.nan`` when the remaining text is
        not a single number (a range like ``"70-85 hp"`` is not parsed).
    """
    try:
        # Strip commas as clean_torque does, so "1,020 hp" parses to 1020.0
        # instead of silently becoming NaN.
        hp_str = str(hp_str).replace('hp', '').replace(',', '').strip()
        return float(hp_str)
    except (ValueError, TypeError):
        return np.nan

# Clean torque data
def clean_torque(torque_str):
    """Convert a raw torque cell such as ``"1,000+ Nm"`` into a float.

    The ``Nm`` unit, thousands separators and any ``+`` marker are removed
    from the string form of the value before it is parsed.

    Args:
        torque_str: The raw cell value; converted with ``str()`` first.

    Returns:
        The torque as a float, or ``np.nan`` if the remaining text is not a
        valid number.
    """
    try:
        cleaned = str(torque_str)
        # Remove the unit, the thousands separator and the "or more" marker.
        for token in ('Nm', ',', '+'):
            cleaned = cleaned.replace(token, '')
        return float(cleaned.strip())
    except (ValueError, TypeError):
        return np.nan

# Derive the three numeric columns from their raw text columns, then keep
# only the rows where every one of them could be parsed (no NaN).
df = df.assign(
    Price_Numeric=df['Cars Prices'].apply(clean_price),
    Horsepower_Numeric=df['HorsePower'].apply(clean_horsepower),
    Torque_Numeric=df['Torque'].apply(clean_torque),
).dropna(subset=['Price_Numeric', 'Horsepower_Numeric', 'Torque_Numeric'])

# Apply one of matplotlib's built-in style sheets to every figure below
plt.style.use('bmh')

# 1. Line chart: horsepower against price, with rows ordered by ascending price
df_sorted = df.sort_values('Price_Numeric')
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_sorted['Price_Numeric'], df_sorted['Horsepower_Numeric'], linewidth=2)
ax.set_title('Horsepower Trends Across Price Ranges', fontsize=12, pad=15)
ax.set_xlabel('Car Price (USD)', fontsize=10)
ax.set_ylabel('Horsepower', fontsize=10)
ax.grid(True)
# Render x ticks as whole dollar amounts with thousands separators
ax.xaxis.set_major_formatter(
    plt.FuncFormatter(lambda value, _pos: f'${value:,.0f}')
)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('line_chart.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 2. Bar chart: the ten companies with the highest mean horsepower
mean_hp_by_company = df.groupby('Company Names')['Horsepower_Numeric'].mean()
top_companies = mean_hp_by_company.nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_companies.index, y=top_companies.values, ax=ax)
ax.set_title('Average Horsepower by Company (Top 10)', fontsize=12, pad=15)
ax.set_xlabel('Company', fontsize=10)
ax.set_ylabel('Average Horsepower', fontsize=10)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('bar_chart.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 3. Histogram: how car prices are distributed
# Only cars priced below $1,000,000 are plotted so extreme values do not
# stretch the x-axis.
is_below_cap = df['Price_Numeric'] < 1_000_000
price_data = df[is_below_cap]

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=price_data, x='Price_Numeric', bins=30, ax=ax)
ax.set_title('Distribution of Car Prices', fontsize=12, pad=15)
ax.set_xlabel('Price (USD)', fontsize=10)
ax.set_ylabel('Count', fontsize=10)
# Render x ticks as whole dollar amounts with thousands separators
ax.xaxis.set_major_formatter(
    plt.FuncFormatter(lambda value, _pos: f'${value:,.0f}')
)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('histogram.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 4. Scatter plot: torque against horsepower, one point per row
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='Horsepower_Numeric',
    y='Torque_Numeric',
    alpha=0.6,  # semi-transparent markers so overlapping points stay visible
    ax=ax,
)
ax.set_title('Horsepower vs Torque', fontsize=12, pad=15)
ax.set_xlabel('Horsepower', fontsize=10)
ax.set_ylabel('Torque (Nm)', fontsize=10)
ax.grid(True)
fig.tight_layout()
fig.savefig('scatter_plot.png', dpi=300, bbox_inches='tight')
plt.close(fig)

print("Visualizations have been created and saved as PNG files.")

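# Captured notebook output (appears to be the tail of a library warning — confirm):
#   with pd.option_context('mode.use_inf_as_na', True):
#
# Captured notebook output (stdout):
# Visualizations have been created and saved as PNG files.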
