# In [None]:
# SUMMER ANALYTICS 2025 - Week 1 Assignment

# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset and show the first rows.
# Results are printed explicitly: a bare expression only displays when it is
# the last statement of a notebook cell, and never in a plain script.
cars = pd.read_csv("Cars.csv")
print(cars.head())

# Basic information about the dataset (info() prints its own report)
cars.info()

# Summary statistics
print(cars.describe())

# Null values per column.
# NOTE: this count is taken before 'horsepower' is coerced to numeric below,
# so entries that only become NaN through coercion are not included here.
print(cars.isnull().sum())

# Cleaning: work on a copy so the raw `cars` frame is left untouched.
df_clean = cars.copy()

# Coerce horsepower to numeric; values that cannot be parsed become NaN.
df_clean['horsepower'] = pd.to_numeric(df_clean['horsepower'], errors='coerce')

# Replace missing horsepower with the column mean. The result is assigned
# back to the column instead of calling fillna(inplace=True) on the selection:
# the in-place form acts on an intermediate object, is deprecated in
# pandas 2.x, and does not update the DataFrame under Copy-on-Write.
df_clean['horsepower'] = df_clean['horsepower'].fillna(df_clean['horsepower'].mean())

# Each answer is printed explicitly; as bare expressions in the middle of a
# cell/script the results would be computed and then discarded.

# 1. How many cars are there for each origin?
print(df_clean['origin'].value_counts())

# 2. What is the average MPG of cars from each origin?
print(df_clean.groupby('origin')['mpg'].mean())

# 3. What is the most common number of cylinders?
# value_counts() tallies each cylinder count; idxmax() returns the label
# (the cylinder count) with the largest tally.
print(df_clean['cylinders'].value_counts().idxmax())

# 4. What is the correlation matrix of the numeric features?
# Computed once and reused for the heatmap below; numeric_only=True leaves
# non-numeric columns out of the calculation.
corr_matrix = df_clean.corr(numeric_only=True)
print(corr_matrix)

# 5. Plotting a heatmap of the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix Heatmap")
plt.show()

# 6. Pairwise scatter/distribution grid for the four continuous features
pair_columns = ['mpg', 'horsepower', 'weight', 'acceleration']
sns.pairplot(df_clean[pair_columns])
plt.show()

# 7. Histogram of MPG (20 bins) with a KDE curve overlaid.
# histplot returns the Axes it drew on, so labels are set on that object.
mpg_axes = sns.histplot(df_clean['mpg'], bins=20, kde=True)
mpg_axes.set_title("Distribution of MPG")
mpg_axes.set_xlabel("MPG")
mpg_axes.set_ylabel("Count")
plt.show()

# 8. Mean MPG per model year, drawn as a line with a marker at each year
avg_mpg_year = df_clean.groupby('model_year')['mpg'].mean()

# Draw on the current Axes through an explicit handle.
trend_axes = plt.gca()
trend_axes.plot(avg_mpg_year.index, avg_mpg_year.values, marker='o')
trend_axes.set_title("Average MPG over Model Years")
trend_axes.set_xlabel("Model Year")
trend_axes.set_ylabel("Average MPG")
trend_axes.grid(True)
plt.show()

# Both rankings are printed explicitly; as bare expressions in the middle of
# a cell/script the tables would be computed and then discarded.

# 9. Top 5 most fuel-efficient cars (highest mpg first; head() keeps 5 rows)
print(df_clean.sort_values(by='mpg', ascending=False)[['name', 'mpg']].head())

# 10. Bottom 5 least fuel-efficient cars (lowest mpg first)
print(df_clean.sort_values(by='mpg')[['name', 'mpg']].head())

# 11. One MPG box per origin; the title is set on the Axes boxplot returns
origin_axes = sns.boxplot(data=df_clean, x='origin', y='mpg')
origin_axes.set_title("MPG Distribution by Origin")
plt.show()

# 12. Average horsepower for each number of cylinders
# Printed explicitly; a bare expression here would be computed and discarded.
print(df_clean.groupby('cylinders')['horsepower'].mean())

# 13. Weight (x) against MPG (y), one point per row of df_clean.
# Title and axis labels are set on the Axes returned by scatterplot.
scatter_axes = sns.scatterplot(data=df_clean, x='weight', y='mpg')
scatter_axes.set_title("Weight vs MPG")
scatter_axes.set_xlabel("Weight")
scatter_axes.set_ylabel("MPG")
plt.show()

# 14. Create a new column: power-to-weight ratio
# (horsepower divided by weight, element-wise per row)
df_clean['power_to_weight'] = df_clean['horsepower'] / df_clean['weight']
print(df_clean[['name', 'power_to_weight']].head())

# 15. Find the car with the highest power-to-weight ratio
# idxmax() gives the index label of the largest ratio; a single .loc call
# with (row label, column list) avoids chained indexing on the row.
best_ratio_idx = df_clean['power_to_weight'].idxmax()
print(df_clean.loc[best_ratio_idx, ['name', 'power_to_weight']])

# 16. Save cleaned dataset (optional for submission)
# The saved file includes the new power_to_weight column; index=False keeps
# the row index out of the CSV.
df_clean.to_csv("Cars_cleaned.csv", index=False)