# 📊 Stage 3: Exploratory Data Analysis (EDA)

This notebook explores pricing trends in the used cars dataset to support modeling and pricing strategy development.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the cleaned dataset. The value is the path this notebook was written
# against; change this one constant to run the notebook from another location.
DATA_PATH = '/content/used_cars_cleaned.csv'

# Load the cleaned dataset
df = pd.read_csv(DATA_PATH)

# Apply the seaborn theme used by every plot below. `set_theme` is the preferred
# entry point; `sns.set` is only an alias for it and may be removed in the future.
sns.set_theme(style='whitegrid')


## 📋 Dataset Overview

In [None]:
# Print the frame's structural summary (df.info writes to stdout).
df.info()

# Descriptive statistics for every column, not just the numeric ones; leaving the
# table as the cell's last expression lets the notebook render it.
overview_stats = df.describe(include='all')
overview_stats

## 🧾 Dataset Summary

In [None]:
# Headline figures: row count, number of distinct manufacturers, and the span of
# the 'year' column.
n_vehicles = len(df)
n_manufacturers = df['manufacturer'].nunique()
year_col = df['year']
first_year, last_year = int(year_col.min()), int(year_col.max())

print(f"Total vehicles: {n_vehicles}")
print(f"Unique manufacturers: {n_manufacturers}")
print(f"Year range: {first_year} - {last_year}")

## 📊 Summary Tables

In [None]:
# Count, mean and median price for each condition, highest mean price first
price_by_condition = (
    df.groupby('condition')['price']
    .agg(['count', 'mean', 'median'])
    .sort_values(by='mean', ascending=False)
)
price_by_condition

In [None]:
# Count, mean and median price for each manufacturer; show the 15 with the
# highest mean price
price_by_manufacturer = (
    df.groupby('manufacturer')['price']
    .agg(['count', 'mean', 'median'])
    .sort_values(by='mean', ascending=False)
)
price_by_manufacturer.head(15)

In [None]:
# Count, mean and median price for each year; show the 20 largest year values,
# in descending order
price_by_year = (
    df.groupby('year')['price']
    .agg(['count', 'mean', 'median'])
    .sort_index(ascending=False)
)
price_by_year.head(20)

## 💰 Price Distribution

In [None]:
# Histogram of prices (50 bins) with a KDE curve overlaid, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['price'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Vehicle Prices')
ax.set_xlabel('Price (USD)')
ax.set_ylabel('Frequency')
plt.show()

## 📉 Odometer Distribution

In [None]:
# Histogram of odometer readings (50 bins) with a KDE curve overlaid, drawn on an
# explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['odometer'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Vehicle Mileage')
ax.set_xlabel('Odometer (Miles)')
ax.set_ylabel('Frequency')
plt.show()

## 📈 Price vs. Vehicle Age

In [None]:
# Scatter of price against vehicle age; alpha=0.5 makes overlapping points
# show up as darker regions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df, x='vehicle_age', y='price', alpha=0.5, ax=ax)
ax.set_title('Price vs. Vehicle Age')
ax.set_xlabel('Vehicle Age (Years)')
ax.set_ylabel('Price (USD)')
plt.show()

## 🏷️ Price Distribution by Top 10 Manufacturers

In [None]:
# The ten manufacturers with the most rows, most frequent first
# (value_counts sorts by count in descending order).
top_makes = df['manufacturer'].value_counts().nlargest(10).index
top_makes_df = df[df['manufacturer'].isin(top_makes)]

plt.figure(figsize=(12, 6))
# Pass an explicit `order` so the boxes run from most- to least-listed manufacturer
# rather than in whatever order the manufacturers first appear in the data.
sns.boxplot(x='manufacturer', y='price', data=top_makes_df, order=list(top_makes))
plt.xticks(rotation=45)
plt.title('Price Distribution by Top 10 Manufacturers')
plt.xlabel('Manufacturer')
plt.ylabel('Price (USD)')
plt.show()

## 🔧 Price by Condition

In [None]:
# One box per condition value showing the spread of prices, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='condition', y='price', ax=ax)
ax.set_title('Price Distribution by Condition')
ax.set_xlabel('Condition')
ax.set_ylabel('Price (USD)')
plt.show()

## 🔗 Correlation Matrix

In [None]:
# Pairwise correlations (DataFrame.corr default method) between price and the
# selected features.
corr_matrix = df[['price', 'odometer', 'vehicle_age', 'price_per_mile', 'is_clean_title']].corr()

plt.figure(figsize=(8, 6))
# Pin the colour scale to the full [-1, 1] correlation range so the neutral midpoint
# of the diverging 'coolwarm' palette always corresponds to zero correlation, instead
# of the midpoint of whatever values happen to be in the matrix. fmt='.2f' gives every
# cell the same two-decimal annotation.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()