# Exploratory Data Analysis (EDA)
## Fuel Consumption Dataset

This notebook performs comprehensive exploratory data analysis on the Fuel Consumption dataset.


In [None]:
# Import necessary libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import gaussian_kde
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


In [None]:
# Read the raw CSV into the frame every later cell works on.
df = pd.read_csv('../../data/FuelConsumption.csv')

# Header names may carry stray surrounding whitespace; strip it so columns
# can be addressed by their clean names.
df.columns = [name.strip() for name in df.columns]

# Short load report: shape, column names and the dtype of each column.
for report_line in (
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"\nData types:\n{df.dtypes}",
):
    print(report_line)


## 1. Data Overview


In [None]:
# Preview the first 10 rows. The call is left as the cell's last expression
# so the notebook renders the returned frame as a table.
df.head(10)


In [None]:
# Print pandas' structural summary of the frame (per-column non-null counts
# and dtypes) as a quick check before the data-quality section.
df.info()


In [None]:
# Descriptive statistics for every column; include='all' asks describe() to
# cover non-numeric columns as well as numeric ones.
df.describe(include='all')


## 2. Data Quality Assessment


In [None]:
# Per-column null tally and the share of all rows it represents (percent).
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100

# Tabulate both measures, keeping only the columns that actually have gaps.
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent,
}).loc[lambda table: table['Missing Count'] > 0]

# Report the table, or state explicitly that the data is complete.
if missing_df.empty:
    print("No missing values found!")
else:
    print("Missing Values:")
    print(missing_df)


In [None]:
# Flag each row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_count}")


## 3. Distribution Analysis


In [None]:
# Distribution of numerical variables
numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']

# Create output directory if it doesn't exist
os.makedirs('../../outputs/figures', exist_ok=True)

# Histograms with KDE: one density-normalised histogram per numeric column.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    data = df[col].dropna()
    ax.hist(data, bins=30, edgecolor='black', alpha=0.7, density=True)

    # Overlay a Gaussian KDE. The estimate cannot be fitted to fewer than two
    # points (ValueError) or to zero-variance data (LinAlgError); in those
    # cases only the overlay for this column is skipped, and the reason is
    # reported instead of being silently swallowed.
    try:
        kde = gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 100)
        ax.plot(x_range, kde(x_range), 'r-', linewidth=2, label='KDE')
    except (ValueError, np.linalg.LinAlgError) as exc:
        print(f"KDE skipped for {col}: {exc}")
    else:
        # Only request a legend when the labelled KDE line was drawn.
        ax.legend()

    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/distribution_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Box plots for outlier detection: same columns, same 2x2 layout.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    ax.boxplot(df[col].dropna())
    ax.set_title(f'Box Plot of {col}')
    ax.set_ylabel(col)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/boxplot_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Correlation Analysis


In [None]:
# Pairwise correlations between the numeric columns.
numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap; the diverging palette is centred on zero so the sign of
# each coefficient is readable from the colour alone.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    fmt='.3f',
)
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=corr_ax, **heatmap_options)
corr_ax.set_title('Correlation Matrix of Numerical Variables')
corr_fig.tight_layout()
corr_fig.savefig('../../outputs/figures/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Plain-text copy of the same matrix.
print("Correlation Matrix:")
print(correlation_matrix)


## 5. Categorical Variable Analysis


In [None]:
# Frequency tables behind the four panels.
top_makes = df['MAKE'].value_counts().head(15)
vehicle_class = df['VEHICLE CLASS'].value_counts()
fuel = df['FUEL'].value_counts()
cylinders = df['CYLINDERS'].value_counts().sort_index()

fig, axes = plt.subplots(2, 2, figsize=(20, 12))
title_style = dict(fontsize=14, fontweight='bold')

# Top row: horizontal bars (category labels run along the y axis).
for ax, counts, title, colour in [
    (axes[0, 0], top_makes, 'Top 15 Vehicle Makes', 'steelblue'),
    (axes[0, 1], vehicle_class, 'Vehicle Class Distribution', 'coral'),
]:
    ax.barh(counts.index, counts.values, color=colour)
    ax.set_title(title, **title_style)
    ax.set_xlabel('Count')
    ax.grid(True, alpha=0.3, axis='x')

# Bottom row: vertical bars. Cylinder counts are cast to strings so they are
# drawn as evenly spaced categories rather than on a numeric axis.
for ax, labels, counts, title, x_label, colour in [
    (axes[1, 0], fuel.index, fuel, 'Fuel Type Distribution', 'Fuel Type', 'green'),
    (axes[1, 1], cylinders.index.astype(str), cylinders,
     'Cylinders Distribution', 'Number of Cylinders', 'purple'),
]:
    ax.bar(labels, counts.values, color=colour, alpha=0.7)
    ax.set_title(title, **title_style)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../../outputs/figures/categorical_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


## 6. Temporal Trend Analysis


In [None]:
# Group once by model year; every statistic below is derived from this.
by_year = df.groupby('Year')

# Per-year summary table, rounded to two decimals.
yearly_stats = by_year.agg({
    'FUEL CONSUMPTION': ['mean', 'median', 'std'],
    'COEMISSIONS': ['mean', 'median', 'std'],
    'ENGINE SIZE': 'mean',
    'CYLINDERS': 'mean'
}).round(2)

print("Yearly Statistics:")
print(yearly_stats)

# Yearly means that feed the four trend panels.
yearly_mean_fc = by_year['FUEL CONSUMPTION'].mean()
yearly_mean_co2 = by_year['COEMISSIONS'].mean()
yearly_mean_eng = by_year['ENGINE SIZE'].mean()
yearly_mean_cyl = by_year['CYLINDERS'].mean()

# (series, marker, colour, title, y-axis label) for each panel, listed in
# row-major order to match the 2x2 grid.
trend_panels = [
    (yearly_mean_fc, 'o', 'blue',
     'Average Fuel Consumption Over Years', 'Average Fuel Consumption (L/100km)'),
    (yearly_mean_co2, 's', 'red',
     'Average CO2 Emissions Over Years', 'Average CO2 Emissions (g/km)'),
    (yearly_mean_eng, '^', 'green',
     'Average Engine Size Over Years', 'Average Engine Size (L)'),
    (yearly_mean_cyl, 'd', 'orange',
     'Average Number of Cylinders Over Years', 'Average Number of Cylinders'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
for ax, (series, marker, colour, title, y_label) in zip(axes.flat, trend_panels):
    ax.plot(series.index, series.values, marker=marker,
            color=colour, linewidth=2, markersize=8)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Year')
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/yearly_trends.png', dpi=300, bbox_inches='tight')
plt.show()


## 7. Outlier Detection


In [None]:
# Detect outliers using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Select the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to test.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``data`` strictly below the lower or strictly above the
        upper fence. Values equal to a fence are not reported.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

print("Outlier Detection (IQR Method):\n")
numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']
for col in numerical_cols:
    outliers, lower, upper = detect_outliers_iqr(df, col)
    # One print call per column; the trailing empty string produces the blank
    # line that separates consecutive columns.
    print(
        f"{col}:",
        f"  Lower bound: {lower:.2f}, Upper bound: {upper:.2f}",
        f"  Number of outliers: {len(outliers)}",
        f"  Percentage: {(len(outliers)/len(df))*100:.2f}%",
        "",
        sep="\n",
    )


## 8. Summary and Insights


In [None]:
# Key insights

# Find the most strongly correlated pair of numeric columns (largest absolute
# coefficient) rather than asserting one by hand, so insight 8 always matches
# the data that was actually loaded.
numeric_features = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']
feature_corr = df[numeric_features].corr()
pair_corr = {
    (left, right): feature_corr.loc[left, right]
    for pos, left in enumerate(numeric_features)
    for right in numeric_features[pos + 1:]  # each unordered pair once, no self-pairs
}
strongest_pair = max(pair_corr, key=lambda pair: abs(pair_corr[pair]))
strongest_value = pair_corr[strongest_pair]

print("=== KEY INSIGHTS ===\n")
print(f"1. Dataset contains {len(df)} records with {len(df.columns)} features")
print(f"2. Time period: {df['Year'].min()} - {df['Year'].max()}")
print(f"3. Number of unique makes: {df['MAKE'].nunique()}")
print(f"4. Number of unique models: {df['MODEL'].nunique()}")
print(f"5. Number of unique vehicle classes: {df['VEHICLE CLASS'].nunique()}")
print(f"6. Average fuel consumption: {df['FUEL CONSUMPTION'].mean():.2f} L/100km")
print(f"7. Average CO2 emissions: {df['COEMISSIONS'].mean():.2f} g/km")
print(f"8. Strongest correlation: {strongest_pair[0]} vs {strongest_pair[1]} = {strongest_value:.3f}")
print(f"9. Engine Size range: {df['ENGINE SIZE'].min()} - {df['ENGINE SIZE'].max()} L")
print(f"10. Cylinders range: {df['CYLINDERS'].min()} - {df['CYLINDERS'].max()}")

# Categorical variable analysis
print("\n=== CATEGORICAL VARIABLES ===")
print(f"\nFuel Types: {df['FUEL'].unique()}")
print(f"\nTransmission Types: {df['TRANSMISSION'].unique()}")
print("\nTop 10 Vehicle Makes:")
print(df['MAKE'].value_counts().head(10))
print("\nVehicle Classes:")
print(df['VEHICLE CLASS'].value_counts())
