# Univariate, Bivariate, and Multivariate Analysis

This notebook performs:
- Univariate Analysis: Analysis of individual variables
- Bivariate Analysis: Analysis of relationships between two variables
- Multivariate Analysis: Analysis of relationships among multiple variables


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Notebook-wide presentation defaults.
# Every warning is silenced from here on, so library notices will not
# show up in cell output.
warnings.filterwarnings(action='ignore')

# Seaborn "whitegrid" theme and a 14x8 default figure size for any figure
# that does not pass its own `figsize`.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})
%matplotlib inline

# Load the fuel-consumption CSV and strip leading/trailing whitespace from
# every column header, so later cells can select columns by their trimmed
# names.
df = pd.read_csv('../../data/FuelConsumption.csv').rename(columns=str.strip)


## 1. Univariate Analysis


In [None]:
# Univariate analysis for numerical variables:
# one density-normalised histogram per column, with a Gaussian KDE curve
# overlaid when it can be estimated. The 2x2 figure is saved to disk and shown.
import os

from scipy.stats import gaussian_kde

numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']

# Create output directory if it doesn't exist
os.makedirs('../../outputs/figures', exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()  # flatten the 2x2 grid so panels can be indexed 0..3

for idx, col in enumerate(numerical_cols):
    ax = axes[idx]
    data = df[col].dropna()

    # Histogram normalised to a density so it shares a scale with the KDE.
    ax.hist(data, bins=30, alpha=0.7, edgecolor='black', density=True)

    # The KDE overlay is best-effort: gaussian_kde cannot be built from
    # fewer than two points or from constant data. Only those failures are
    # tolerated; anything else is a real error and should surface.
    try:
        kde = gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 100)
        density = kde(x_range)
    except (ValueError, np.linalg.LinAlgError) as err:
        print(f"Skipping KDE for {col}: {err}")
    else:
        ax.plot(x_range, density, 'r-', linewidth=2, label='KDE')
        # Only request a legend when there is a labelled artist to list.
        ax.legend()

    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/univariate_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Univariate statistics
# Print location, spread and shape summaries for each numerical column.
numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']
print("Univariate Statistics:\n")
for col in numerical_cols:
    series = df[col]
    # Quartiles are needed twice (reported directly and for the IQR).
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    report = [
        f"{col}:",
        f"  Mean: {series.mean():.2f}",
        f"  Median: {series.median():.2f}",
        f"  Std: {series.std():.2f}",
        f"  Variance: {series.var():.2f}",
        f"  Skewness: {series.skew():.2f}",
        f"  Kurtosis: {series.kurtosis():.2f}",
        f"  Q1: {q1:.2f}, Q3: {q3:.2f}",
        f"  IQR: {q3 - q1:.2f}\n",
    ]
    print("\n".join(report))


## 2. Bivariate Analysis


In [None]:
# Scatter plots for bivariate analysis
# Four panels in a 2x2 grid; three of them carry a least-squares trend line.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

# One entry per panel:
# (x column, y column, x label, y label, marker colour or None, draw trend?)
# A colour of None leaves matplotlib's default marker colour in place.
panels = [
    ('ENGINE SIZE', 'FUEL CONSUMPTION', 'Engine Size', 'Fuel Consumption', None, True),
    ('CYLINDERS', 'FUEL CONSUMPTION', 'Cylinders', 'Fuel Consumption', None, False),
    ('FUEL CONSUMPTION', 'COEMISSIONS', 'Fuel Consumption', 'CO2 Emissions', 'green', True),
    ('ENGINE SIZE', 'COEMISSIONS', 'Engine Size', 'CO2 Emissions', 'orange', True),
]

for ax, (x_col, y_col, x_label, y_label, color, with_trend) in zip(axes, panels):
    scatter_kwargs = {'alpha': 0.5}
    if color is not None:
        scatter_kwargs['color'] = color
    ax.scatter(df[x_col], df[y_col], **scatter_kwargs)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(f'{x_label} vs {y_label}')
    ax.grid(True, alpha=0.3)

    if not with_trend:
        continue

    # Fit only on rows where both values are present: a single NaN would
    # otherwise break the least-squares fit.
    pairs = df[[x_col, y_col]].dropna()
    if len(pairs) < 2:
        continue  # a line cannot be fitted to fewer than two points
    slope, intercept = np.polyfit(pairs[x_col], pairs[y_col], 1)

    # A straight line needs only its two end points. Drawing it through
    # every (unsorted) x value would retrace the line back and forth and
    # smear the dashed style.
    x_ends = np.array([pairs[x_col].min(), pairs[x_col].max()])
    ax.plot(x_ends, slope * x_ends + intercept, "r--", alpha=0.8)

plt.tight_layout()
plt.savefig('../../outputs/figures/bivariate_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Correlation coefficients
# Report the correlation of each predictor with the emissions target, then
# the correlation of fuel consumption with every other numerical column.
print("Bivariate Correlation Analysis:\n")
target = 'COEMISSIONS'
target_values = df[target]
for col in ('ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION'):
    corr = df[col].corr(target_values)
    print(f"{col} vs {target}: r = {corr:.4f}")

# Additional bivariate analysis
print("\nFuel Consumption vs Other Variables:")
fuel_values = df['FUEL CONSUMPTION']
for col in ('ENGINE SIZE', 'CYLINDERS', 'COEMISSIONS'):
    corr = fuel_values.corr(df[col])
    print(f"  FUEL CONSUMPTION vs {col}: r = {corr:.4f}")


## 3. Multivariate Analysis


In [None]:
# Pair plot for multivariate analysis
# Scatter matrix of the four numerical columns with KDE curves on the
# diagonal; the figure is titled, saved and shown.
numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']
pair_data = df[numerical_cols]
sns.pairplot(
    data=pair_data,
    height=2.5,
    diag_kind='kde',
    plot_kws=dict(alpha=0.6),
)
# y=1.02 places the title slightly above the top row of panels.
plt.suptitle(
    'Pair Plot - Multivariate Analysis',
    y=1.02,
    fontsize=16,
    fontweight='bold',
)
plt.savefig(
    '../../outputs/figures/multivariate_pairplot.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()


In [None]:
# Heatmap with correlation
# Annotated correlation matrix of the four numerical columns, drawn on a
# diverging colour scale fixed to [-1, 1] and centred on zero.
numerical_cols = ['ENGINE SIZE', 'CYLINDERS', 'FUEL CONSUMPTION', 'COEMISSIONS']
correlation_matrix = df[numerical_cols].corr()

heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Multivariate Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(
    '../../outputs/figures/multivariate_correlation.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()


In [None]:
# Multivariate analysis by groups (e.g., by fuel type)
# Four panels in a 2x2 grid, each splitting a numerical relationship by a
# categorical column (fuel type, vehicle class or transmission).
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
title_style = {'fontsize': 12, 'fontweight': 'bold'}

# Both vehicle-class panels use the same subset: rows belonging to the five
# most frequent vehicle classes. Compute it once and reuse it.
top_classes = df['VEHICLE CLASS'].value_counts().head(5).index
df_filtered = df[df['VEHICLE CLASS'].isin(top_classes)]
# Aliases kept so any other cell that refers to these names still works.
top_5_classes = top_classes
df_class_filtered = df_filtered

# Fuel Consumption by Fuel Type and Cylinders
sns.boxplot(data=df, x='CYLINDERS', y='FUEL CONSUMPTION', hue='FUEL', ax=axes[0, 0])
axes[0, 0].set_title('Fuel Consumption by Cylinders and Fuel Type', **title_style)
axes[0, 0].set_xlabel('Number of Cylinders')
axes[0, 0].set_ylabel('Fuel Consumption (L/100km)')
axes[0, 0].legend(title='Fuel Type')

# CO2 Emissions by Vehicle Class (top 5)
sns.boxplot(data=df_filtered, x='VEHICLE CLASS', y='COEMISSIONS', ax=axes[0, 1])
axes[0, 1].set_title('CO2 Emissions by Vehicle Class (Top 5)', **title_style)
axes[0, 1].set_xlabel('Vehicle Class')
axes[0, 1].set_ylabel('CO2 Emissions (g/km)')
# Rotate the tick labels by 45 degrees.
axes[0, 1].tick_params(axis='x', rotation=45)

# Engine Size vs Fuel Consumption by Transmission
sns.scatterplot(data=df, x='ENGINE SIZE', y='FUEL CONSUMPTION', hue='TRANSMISSION',
                alpha=0.6, ax=axes[1, 0])
axes[1, 0].set_title('Engine Size vs Fuel Consumption by Transmission', **title_style)
axes[1, 0].set_xlabel('Engine Size (L)')
axes[1, 0].set_ylabel('Fuel Consumption (L/100km)')
# Legend is anchored just outside the right edge of the axes.
axes[1, 0].legend(title='Transmission', bbox_to_anchor=(1.05, 1), loc='upper left')

# Fuel Consumption vs CO2 Emissions by Vehicle Class
sns.scatterplot(data=df_class_filtered, x='FUEL CONSUMPTION', y='COEMISSIONS',
                hue='VEHICLE CLASS', alpha=0.6, ax=axes[1, 1])
axes[1, 1].set_title('Fuel Consumption vs CO2 Emissions by Vehicle Class', **title_style)
axes[1, 1].set_xlabel('Fuel Consumption (L/100km)')
axes[1, 1].set_ylabel('CO2 Emissions (g/km)')
axes[1, 1].legend(title='Vehicle Class', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.savefig('../../outputs/figures/multivariate_grouped.png', dpi=300, bbox_inches='tight')
plt.show()
