# Univariate, Bivariate, and Multivariate Analysis

This notebook performs:
- Univariate Analysis: Analysis of individual variables
- Bivariate Analysis: Analysis of relationships between two variables
- Multivariate Analysis: Analysis of relationships among multiple variables


In [None]:
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(corrplot)
library(GGally)
library(gridExtra)

# Load the dataset.
# check.names = FALSE keeps the raw header text so that trimws() can actually
# strip stray leading/trailing whitespace. With the default check.names = TRUE,
# read.csv() has already converted that whitespace to "." (a header "X "
# becomes "X."), which leaves trimws() with nothing to trim.
df <- read.csv(
  "../../data/FuelConsumption.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# Trim first, then build syntactic, unique names (inner spaces become "."),
# i.e. the dotted naming scheme the rest of the notebook refers to.
colnames(df) <- make.names(trimws(colnames(df)), unique = TRUE)

# Numeric columns analysed in the sections below
numerical_cols <- c(
  "ENGINE.SIZE", "CYLINDERS", "FUEL.CONSUMPTION", "COEMISSIONS"
)

# Fail fast with a readable message if an expected column is absent, rather
# than producing NA statistics or a plotting error further down.
missing_cols <- setdiff(numerical_cols, colnames(df))
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from FuelConsumption.csv: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}


## 1. Univariate Analysis


In [None]:
# Univariate statistics
# Helper: print one indented "label: value" line, rounded to 2 decimals.
# `end` is the trailing text handed to cat() (a newline by default).
report_stat <- function(label, value, end = "\n") {
  cat(paste0("  ", label, ":"), round(value, 2), end)
}

cat("Univariate Statistics:\n\n")
for (col in numerical_cols) {
  values <- df[[col]]
  cat(col, ":\n")
  # Location and spread
  report_stat("Mean", mean(values, na.rm = TRUE))
  report_stat("Median", median(values, na.rm = TRUE))
  report_stat("Std", sd(values, na.rm = TRUE))
  # Shape of the distribution (e1071 is called via :: and is not attached)
  report_stat("Skewness", e1071::skewness(values, na.rm = TRUE))
  report_stat("Kurtosis", e1071::kurtosis(values, na.rm = TRUE), end = "\n\n")
}

# Distribution plots
# Helper: 30-bin, semi-transparent histogram layer in the given fill colour.
histogram_layer <- function(fill_colour) {
  geom_histogram(bins = 30, fill = fill_colour, alpha = 0.7)
}

p1 <- ggplot(data = df, mapping = aes(x = ENGINE.SIZE)) +
  histogram_layer("steelblue") +
  theme_minimal() +
  labs(
    title = "Distribution of Engine Size",
    x = "Engine Size (L)",
    y = "Frequency"
  )

p2 <- ggplot(data = df, mapping = aes(x = FUEL.CONSUMPTION)) +
  histogram_layer("green") +
  theme_minimal() +
  labs(
    title = "Distribution of Fuel Consumption",
    x = "Fuel Consumption (L/100km)",
    y = "Frequency"
  )

# Show both histograms side by side
grid.arrange(grobs = list(p1, p2), ncol = 2)


## 2. Bivariate Analysis


In [None]:
# Correlation coefficients
# Pearson correlation of each predictor with the target column, computed on
# the rows where both values are present.
cat("Bivariate Correlation Analysis:\n\n")
target <- "COEMISSIONS"
target_values <- df[[target]]
for (col in c("ENGINE.SIZE", "CYLINDERS", "FUEL.CONSUMPTION")) {
  corr <- cor(x = df[[col]], y = target_values, use = "complete.obs")
  cat(col, " vs ", target, ": r =", round(corr, 4), "\n")
}

# Scatter plots
# Helper: red least-squares trend line without a confidence band.
trend_line <- function() {
  geom_smooth(method = "lm", color = "red", se = FALSE)
}

p1 <- ggplot(data = df, mapping = aes(x = ENGINE.SIZE, y = FUEL.CONSUMPTION)) +
  geom_point(alpha = 0.5) +
  trend_line() +
  theme_minimal() +
  labs(
    title = "Engine Size vs Fuel Consumption",
    x = "Engine Size (L)",
    y = "Fuel Consumption (L/100km)"
  )

p2 <- ggplot(data = df, mapping = aes(x = FUEL.CONSUMPTION, y = COEMISSIONS)) +
  geom_point(alpha = 0.5, color = "green") +
  trend_line() +
  theme_minimal() +
  labs(
    title = "Fuel Consumption vs CO2 Emissions",
    x = "Fuel Consumption (L/100km)",
    y = "CO2 Emissions (g/km)"
  )

# Show both scatter plots side by side
grid.arrange(grobs = list(p1, p2), ncol = 2)


## 3. Multivariate Analysis


In [None]:
# Pair plot
# Base-graphics scatter-plot matrix of every pair of numeric columns,
# drawn with small solid points.
numerical_data <- df[, numerical_cols]
pairs(x = numerical_data, cex = 0.5, pch = 19)

# Correlation heatmap
# Correlations use only rows that are complete across all selected columns.
correlation_matrix <- cor(x = numerical_data, use = "complete.obs")

# Upper triangle only, variables reordered by hierarchical clustering,
# coefficients printed in black on the coloured cells.
corrplot(
  corr = correlation_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.cex = 0.8
)

# Same matrix as text, rounded to three decimals
print("Correlation Matrix:")
print(round(correlation_matrix, digits = 3))
