# Exploratory Data Analysis (EDA)
## Fuel Consumption Dataset

This notebook performs comprehensive exploratory data analysis on the Fuel Consumption dataset using R.


In [None]:
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(corrplot)
library(VIM)
library(gridExtra)
library(readr)

# Report warnings as soon as they are raised rather than discarding them.
# With warn = -1 every warning in the session is dropped, so problems that R
# only signals as warnings would go unnoticed during the analysis.
options(warn = 1)


In [None]:
# Load the dataset
# The path is relative, so it is resolved against the current working
# directory.
data_path <- "../../data/FuelConsumption.csv"

# Fail early with the offending path in the error message; read.csv() reports
# the file name only in an accompanying warning, which is easy to miss.
if (!file.exists(data_path)) {
  stop(
    "Dataset not found: ", data_path,
    " (working directory: ", getwd(), ")",
    call. = FALSE
  )
}

df <- read.csv(data_path, stringsAsFactors = FALSE)

# Clean column names (remove leading/trailing whitespace)
colnames(df) <- trimws(colnames(df))

cat("Dataset loaded successfully!\n")
cat("Shape:", nrow(df), "rows,", ncol(df), "columns\n")
cat("Columns:", paste(colnames(df), collapse = ", "), "\n")


## 1. Data Overview


In [None]:
# Preview the first 10 observations of the loaded data frame
head(x = df, n = 10L)


In [None]:
# Compact structure of the data frame, followed by per-column summaries
str(object = df)
summary(object = df)


## 2. Data Quality Assessment


In [None]:
# Check for missing values
# Counts NA entries per column. Only is.na() is tested, so blank strings or
# sentinel codes are not counted as missing here.
missing_values <- colSums(is.na(df))
if (any(missing_values > 0)) {
  print("Missing Values:")
  print(missing_values[missing_values > 0])
} else {
  # "\u2713" is the check mark; the escape keeps the source ASCII-only so the
  # symbol cannot be mangled by a file-encoding mismatch.
  cat("\u2713 No missing values found!\n")
}

# Check for duplicates
# duplicated() marks each row that repeats an earlier, fully identical row.
cat("Number of duplicate rows:", sum(duplicated(df)), "\n")


## 3. Distribution Analysis


In [None]:
# Create output directory (including any missing parent directories)
figures_dir <- "../../outputs/figures"
if (!dir.exists(figures_dir)) {
  dir.create(figures_dir, recursive = TRUE)
}

# Build one histogram using the styling shared by all distribution plots.
#   data    - data frame holding the column
#   column  - name of the column to plot, given as a string
#   fill    - bar fill colour
#   title   - plot title
#   x_label - x-axis label
# Returns a ggplot object. Stops with a clear message if `column` is not a
# column of `data`, instead of failing later when the plot is rendered.
plot_distribution <- function(data, column, fill, title, x_label) {
  if (!column %in% names(data)) {
    stop("Column not found in data: ", column, call. = FALSE)
  }
  # .data[[column]] looks the column up by its string name inside aes()
  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(bins = 30, fill = fill, alpha = 0.7) +
    labs(title = title, x = x_label, y = "Frequency") +
    theme_minimal()
}

# Distribution plots
p1 <- plot_distribution(
  df, "ENGINE.SIZE", "steelblue",
  "Distribution of Engine Size", "Engine Size (L)"
)
p2 <- plot_distribution(
  df, "CYLINDERS", "coral",
  "Distribution of Cylinders", "Number of Cylinders"
)
p3 <- plot_distribution(
  df, "FUEL.CONSUMPTION", "green",
  "Distribution of Fuel Consumption", "Fuel Consumption (L/100km)"
)
p4 <- plot_distribution(
  df, "COEMISSIONS", "purple",
  "Distribution of CO2 Emissions", "CO2 Emissions (g/km)"
)

# Lay the four histograms out on a 2 x 2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2)


## 4. Correlation Analysis


In [None]:
# Correlation matrix
# Correlations between the selected columns, computed on the rows that have
# no missing value in any of them.
numeric_columns <- c(
  "ENGINE.SIZE", "CYLINDERS", "FUEL.CONSUMPTION", "COEMISSIONS"
)
numerical_data <- df[, numeric_columns]
correlation_matrix <- cor(numerical_data, use = "complete.obs")

# Visualize: upper triangle only, coloured cells with the coefficient printed
# in black, variables ordered by hierarchical clustering
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.cex = 0.8,
  addCoef.col = "black"
)

print("Correlation Matrix:")
print(round(correlation_matrix, digits = 3))


## 5. Summary and Insights


In [None]:
# Key insights
cat("=== KEY INSIGHTS ===\n\n")
cat("1. Dataset contains", nrow(df), "records with", ncol(df), "features\n")

# na.rm = TRUE for consistency with the averages below: a single missing year
# would otherwise turn both ends of the period into NA.
year_range <- range(df$Year, na.rm = TRUE)
cat("2. Time period:", year_range[1], "-", year_range[2], "\n")

cat("3. Number of unique makes:", length(unique(df$MAKE)), "\n")
cat("4. Number of unique models:", length(unique(df$MODEL)), "\n")
cat("5. Average fuel consumption:",
    round(mean(df$FUEL.CONSUMPTION, na.rm = TRUE), 2), "L/100km\n")
cat("6. Average CO2 emissions:",
    round(mean(df$COEMISSIONS, na.rm = TRUE), 2), "g/km\n")

# Determine the strongest pairwise correlation from the data instead of
# assuming which pair it is. Pairwise deletion gives each pair the same value
# as cor(x, y, use = "complete.obs") on those two columns alone.
insight_labels <- c(
  ENGINE.SIZE = "Engine Size",
  CYLINDERS = "Cylinders",
  FUEL.CONSUMPTION = "Fuel Consumption",
  COEMISSIONS = "CO2 Emissions"
)
pair_cor <- cor(df[names(insight_labels)], use = "pairwise.complete.obs")
# Keep each pair once and drop the diagonal (always 1)
pair_cor[!upper.tri(pair_cor)] <- NA
# Strength is judged by absolute value; which.max() skips the NA cells
strongest <- arrayInd(which.max(abs(pair_cor)), dim(pair_cor))
if (nrow(strongest) == 0) {
  # No pair produced a usable coefficient
  cat("7. Strongest correlation: not available\n")
} else {
  row_id <- strongest[1, 1]
  col_id <- strongest[1, 2]
  cat("7. Strongest correlation:",
      insight_labels[[row_id]], "vs", insight_labels[[col_id]], "=",
      round(pair_cor[row_id, col_id], 3), "\n")
}
