# Statistical Analysis
## Descriptive, Inferential, and Exploratory Statistical Analysis

This notebook performs comprehensive statistical analysis including:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Statistical Analysis


In [None]:
# Load necessary libraries
library(dplyr)
library(psych)
library(car)

# Load the dataset
# check.names = FALSE keeps the raw CSV headers so that stray whitespace can be
# trimmed *before* the names are made syntactic. With the default
# check.names = TRUE, read.csv() has already replaced spaces with dots (a
# header such as "A B " arrives as "A.B."), so a later trimws() finds nothing
# to trim and the padded column keeps a stray trailing dot.
df <- read.csv(
  "../../data/FuelConsumption.csv",
  stringsAsFactors = FALSE,
  check.names = FALSE
)
# make.names(unique = TRUE) applies the same sanitising that check.names = TRUE
# would have performed, so headers without padding get exactly the same names
# as before (e.g. "A B" -> "A.B").
colnames(df) <- make.names(trimws(colnames(df)), unique = TRUE)


## 1. Descriptive Statistics


In [None]:
# Descriptive statistics
# Columns summarised in this cell.
numerical_cols <- c("ENGINE.SIZE", "CYLINDERS", "FUEL.CONSUMPTION", "COEMISSIONS")

# Fail early with a readable message when an expected column is absent,
# instead of the generic "undefined columns selected" error from `[`.
missing_cols <- setdiff(numerical_cols, colnames(df))
if (length(missing_cols) > 0) {
  stop(
    "Columns not found in df: ", paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Summary table produced by psych::describe() for the selected columns.
descriptive_stats <- describe(df[, numerical_cols])
print(descriptive_stats)

# Quartiles and IQR
cat("\nQuartiles and IQR:\n")
for (col in numerical_cols) {
  # A single quantile() call yields Q1, Q2 (median) and Q3 together.
  quartiles <- quantile(
    df[[col]],
    probs = c(0.25, 0.50, 0.75),
    na.rm = TRUE
  )
  q1 <- quartiles[[1]]
  q2 <- quartiles[[2]]
  q3 <- quartiles[[3]]
  # Named iqr_value (not IQR) so the stats::IQR() function is not shadowed by
  # a numeric variable left behind in the workspace.
  iqr_value <- q3 - q1
  cat("\n", col, ":\n")
  cat(
    "  Q1:", round(q1, 2), ", Q2:", round(q2, 2), ", Q3:", round(q3, 2), "\n"
  )
  cat("  IQR:", round(iqr_value, 2), "\n")
}


## 2. Inferential Statistics


In [None]:
# Normality Tests
# For every column in numerical_cols, test the null hypothesis that the
# non-missing values come from a normal distribution and print the p-value
# together with a Yes/No verdict at the 0.05 level.
cat("Normality Tests:\n")
for (col in numerical_cols) {
  values <- df[[col]]
  values <- values[!is.na(values)]
  n_obs <- length(values)

  # shapiro.test() stops with an error for fewer than 3 observations or when
  # all values are identical; skip such columns instead of aborting the loop.
  if (n_obs < 3 || length(unique(values)) < 2) {
    cat("\n", col, ":\n")
    cat("  Skipped: need at least 3 non-missing, non-constant values\n")
    next
  }

  # Shapiro-Wilk accepts sample sizes up to and including 5000.
  if (n_obs <= 5000) {
    test_result <- shapiro.test(values)
    test_name <- "Shapiro-Wilk"
  } else {
    # NOTE(review): the mean and sd are estimated from the same sample, so the
    # Kolmogorov-Smirnov p-value is only approximate (it tends to be
    # conservative); ks.test() also warns when the data contain ties. A
    # Lilliefors-corrected test would be stricter but needs an extra package.
    test_result <- ks.test(values, "pnorm", mean(values), sd(values))
    test_name <- "Kolmogorov-Smirnov"
  }

  p_value <- test_result$p.value
  cat("\n", col, " (", test_name, "):\n")
  # format.pval() prints very small p-values as "<1e-04" rather than the
  # misleading "0" that round(p, 4) produces.
  cat("  p-value:", format.pval(p_value, digits = 4, eps = 1e-4), "\n")
  cat("  Normal:", if (p_value > 0.05) "Yes" else "No", "\n")
}

# T-test: Compare fuel consumption by fuel type
# Runs a two-sample t-test (t.test() defaults, i.e. Welch's unequal-variance
# test) on FUEL.CONSUMPTION between the first two fuel types in order of
# appearance in the data.
#
# `[[` is used instead of `$` because `$` partial-matches on data frames: with
# no exact "FUEL" column, df$FUEL would silently return FUEL.CONSUMPTION.
fuel_col <- df[["FUEL"]]
# Drop NA so a missing fuel type can never be chosen as a comparison group.
fuel_types <- unique(fuel_col[!is.na(fuel_col)])
if (length(fuel_types) >= 2) {
  consumption <- df[["FUEL.CONSUMPTION"]]
  # %in% never returns NA, so rows with a missing fuel type are excluded
  # rather than being indexed as NA entries.
  group1 <- consumption[fuel_col %in% fuel_types[1]]
  group2 <- consumption[fuel_col %in% fuel_types[2]]
  group1 <- group1[!is.na(group1)]
  group2 <- group2[!is.na(group2)]

  cat("\nT-test: Fuel Consumption by Fuel Type\n")
  # State which two groups are compared; with more than two fuel types the
  # choice is simply the first two encountered.
  cat(
    "  Groups:", as.character(fuel_types[1]),
    "vs", as.character(fuel_types[2]), "\n"
  )

  # t.test() errors when either sample has fewer than 2 observations.
  if (length(group1) < 2 || length(group2) < 2) {
    cat("  Skipped: each group needs at least 2 observations\n")
  } else {
    t_test <- t.test(group1, group2)
    # format.pval() avoids printing a tiny p-value as a misleading "0".
    cat("  p-value:", format.pval(t_test$p.value, digits = 4, eps = 1e-4), "\n")
    cat("  Significant:", if (t_test$p.value < 0.05) "Yes" else "No", "\n")
  }
}


## 3. Exploratory Statistical Analysis


In [None]:
# Confidence Intervals
# Prints, for each column in numerical_cols, the sample mean and a two-sided
# 95% t-based confidence interval for the mean, computed on non-missing values.
cat("95% Confidence Intervals for Mean:\n")
for (col_name in numerical_cols) {
  observed <- df[[col_name]]
  observed <- observed[!is.na(observed)]

  n_obs <- length(observed)
  centre <- mean(observed)
  std_error <- sd(observed) / sqrt(n_obs)
  # Half-width of the interval: t quantile with n - 1 degrees of freedom
  # multiplied by the standard error of the mean.
  half_width <- qt(0.975, n_obs - 1) * std_error
  lower_bound <- centre - half_width
  upper_bound <- centre + half_width

  cat("\n", col_name, ":\n")
  cat("  Mean:", round(centre, 2), "\n")
  cat("  95% CI: [", round(lower_bound, 2), ", ", round(upper_bound, 2), "]\n")
}

# Correlation Analysis with p-values
# For each predictor, runs cor.test() (default method: Pearson) against the
# target column on rows where both values are present, then prints the
# correlation coefficient, its p-value and a significance verdict at 0.05.
cat("\nCorrelation Analysis with Significance:\n")
target <- "COEMISSIONS"
for (col in c("ENGINE.SIZE", "CYLINDERS", "FUEL.CONSUMPTION")) {
  data_subset <- df[, c(col, target)]
  data_subset <- data_subset[complete.cases(data_subset), ]
  cat("\n", col, " vs ", target, ":\n")

  # cor.test() stops with "not enough finite observations" below 3 pairs;
  # report that and carry on with the remaining predictors.
  if (nrow(data_subset) < 3) {
    cat("  Skipped: fewer than 3 complete observations\n")
    next
  }

  cor_test <- cor.test(data_subset[[col]], data_subset[[target]])
  p_value <- cor_test$p.value
  # format.pval() shows very small p-values as "<1e-04" instead of the
  # misleading "0" that round(p, 4) yields for strong correlations.
  cat(
    "  r =", round(cor_test$estimate, 4),
    ", p =", format.pval(p_value, digits = 4, eps = 1e-4), "\n"
  )
  # A constant column makes the correlation (and p-value) NA; `if` cannot
  # branch on NA, so report that case explicitly.
  verdict <- if (is.na(p_value)) {
    "NA"
  } else if (p_value < 0.05) {
    "Yes"
  } else {
    "No"
  }
  cat("  Significant:", verdict, "\n")
}
