# Univariate, Bivariate, and Multivariate Analysis
## Consumer Purchase Prediction

This notebook performs comprehensive univariate, bivariate, and multivariate analysis using R.


In [None]:
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(corrplot)
library(psych)
library(e1071)


In [None]:
#' Locate the project root by walking up the directory tree.
#'
#' Starting at `start_dir`, checks that directory and then each successive
#' parent for the existence of `marker` (a path relative to the directory
#' being checked). The search ends after `max_levels` directories have been
#' inspected or when the filesystem root is reached.
#'
#' @param start_dir Directory where the search begins. Defaults to the
#'   current working directory.
#' @param marker Relative path whose existence identifies the project root.
#' @param max_levels Maximum number of directories to inspect, counting
#'   `start_dir` itself.
#' @return The first directory containing `marker`, or `NULL` if none of the
#'   inspected directories contains it.
find_project_root <- function(start_dir = getwd(),
                              marker = file.path("data", "Advertisement.csv"),
                              max_levels = 10) {
  current_dir <- start_dir
  # seq_len() gives an empty sequence for max_levels = 0, unlike 1:max_levels.
  for (i in seq_len(max_levels)) {
    if (file.exists(file.path(current_dir, marker))) {
      return(current_dir)
    }
    parent_dir <- dirname(current_dir)
    # dirname() of the filesystem root is the root itself: nothing left to try.
    if (parent_dir == current_dir) {
      break
    }
    current_dir <- parent_dir
  }
  NULL
}

# Switch to the project root (when one is found) so the relative candidate
# paths below resolve from there.
project_root <- find_project_root()
if (!is.null(project_root)) {
  setwd(project_root)
}

# Candidate dataset locations, in order of preference.
data_paths <- c(
  file.path("data", "Advertisement.csv"),
  "Advertisement.csv"
)

# First candidate that exists on disk; Find() returns NULL when none does.
data_path <- Find(file.exists, data_paths)

# Fail with an actionable message rather than handing NULL to read.csv().
if (is.null(data_path)) {
  stop(
    "Advertisement.csv not found. Looked in: ",
    paste(normalizePath(data_paths, mustWork = FALSE), collapse = ", "),
    call. = FALSE
  )
}

# Character columns are read as factors.
df <- read.csv(data_path, stringsAsFactors = TRUE)
cat("Dataset loaded successfully\n")
head(df)


## 1. Univariate Analysis


In [None]:
# Univariate analysis for Age: four diagnostic plots in a 2x2 grid, followed
# by summary statistics printed to the console.
# par() returns the previous settings; they are restored after the last plot
# so the 2x2 layout does not leak into later figures on the same device.
old_par <- par(mfrow = c(2, 2))
hist(df$Age, main = "Age Distribution (Histogram)", xlab = "Age", col = "lightblue", breaks = 30)
boxplot(df$Age, main = "Age Distribution (Box Plot)", ylab = "Age", col = "lightblue")
# Q-Q plot against the normal distribution, with a reference line.
qqnorm(df$Age, main = "Age Q-Q Plot")
qqline(df$Age, col = "red")
plot(density(df$Age), main = "Age Density Plot", xlab = "Age")
par(old_par)

# skewness() and kurtosis() are not base R; presumably they come from e1071,
# attached at the top of the file -- verify if the library list changes.
cat("Age Statistics:\n")
cat("Mean:", mean(df$Age), "\n")
cat("Median:", median(df$Age), "\n")
cat("Std:", sd(df$Age), "\n")
cat("Skewness:", skewness(df$Age), "\n")
cat("Kurtosis:", kurtosis(df$Age), "\n")


In [None]:
# Univariate analysis for EstimatedSalary: four diagnostic plots in a 2x2
# grid, followed by summary statistics printed to the console.
# par() returns the previous settings; they are restored after the last plot
# so the 2x2 layout does not leak into later figures on the same device.
old_par <- par(mfrow = c(2, 2))
hist(df$EstimatedSalary, main = "Estimated Salary Distribution (Histogram)", 
     xlab = "Estimated Salary", col = "lightgreen", breaks = 30)
boxplot(df$EstimatedSalary, main = "Estimated Salary Distribution (Box Plot)", 
        ylab = "Estimated Salary", col = "lightgreen")
# Q-Q plot against the normal distribution, with a reference line.
qqnorm(df$EstimatedSalary, main = "Estimated Salary Q-Q Plot")
qqline(df$EstimatedSalary, col = "red")
plot(density(df$EstimatedSalary), main = "Estimated Salary Density Plot", 
     xlab = "Estimated Salary")
par(old_par)

# skewness() and kurtosis() are not base R; presumably they come from e1071,
# attached at the top of the file -- verify if the library list changes.
cat("Estimated Salary Statistics:\n")
cat("Mean:", mean(df$EstimatedSalary), "\n")
cat("Median:", median(df$EstimatedSalary), "\n")
cat("Std:", sd(df$EstimatedSalary), "\n")
cat("Skewness:", skewness(df$EstimatedSalary), "\n")
cat("Kurtosis:", kurtosis(df$EstimatedSalary), "\n")


## 2. Bivariate Analysis


In [None]:
# Age vs Purchased: three side-by-side views of Age split by purchase status.
# par() returns the previous settings; they are restored after the last plot
# so the 1x3 layout does not leak into later figures on the same device.
old_par <- par(mfrow = c(1, 3))
# The axis labels assume Purchased has exactly two groups, ordered No then Yes.
boxplot(Age ~ Purchased, data = df, main = "Age by Purchase Status",
        xlab = "Purchased", ylab = "Age", names = c("No", "Yes"))
# Jittered points so overlapping observations stay visible.
stripchart(Age ~ Purchased, data = df, method = "jitter", 
           pch = 19, col = c("blue", "red"), vertical = TRUE,
           main = "Age Distribution by Purchase Status")
# NOTE(review): violin_plot_data is not used anywhere in the visible part of
# this file; kept in case a later cell relies on it.
violin_plot_data <- data.frame(
  Age = df$Age,
  Purchased = as.factor(df$Purchased)
)
# Simple comparison plot (a numeric ~ factor formula draws box plots)
plot(df$Age ~ as.factor(df$Purchased), 
     main = "Age Comparison by Purchase Status",
     xlab = "Purchased", ylab = "Age")
par(old_par)


In [None]:
# Two-sample t-test of Age between the Purchased groups; the fitted test
# object is kept in t_test_age and its statistic and p-value are reported.
t_test_age <- t.test(Age ~ Purchased, data = df)
with(
  t_test_age,
  cat("Age T-test: t =", statistic, ", p-value =", p.value, "\n")
)


In [None]:
# EstimatedSalary vs Purchased: three side-by-side views of salary split by
# purchase status.
# par() returns the previous settings; they are restored after the last plot
# so the 1x3 layout does not leak into later single-panel figures.
old_par <- par(mfrow = c(1, 3))
# The axis labels assume Purchased has exactly two groups, ordered No then Yes.
boxplot(EstimatedSalary ~ Purchased, data = df, 
        main = "Estimated Salary by Purchase Status",
        xlab = "Purchased", ylab = "Estimated Salary", names = c("No", "Yes"))
# Jittered points so overlapping observations stay visible.
stripchart(EstimatedSalary ~ Purchased, data = df, method = "jitter", 
           pch = 19, col = c("blue", "red"), vertical = TRUE,
           main = "Estimated Salary Distribution by Purchase Status")
# A numeric ~ factor formula draws box plots.
plot(df$EstimatedSalary ~ as.factor(df$Purchased), 
     main = "Estimated Salary Comparison by Purchase Status",
     xlab = "Purchased", ylab = "Estimated Salary")
par(old_par)


In [None]:
# Two-sample t-test of EstimatedSalary between the Purchased groups; the
# fitted test object is kept in t_test_salary and its statistic and p-value
# are reported.
t_test_salary <- t.test(EstimatedSalary ~ Purchased, data = df)
with(
  t_test_salary,
  cat("Salary T-test: t =", statistic, ", p-value =", p.value, "\n")
)


In [None]:
# Scatter plot of Age against EstimatedSalary; rows with Purchased == 1 are
# drawn in semi-transparent red, all others in semi-transparent blue.
with(df, {
  point_cols <- ifelse(
    Purchased == 1,
    adjustcolor("red", alpha.f = 0.6),
    adjustcolor("blue", alpha.f = 0.6)
  )
  plot(
    Age, EstimatedSalary,
    col = point_cols,
    pch = 19,
    xlab = "Age", ylab = "Estimated Salary",
    main = "Age vs Estimated Salary (colored by Purchase Status)"
  )
})
legend(
  "topright",
  legend = c("No Purchase", "Purchase"),
  col = c("blue", "red"),
  pch = 19
)

# Correlation between the two numeric predictors (cor() default method)
corr_age_salary <- with(df, cor(Age, EstimatedSalary))
cat("Correlation between Age and Estimated Salary:", corr_age_salary, "\n")


In [None]:
# Cross-tabulate Gender (rows) against Purchased (columns), print the counts
# and the within-gender percentages, then draw a grouped bar chart.
gender_purchase <- table(df$Gender, df$Purchased)
print("Gender vs Purchased:")
print(gender_purchase)
print("Percentage:")
# margin = 1 normalises each row, so every gender's percentages sum to 100.
print(100 * prop.table(gender_purchase, margin = 1))

barplot(
  gender_purchase,
  beside = TRUE,
  main = "Gender vs Purchased",
  xlab = "Purchased",
  ylab = "Count",
  legend.text = rownames(gender_purchase),
  col = c("lightblue", "lightpink")
)


## 3. Multivariate Analysis


In [None]:
# Correlation matrix of Age, EstimatedSalary and Purchased.
# cor() needs numeric columns, so this assumes Purchased was read as a number
# -- TODO confirm against the CSV.
numeric_df <- df[, c("Age", "EstimatedSalary", "Purchased")]
correlation_matrix <- cor(numeric_df)
cat("Correlation Matrix:\n")
print(correlation_matrix)

# Visualize correlation matrix: upper triangle only, variables reordered by
# hierarchical clustering, coefficients printed in each cell.
# mar reserves space above the plot; with corrplot's default zero margins the
# `main` title is clipped.
corrplot(correlation_matrix, method = "color", type = "upper",
         order = "hclust", tl.cex = 0.8, tl.col = "black",
         addCoef.col = "black", number.cex = 0.7,
         main = "Correlation Matrix", mar = c(0, 0, 2, 0))


In [None]:
# Multivariate visualization: Age vs EstimatedSalary, one panel per Gender
# value, with points colored by purchase status.
# par() returns the previous settings; they are restored after the loop so
# the 1x2 layout does not leak into later figures on the same device.
old_par <- par(mfrow = c(1, 2))
for (gender in unique(df$Gender)) {
  # Rows belonging to the current gender only.
  gender_df <- df[df$Gender == gender, ]
  plot(gender_df$Age, gender_df$EstimatedSalary, 
       col = ifelse(gender_df$Purchased == 1, 
                    adjustcolor("red", alpha.f = 0.6), 
                    adjustcolor("blue", alpha.f = 0.6)),
       pch = 19,
       xlab = "Age", ylab = "Estimated Salary",
       main = paste(gender, ": Age vs Estimated Salary"))
  legend("topright", legend = c("No Purchase", "Purchase"), 
         col = c("blue", "red"), pch = 19)
}
par(old_par)
