# Exploratory Data Analysis (EDA)
## Consumer Purchase Prediction

This notebook performs comprehensive exploratory data analysis on the consumer behavior dataset using R.


In [None]:
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(corrplot)
library(VIM)


In [None]:
# Locate the directory this notebook should use as its working directory.
#
# Starting at the current working directory, walk up at most `max_levels`
# directories and return the first one from which Advertisement.csv is
# reachable as either
#   "Consumer Purchase Prediction/Consumer Purchase Prediction/data/Advertisement.csv"
#   or "data/Advertisement.csv".
#
# Returns a character path, or NULL when no such directory is found.
find_project_root <- function() {
  project_name <- "Consumer Purchase Prediction"
  data_file <- file.path("data", "Advertisement.csv")
  max_levels <- 10

  current_dir <- getwd()
  for (level in seq_len(max_levels)) {
    # Data reachable from here through the double-nested layout, or directly.
    if (file.exists(file.path(current_dir, project_name, project_name, data_file)) ||
        file.exists(file.path(current_dir, data_file))) {
      return(current_dir)
    }

    # Single-nested layout: we are standing in the outer project folder and the
    # data lives one level down. Return the inner folder, because
    # "data/Advertisement.csv" resolves from there, whereas neither of the
    # relative paths listed above resolves from `current_dir`.
    nested_dir <- file.path(current_dir, project_name)
    if (basename(current_dir) == project_name &&
        file.exists(file.path(nested_dir, data_file))) {
      return(nested_dir)
    }

    parent_dir <- dirname(current_dir)
    if (parent_dir == current_dir) break  # reached the filesystem root
    current_dir <- parent_dir
  }

  NULL
}

# Move into the detected project root. When detection fails, stay in the
# current directory and report that instead.
project_root <- find_project_root()
if (is.null(project_root)) {
  cat("Warning: Could not find project root. Using current directory:", getwd(), "\n")
} else {
  setwd(project_root)
  cat("Working directory set to:", getwd(), "\n")
}


In [None]:
# Load the dataset.
# Candidate locations are relative to the working directory and are tried in
# order; the first one that exists wins. The single-nested layout is listed
# last so that every location that resolved before still takes precedence.
data_paths <- c(
  file.path("Consumer Purchase Prediction", "Consumer Purchase Prediction", "data", "Advertisement.csv"),
  file.path("data", "Advertisement.csv"),
  "Advertisement.csv",
  file.path("Consumer Purchase Prediction", "data", "Advertisement.csv")
)

existing_paths <- data_paths[file.exists(data_paths)]
data_path <- if (length(existing_paths) > 0) existing_paths[[1]] else NULL

if (is.null(data_path)) {
  # stop() concatenates its arguments without a separator, which keeps the
  # bullet list aligned and avoids stray spaces around the newlines.
  stop(
    "Cannot find Advertisement.csv. Searched in:\n",
    paste0("  - ", data_paths, collapse = "\n"),
    "\nCurrent working directory: ", getwd(),
    call. = FALSE
  )
}

# stringsAsFactors = TRUE converts character columns to factors on read.
df <- read.csv(data_path, stringsAsFactors = TRUE)

cat("Dataset loaded successfully from:", data_path, "\n")
cat("Shape:", nrow(df), "rows,", ncol(df), "columns\n")
head(df)


## 1. Data Overview


In [None]:
# Dataset structure: compact listing of each column's name, type and first values
str(df)


In [None]:
# Statistical summary of every column (the form of each column's summary
# depends on that column's type)
summary(df)


In [None]:
# Data quality checks: number of NA entries per column, then the number of
# rows that exactly repeat an earlier row.
missing_per_column <- colSums(is.na(df))
cat("Missing Values:\n")
missing_per_column

duplicate_row_count <- sum(duplicated(df))
cat("\nDuplicate Rows:", duplicate_row_count, "\n")


## 2. Target Variable Analysis


In [None]:
# Purchased distribution: absolute counts and percentages per class
purchased_counts <- table(df$Purchased)
cat("Purchased Distribution:\n")
print(purchased_counts)
cat("\nPercentage:\n")
print(prop.table(purchased_counts) * 100)

# Visualize the target variable as a bar chart and a pie chart side by side.
# par() returns the settings it replaces; they are restored afterwards so the
# two-panel layout does not leak into plots drawn later in the same session.
# NOTE(review): the "No"/"Yes" labels assume exactly two classes that sort as
# non-purchase first, purchase second — confirm against the data.
old_par <- par(mfrow = c(1, 2))
barplot(purchased_counts, names.arg = c("No", "Yes"),
        col = c("skyblue", "coral"),
        main = "Purchased Distribution (Bar Chart)",
        xlab = "Purchased", ylab = "Count")
pie(purchased_counts, labels = c("No", "Yes"),
    main = "Purchased Distribution (Pie Chart)",
    col = c("skyblue", "coral"))
par(old_par)


## 3. Numerical Variables Analysis


In [None]:
# Distribution of numerical variables: histogram and box plot for Age (top
# row) and EstimatedSalary (bottom row).
# The 2x2 layout is restored to the previous settings afterwards so it does
# not leak into plots drawn later in the same session.
old_par <- par(mfrow = c(2, 2))
hist(df$Age, main = "Age Distribution (Histogram)", xlab = "Age",
     col = "lightblue", breaks = 30)
boxplot(df$Age, main = "Age Distribution (Box Plot)", ylab = "Age",
        col = "lightblue")
hist(df$EstimatedSalary, main = "Estimated Salary Distribution (Histogram)",
     xlab = "Estimated Salary", col = "lightgreen", breaks = 30)
boxplot(df$EstimatedSalary, main = "Estimated Salary Distribution (Box Plot)",
        ylab = "Estimated Salary", col = "lightgreen")
par(old_par)


## 4. Categorical Variables Analysis


In [None]:
# Gender distribution: absolute counts and percentages per category
gender_counts <- table(df$Gender)
cat("Gender Distribution:\n")
print(gender_counts)
cat("\nPercentage:\n")
print(prop.table(gender_counts) * 100)

# Visualize as a bar chart and a pie chart side by side. The previous
# graphics settings are restored afterwards so the two-panel layout does not
# leak into plots drawn later in the same session.
old_par <- par(mfrow = c(1, 2))
barplot(gender_counts, col = c("lightblue", "lightpink"),
        main = "Gender Distribution", xlab = "Gender", ylab = "Count")
pie(gender_counts, labels = names(gender_counts),
    main = "Gender Distribution (Pie Chart)",
    col = c("lightblue", "lightpink"))
par(old_par)


## 5. Relationship Analysis


In [None]:
# Age and EstimatedSalary vs Purchased: one box plot per variable, side by side.
# The previous graphics settings are restored afterwards; otherwise the next
# plot drawn on the same device would be squeezed into one half of this
# two-panel layout.
old_par <- par(mfrow = c(1, 2))
boxplot(Age ~ Purchased, data = df, main = "Age Distribution by Purchase Status",
        xlab = "Purchased", ylab = "Age", names = c("No", "Yes"))
boxplot(EstimatedSalary ~ Purchased, data = df,
        main = "Salary Distribution by Purchase Status",
        xlab = "Purchased", ylab = "Estimated Salary", names = c("No", "Yes"))
par(old_par)


In [None]:
# Scatter plot of Age against EstimatedSalary. Rows with Purchased == 1 are
# drawn in semi-transparent red, all other rows in semi-transparent blue.
purchase_colour <- adjustcolor("red", alpha.f = 0.6)
no_purchase_colour <- adjustcolor("blue", alpha.f = 0.6)
point_colours <- ifelse(df$Purchased == 1, purchase_colour, no_purchase_colour)

plot(
  x = df$Age,
  y = df$EstimatedSalary,
  col = point_colours,
  pch = 19,
  xlab = "Age",
  ylab = "Estimated Salary",
  main = "Age vs Estimated Salary (colored by Purchase Status)"
)
legend(
  "topright",
  legend = c("No Purchase", "Purchase"),
  col = c("blue", "red"),
  pch = 19
)


## 6. Correlation Analysis


In [None]:
# Correlation matrix of the numeric columns.
# use = "complete.obs" drops rows containing NA before computing; without it a
# single missing value turns the affected correlations into NA, which in turn
# breaks the hierarchical-clustering ordering used in the plot below. With no
# missing values the result is the same as the default.
numeric_df <- df[, c("Age", "EstimatedSalary", "Purchased")]
correlation_matrix <- cor(numeric_df, use = "complete.obs")
cat("Correlation Matrix:\n")
print(correlation_matrix)
cat("\nCorrelation with Purchased:\n")
print(sort(correlation_matrix[, "Purchased"], decreasing = TRUE))

# Visualize correlation matrix (upper triangle, coefficients overlaid).
# corrplot() draws with zero margins by default, which clips the `main`
# title; a top margin leaves room for it.
corrplot(correlation_matrix, method = "color", type = "upper",
         order = "hclust", tl.cex = 0.8, tl.col = "black",
         addCoef.col = "black", number.cex = 0.7,
         main = "Correlation Matrix", mar = c(0, 0, 2, 0))
