# Statistical Analysis
## Consumer Purchase Prediction

This notebook performs descriptive, inferential, and exploratory statistical analysis using R.


In [None]:
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(car)
library(psych)
library(e1071)


In [None]:
# Function to find project root
# Locate the project root by walking upward from a starting directory.
#
# A directory is accepted as the root when the dataset is reachable from it
# through one of these relative paths:
#   Consumer Purchase Prediction/Consumer Purchase Prediction/data/Advertisement.csv
#   data/Advertisement.csv
#
# Args:
#   start_dir:  directory to start from (defaults to the working directory).
#   max_levels: maximum number of directories to inspect, start_dir included.
#
# Returns:
#   The root directory as a character string, or NULL when nothing matches
#   (or when start_dir is NULL, e.g. the working directory is unavailable).
find_project_root <- function(start_dir = getwd(), max_levels = 10L) {
  if (is.null(start_dir)) {
    return(NULL)
  }

  project_name <- "Consumer Purchase Prediction"
  data_file <- file.path("data", "Advertisement.csv")
  project_marker <- file.path(project_name, project_name, data_file)

  current_dir <- start_dir
  for (i in seq_len(max_levels)) {
    if (file.exists(file.path(current_dir, project_marker))) {
      return(current_dir)
    }
    if (file.exists(file.path(current_dir, data_file))) {
      return(current_dir)
    }
    # Standing in the outer project folder: the dataset sits one project
    # folder deeper, so the root that makes `project_marker` resolve is the
    # parent. Returning `current_dir` here would leave the dataset unreachable
    # through either of the relative paths listed above.
    if (basename(current_dir) == project_name &&
          file.exists(file.path(current_dir, project_name, data_file))) {
      return(dirname(current_dir))
    }
    parent_dir <- dirname(current_dir)
    # dirname() of the filesystem root is itself: nothing left to climb.
    if (parent_dir == current_dir) break
    current_dir <- parent_dir
  }
  NULL
}

# Set working directory
# NOTE: setwd() changes global state; the relative paths below and any later
# relative file access depend on it.
project_root <- find_project_root()
if (!is.null(project_root)) {
  setwd(project_root)
}

# Load dataset
# Candidate locations relative to the working directory, most specific first.
data_paths <- c(
  file.path("Consumer Purchase Prediction", "Consumer Purchase Prediction", "data", "Advertisement.csv"),
  file.path("data", "Advertisement.csv"),
  "Advertisement.csv"
)

# First candidate that exists, or NULL when none does.
data_path <- Find(file.exists, data_paths)

# Fail with an actionable message instead of passing NULL to read.csv().
if (is.null(data_path)) {
  stop(
    "Advertisement.csv not found. Looked in:\n  ",
    paste(file.path(getwd(), data_paths), collapse = "\n  "),
    call. = FALSE
  )
}

df <- read.csv(data_path, stringsAsFactors = TRUE)
cat("Dataset loaded successfully\n")
head(df)


## 1. Descriptive Statistics


In [None]:
# Summary and shape statistics for each numeric feature.
# Each entry holds the heading to print, the column to summarise, and the
# text that closes the kurtosis line (Age is followed by an extra blank line).
feature_reports <- list(
  list(heading = "Age Statistics:\n", column = "Age", ending = "\n\n"),
  list(
    heading = "Estimated Salary Statistics:\n",
    column = "EstimatedSalary",
    ending = "\n"
  )
)

for (report in feature_reports) {
  values <- df[[report$column]]
  cat(report$heading)
  print(summary(values))
  cat("Skewness:", skewness(values), "\n")
  cat("Kurtosis:", kurtosis(values), report$ending)
}


In [None]:
# Descriptive Statistics by Purchase Status
# Prints a heading, then summarises the Age and EstimatedSalary columns
# separately for each value of df$Purchased. describeBy() is presumably
# psych::describeBy (psych is loaded at the top of the notebook); as the last
# expression of the cell its result is auto-printed.
cat("Descriptive Statistics by Purchase Status:\n")
describeBy(df[, c("Age", "EstimatedSalary")], df$Purchased)


## 2. Normality Tests


In [None]:
# Shapiro-Wilk normality checks for the two numeric features.
# The verdict lines are shared by both tests; p > 0.05 is reported as
# "appears normal", anything else as "does not appear normal".
verdict_normal <-
  "  Result: Data appears to be normally distributed (p > 0.05)\n"
verdict_not_normal <-
  "  Result: Data does not appear to be normally distributed (p <= 0.05)\n"

# Age
shapiro_age <- shapiro.test(df$Age)
cat("Age - Shapiro-Wilk Test:\n")
cat(
  "  W =", shapiro_age$statistic,
  ", p-value =", shapiro_age$p.value, "\n"
)
cat(if (shapiro_age$p.value > 0.05) verdict_normal else verdict_not_normal)

# Estimated Salary
shapiro_salary <- shapiro.test(df$EstimatedSalary)
cat("\nEstimated Salary - Shapiro-Wilk Test:\n")
cat(
  "  W =", shapiro_salary$statistic,
  ", p-value =", shapiro_salary$p.value, "\n"
)
cat(if (shapiro_salary$p.value > 0.05) verdict_normal else verdict_not_normal)


## 3. Hypothesis Testing


In [None]:
# Age: compare buyers (Purchased == 1) with non-buyers (Purchased == 0)
age_purchased <- with(df, Age[Purchased == 1])
age_not_purchased <- with(df, Age[Purchased == 0])

# Per-group normality decides which two-sample test is run
shapiro_age1 <- shapiro.test(age_purchased)
shapiro_age2 <- shapiro.test(age_not_purchased)

if (shapiro_age1$p.value <= 0.05 || shapiro_age2$p.value <= 0.05) {
  # At least one group fails the normality check: Mann-Whitney U test
  wilcox_test_age <- wilcox.test(age_purchased, age_not_purchased)
  cat("Age Difference (Mann-Whitney U test):\n")
  cat(
    "  W =", wilcox_test_age$statistic,
    ", p-value =", wilcox_test_age$p.value, "\n"
  )
} else {
  # Both groups pass the normality check: t-test
  t_test_age <- t.test(age_purchased, age_not_purchased)
  cat("Age Difference (t-test):\n")
  cat(
    "  t =", t_test_age$statistic,
    ", p-value =", t_test_age$p.value, "\n"
  )
}


In [None]:
# Salary: compare buyers (Purchased == 1) with non-buyers (Purchased == 0)
salary_purchased <- df[df$Purchased == 1, "EstimatedSalary"]
salary_not_purchased <- df[df$Purchased == 0, "EstimatedSalary"]

# Per-group normality decides which two-sample test is run
shapiro_sal1 <- shapiro.test(salary_purchased)
shapiro_sal2 <- shapiro.test(salary_not_purchased)

if (!(shapiro_sal1$p.value > 0.05 && shapiro_sal2$p.value > 0.05)) {
  # Not both groups pass the normality check: Mann-Whitney U test
  wilcox_test_salary <- wilcox.test(salary_purchased, salary_not_purchased)
  cat("Salary Difference (Mann-Whitney U test):\n")
  cat(
    "  W =", wilcox_test_salary$statistic,
    ", p-value =", wilcox_test_salary$p.value, "\n"
  )
} else {
  # Both groups pass the normality check: t-test
  t_test_salary <- t.test(salary_purchased, salary_not_purchased)
  cat("Salary Difference (t-test):\n")
  cat(
    "  t =", t_test_salary$statistic,
    ", p-value =", t_test_salary$p.value, "\n"
  )
}


## 4. Chi-Square Test for Categorical Variables


In [None]:
# Chi-square test of association between Gender and Purchased
contingency_table <- table(df$Gender, df$Purchased)
cat("Contingency Table: Gender vs Purchased\n")
print(contingency_table)

chi2_test <- chisq.test(contingency_table)
cat("\nChi-square Test:\n")
cat(
  "  Chi-square =", chi2_test[["statistic"]],
  ", p-value =", chi2_test[["p.value"]], "\n"
)
cat("  Degrees of freedom =", chi2_test[["parameter"]], "\n")

# Report the verdict at the 0.05 level
cat(
  if (chi2_test[["p.value"]] >= 0.05) {
    "  Result: No significant association (p >= 0.05)\n"
  } else {
    "  Result: Significant association (p < 0.05)\n"
  }
)


## 5. Correlation Analysis


In [None]:
# Correlation matrices over the numeric columns and the purchase indicator
corr_columns <- c("Age", "EstimatedSalary", "Purchased")
numeric_df <- df[corr_columns]

# Pearson (the default method of cor())
pearson_corr <- cor(numeric_df)
cat("Pearson Correlation:\n")
print(pearson_corr)

# Spearman (rank-based)
spearman_corr <- cor(numeric_df, method = "spearman")
cat("\nSpearman Correlation:\n")
print(spearman_corr)


In [None]:
# Significance tests for the correlation of each feature with Purchased
cor_test_age <- cor.test(x = df$Age, y = df$Purchased)
cor_test_salary <- cor.test(x = df$EstimatedSalary, y = df$Purchased)

# Report the estimated coefficient and its p-value for each test
cat("Correlation Tests:\n")
cat(
  "Age vs Purchased: r =", cor_test_age[["estimate"]],
  ", p =", cor_test_age[["p.value"]], "\n"
)
cat(
  "Salary vs Purchased: r =", cor_test_salary[["estimate"]],
  ", p =", cor_test_salary[["p.value"]], "\n"
)


In [None]:
# One-way ANOVA: Age by Purchase status
# Fits Age against Purchased converted to a factor (so it is treated as a
# grouping variable rather than a numeric predictor), then prints the ANOVA
# table. Purchased is compared against 0 and 1 elsewhere in this notebook, so
# this is presumably a two-group comparison -- confirm against the data.
anova_age <- aov(Age ~ as.factor(Purchased), data = df)
cat("ANOVA: Age by Purchase Status\n")
print(summary(anova_age))


In [None]:
# One-way ANOVA: Salary by Purchase status
# Fits EstimatedSalary against Purchased converted to a factor (so it is
# treated as a grouping variable rather than a numeric predictor), then prints
# the ANOVA table. Purchased is compared against 0 and 1 elsewhere in this
# notebook, so this is presumably a two-group comparison -- confirm against
# the data.
anova_salary <- aov(EstimatedSalary ~ as.factor(Purchased), data = df)
cat("ANOVA: Estimated Salary by Purchase Status\n")
print(summary(anova_salary))
