# Machine Learning Analysis
## Consumer Purchase Prediction

This notebook trains and compares five classifiers in R (logistic regression, random forest, SVM, naive Bayes, and a decision tree) to predict consumer purchase behavior.


In [None]:
# Load necessary libraries
library(dplyr)
library(caret)
library(randomForest)
library(e1071)
library(rpart)
library(rpart.plot)
library(pROC)
library(ROCR)


In [None]:
# Locate the project root by walking up the directory tree.
#
# Starting at `start_dir`, each directory is checked for `marker`; the search
# moves to the parent directory until the marker is found, the top of the
# filesystem is reached, or `max_levels` directories have been inspected.
#
# Arguments:
#   start_dir  Directory to start from (defaults to the working directory).
#   marker     Path, relative to a candidate root, whose existence identifies
#              the root (defaults to data/Advertisement.csv).
#   max_levels Maximum number of directories to inspect.
#
# Returns the first directory that contains `marker`, or NULL if none does.
find_project_root <- function(start_dir = getwd(),
                              marker = file.path("data", "Advertisement.csv"),
                              max_levels = 10) {
  current_dir <- start_dir
  # seq_len() inspects nothing when max_levels is 0; 1:0 would iterate twice.
  for (i in seq_len(max_levels)) {
    if (file.exists(file.path(current_dir, marker))) {
      return(current_dir)
    }
    parent_dir <- dirname(current_dir)
    # dirname() returns its input once there is no parent left to visit.
    if (parent_dir == current_dir) {
      break
    }
    current_dir <- parent_dir
  }
  NULL
}

# Load the advertisement dataset.
# If a project root is found, make it the working directory so that the
# relative candidate paths below resolve against it.
project_root <- find_project_root()
if (!is.null(project_root)) {
  setwd(project_root)
}

# Candidate locations, in order of preference.
data_paths <- c(
  file.path("data", "Advertisement.csv"),
  "Advertisement.csv"
)

# Keep the first candidate that exists; fail with a clear message instead of
# passing NULL to read.csv() when none does.
existing_paths <- data_paths[file.exists(data_paths)]
if (length(existing_paths) == 0) {
  stop(
    "Could not find Advertisement.csv. Looked for: ",
    paste(file.path(getwd(), data_paths), collapse = ", "),
    call. = FALSE
  )
}
data_path <- existing_paths[1]

df <- read.csv(data_path, stringsAsFactors = TRUE)

# The rest of the notebook reads these columns; report missing ones up front.
required_cols <- c("Gender", "Age", "EstimatedSalary", "Purchased")
missing_cols <- setdiff(required_cols, names(df))
if (length(missing_cols) > 0) {
  stop(
    "Dataset ", data_path, " is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Both columns are treated as categorical by the models below.
df$Gender <- as.factor(df$Gender)
df$Purchased <- as.factor(df$Purchased)

cat("Dataset loaded successfully\n")
cat("Shape:", nrow(df), "rows,", ncol(df), "columns\n")
head(df)


## 1. Data Preprocessing


In [None]:
# Separate the predictor columns from the target column
feature_cols <- c("Gender", "Age", "EstimatedSalary")
X <- df[, feature_cols]
y <- df$Purchased

# Partition on the target with an 80% training share; the seed fixes the split
set.seed(42)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)

# Rows selected by the partition form the training set, the rest the test set
X_train <- X[trainIndex, ]
y_train <- y[trainIndex]
X_test <- X[-trainIndex, ]
y_test <- y[-trainIndex]

# Bundle predictors and target into the data frames used for modelling
train_data <- data.frame(X_train, Purchased = y_train)
test_data <- data.frame(X_test, Purchased = y_test)

# Report the split sizes
cat("Training set size:", nrow(X_train), "\n")
cat("Test set size:", nrow(X_test), "\n")

# Report the class balance of each split
cat("\nTarget distribution in training set:\n")
table(y_train)
cat("\nTarget distribution in test set:\n")
table(y_test)


## 2. Model Training and Evaluation


### 2.1 Logistic Regression


In [None]:
# Logistic Regression
# Binomial GLM fitted through caret with 5-fold cross-validation.
# Seed the cell so the resampling folds are the same when it is re-run alone.
set.seed(42)
logistic_model <- train(Purchased ~ ., data = train_data, method = "glm",
                        family = "binomial",
                        trControl = trainControl(method = "cv", number = 5))
logistic_pred <- predict(logistic_model, test_data)
# Column 2 is the probability of the second factor level of Purchased.
logistic_pred_proba <- predict(logistic_model, test_data, type = "prob")[, 2]
# confusionMatrix() treats the first factor level as the positive class by
# default. The probabilities above use the second level, so name it explicitly
# to make Precision/Recall/F1 describe that same class.
logistic_cm <- confusionMatrix(logistic_pred, y_test,
                               positive = levels(y_test)[2],
                               mode = "everything")

cat("Logistic Regression Results:\n")
cat("Accuracy:", logistic_cm$overall["Accuracy"], "\n")
cat("Precision:", logistic_cm$byClass["Precision"], "\n")
cat("Recall:", logistic_cm$byClass["Recall"], "\n")
cat("F1 Score:", logistic_cm$byClass["F1"], "\n")
print(logistic_cm)


### 2.2 Random Forest


In [None]:
# Random Forest
# 100-tree forest tuned through caret with 5-fold cross-validation.
# Both the resampling folds and the forest itself are random, so seed the cell
# to get the same model when it is re-run alone.
set.seed(42)
rf_model <- train(Purchased ~ ., data = train_data, method = "rf",
                  ntree = 100,
                  trControl = trainControl(method = "cv", number = 5))
rf_pred <- predict(rf_model, test_data)
# Column 2 is the probability of the second factor level of Purchased.
rf_pred_proba <- predict(rf_model, test_data, type = "prob")[, 2]
# confusionMatrix() treats the first factor level as the positive class by
# default. The probabilities above use the second level, so name it explicitly
# to make Precision/Recall/F1 describe that same class.
rf_cm <- confusionMatrix(rf_pred, y_test,
                         positive = levels(y_test)[2],
                         mode = "everything")

cat("Random Forest Results:\n")
cat("Accuracy:", rf_cm$overall["Accuracy"], "\n")
cat("Precision:", rf_cm$byClass["Precision"], "\n")
cat("Recall:", rf_cm$byClass["Recall"], "\n")
cat("F1 Score:", rf_cm$byClass["F1"], "\n")
print(rf_cm)


### 2.3 Support Vector Machine (SVM)


In [None]:
# SVM
# Radial-kernel SVM tuned through caret with 5-fold cross-validation.
# Seed the cell so the resampling folds are the same when it is re-run alone.
set.seed(42)
# prob.model = TRUE is forwarded to the underlying kernlab fit. caret only
# requests a probability model when classProbs = TRUE or when prob.model is
# passed explicitly, and the type = "prob" prediction below needs one.
svm_model <- train(Purchased ~ ., data = train_data, method = "svmRadial",
                   prob.model = TRUE,
                   trControl = trainControl(method = "cv", number = 5))
svm_pred <- predict(svm_model, test_data)
# Column 2 is the probability of the second factor level of Purchased.
svm_pred_proba <- predict(svm_model, test_data, type = "prob")[, 2]
# confusionMatrix() treats the first factor level as the positive class by
# default. The probabilities above use the second level, so name it explicitly
# to make Precision/Recall/F1 describe that same class.
svm_cm <- confusionMatrix(svm_pred, y_test,
                          positive = levels(y_test)[2],
                          mode = "everything")

cat("SVM Results:\n")
cat("Accuracy:", svm_cm$overall["Accuracy"], "\n")
cat("Precision:", svm_cm$byClass["Precision"], "\n")
cat("Recall:", svm_cm$byClass["Recall"], "\n")
cat("F1 Score:", svm_cm$byClass["F1"], "\n")
print(svm_cm)


### 2.4 Naive Bayes


In [None]:
# Naive Bayes
# Naive Bayes classifier tuned through caret with 5-fold cross-validation.
# Seed the cell so the resampling folds are the same when it is re-run alone.
set.seed(42)
nb_model <- train(Purchased ~ ., data = train_data, method = "nb",
                  trControl = trainControl(method = "cv", number = 5))
nb_pred <- predict(nb_model, test_data)
# Column 2 is the probability of the second factor level of Purchased.
nb_pred_proba <- predict(nb_model, test_data, type = "prob")[, 2]
# confusionMatrix() treats the first factor level as the positive class by
# default. The probabilities above use the second level, so name it explicitly
# to make Precision/Recall/F1 describe that same class.
nb_cm <- confusionMatrix(nb_pred, y_test,
                         positive = levels(y_test)[2],
                         mode = "everything")

cat("Naive Bayes Results:\n")
cat("Accuracy:", nb_cm$overall["Accuracy"], "\n")
cat("Precision:", nb_cm$byClass["Precision"], "\n")
cat("Recall:", nb_cm$byClass["Recall"], "\n")
cat("F1 Score:", nb_cm$byClass["F1"], "\n")
print(nb_cm)


### 2.5 Decision Tree


In [None]:
# Decision Tree
# rpart tree tuned through caret with 5-fold cross-validation.
# Seed the cell so the resampling folds are the same when it is re-run alone.
set.seed(42)
dt_model <- train(Purchased ~ ., data = train_data, method = "rpart",
                  trControl = trainControl(method = "cv", number = 5))
dt_pred <- predict(dt_model, test_data)
# Column 2 is the probability of the second factor level of Purchased.
dt_pred_proba <- predict(dt_model, test_data, type = "prob")[, 2]
# confusionMatrix() treats the first factor level as the positive class by
# default. The probabilities above use the second level, so name it explicitly
# to make Precision/Recall/F1 describe that same class.
dt_cm <- confusionMatrix(dt_pred, y_test,
                         positive = levels(y_test)[2],
                         mode = "everything")

cat("Decision Tree Results:\n")
cat("Accuracy:", dt_cm$overall["Accuracy"], "\n")
cat("Precision:", dt_cm$byClass["Precision"], "\n")
cat("Recall:", dt_cm$byClass["Recall"], "\n")
cat("F1 Score:", dt_cm$byClass["F1"], "\n")
print(dt_cm)

# Visualize decision tree
rpart.plot(dt_model$finalModel, main = "Decision Tree")


## 3. Model Comparison


In [None]:
# Collect the confusion matrices under their display names
confusion_matrices <- list(
  "Logistic Regression" = logistic_cm,
  "Random Forest" = rf_cm,
  "SVM" = svm_cm,
  "Naive Bayes" = nb_cm,
  "Decision Tree" = dt_cm
)

# Pull one metric out of every confusion matrix as a plain numeric vector
overall_metric <- function(metric) {
  vapply(confusion_matrices, function(cm) cm$overall[[metric]],
         numeric(1), USE.NAMES = FALSE)
}
class_metric <- function(metric) {
  vapply(confusion_matrices, function(cm) cm$byClass[[metric]],
         numeric(1), USE.NAMES = FALSE)
}

# One row per model, one column per metric
results <- data.frame(
  Model = names(confusion_matrices),
  Accuracy = overall_metric("Accuracy"),
  Precision = class_metric("Precision"),
  Recall = class_metric("Recall"),
  F1 = class_metric("F1")
)

# Rank the models from highest to lowest accuracy
accuracy_rank <- order(-results$Accuracy)
results <- results[accuracy_rank, ]
print(results)

# Report the model with the highest accuracy
best_model_idx <- which.max(results$Accuracy)
best_model_name <- results$Model[best_model_idx]
best_row <- results[best_model_idx, ]

cat("\nBEST MODEL:", best_model_name, "\n")
cat("Accuracy:", best_row$Accuracy, "\n")
cat("Precision:", best_row$Precision, "\n")
cat("Recall:", best_row$Recall, "\n")
cat("F1 Score:", best_row$F1, "\n")


## 4. ROC Curves


In [None]:
# Create ROC curves for all models
# Code the response as 0/1 from the factor's integer codes, so the second
# level of y_test is the case (1) and the first level the control (0).
roc_response <- as.numeric(y_test) - 1

# Build a ROC curve with the class order and direction fixed. The default
# direction = "auto" chooses the direction separately for each curve, which
# can flip a worse-than-chance model and makes the AUCs incomparable.
# direction = "<" states that controls score lower than cases.
build_roc <- function(scores) {
  roc(roc_response, scores, levels = c(0, 1), direction = "<", quiet = TRUE)
}

roc_logistic <- build_roc(logistic_pred_proba)
roc_rf <- build_roc(rf_pred_proba)
roc_svm <- build_roc(svm_pred_proba)
roc_nb <- build_roc(nb_pred_proba)
roc_dt <- build_roc(dt_pred_proba)

# Keep curves, colours and AUCs aligned by model name; each AUC is computed
# once and reused for both the legend and the printed summary.
roc_curves <- list(
  "Logistic Regression" = roc_logistic,
  "Random Forest" = roc_rf,
  "SVM" = roc_svm,
  "Naive Bayes" = roc_nb,
  "Decision Tree" = roc_dt
)
roc_colors <- c("blue", "red", "green", "purple", "orange")
auc_scores <- vapply(roc_curves, function(curve) as.numeric(auc(curve)),
                     numeric(1))

# The first curve creates the plot; the remaining ones are overlaid on it.
plot(roc_curves[[1]], col = roc_colors[1], main = "ROC Curves for All Models")
for (i in seq_along(roc_curves)[-1]) {
  lines(roc_curves[[i]], col = roc_colors[i])
}
legend("bottomright",
       legend = paste(names(roc_curves), "(AUC =", round(auc_scores, 3), ")"),
       col = roc_colors, lty = 1)

cat("AUC Scores:\n")
for (model_name in names(auc_scores)) {
  cat(paste0(model_name, ":"), auc_scores[[model_name]], "\n")
}


## 5. Feature Importance (Random Forest)


In [None]:
# Extract the importance matrix from the fitted random forest
feature_importance <- importance(rf_model$finalModel)

# Pair each feature name with its MeanDecreaseGini value
gini_decrease <- as.numeric(feature_importance[, "MeanDecreaseGini"])
feature_importance_df <- data.frame(
  Feature = rownames(feature_importance),
  Importance = gini_decrease
)

# Put the most important features first
importance_rank <- order(-feature_importance_df$Importance)
feature_importance_df <- feature_importance_df[importance_rank, ]

print("Feature Importance (Random Forest):")
print(feature_importance_df)

# Bar chart of the ranked importances; las = 2 draws the axis labels
# perpendicular to the axis
with(
  feature_importance_df,
  barplot(Importance,
          names.arg = Feature,
          main = "Feature Importance (Random Forest)",
          xlab = "Feature",
          ylab = "Importance",
          col = "steelblue",
          las = 2)
)
