In [2]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load data from CSV file
data <- read.csv("/kaggle/input/e-commerce-churn-dataset/churn_data.csv")

In [4]:
data <- data[, !names(data) %in% 'CustomerID', drop = FALSE]

#missing values
missing_values <- colSums(is.na(data))

#fill those missing values

# Loop through columns
for (col in names(data)) {
  # Check if there are missing values in the column
  if (sum(is.na(data[[col]])) > 0) {
    # Fill missing values with the median of the column
    data[[col]][is.na(data[[col]])] <- median(data[[col]], na.rm = TRUE)
  }
}

In [7]:
# Assuming your data frame is named 'df'

# Load necessary libraries
library(dplyr)
library(caret)

# Convert categorical variables into numerical variables using label encoding
data$PreferredLoginDevice <- as.numeric(as.factor(data$PreferredLoginDevice))
data$PreferredPaymentMode <- as.numeric(as.factor(data$PreferredPaymentMode))
data$Gender <- as.numeric(as.factor(data$Gender))
data$PreferedOrderCat <- as.numeric(as.factor(data$PreferedOrderCat))
data$MaritalStatus <- as.numeric(as.factor(data$MaritalStatus))

# Split the dataset into training and testing datasets
set.seed(42)  # Setting seed for reproducibility
splitIndex <- createDataPartition(data$Churn, p = 0.7, list = FALSE)
train_data <- data[splitIndex, ]
test_data <- data[-splitIndex, ]

# Separate predictor variables (X) and response variable (y)
X_train <- train_data[, !(names(train_data) %in% c("CustomerID", "Churn"))]
y_train <- train_data$Churn
X_test <- test_data[, !(names(test_data) %in% c("CustomerID", "Churn"))]
y_test <- test_data$Churn

In [8]:
#Logistic Regression

library(stats)

# Create a logistic regression model
logreg <- glm(y_train ~ ., family = binomial, data = cbind(y_train, X_train))

# Make predictions on the test set
logreg_prediction <- predict(logreg, newdata = cbind(y_test, X_test), type = "response") > 0.5

# Print accuracy score
accuracy <- sum(logreg_prediction == y_test) / length(y_test)
cat('Accuracy Score:', accuracy, '\n')

# Print classification report
table_true_pred <- table(y_test, logreg_prediction)
print('Confusion Matrix:')
print(table_true_pred)

precision <- table_true_pred[2, 2] / sum(table_true_pred[, 2])
recall <- table_true_pred[2, 2] / sum(table_true_pred[2, ])
f1_score <- 2 * (precision * recall) / (precision + recall)

cat('\nClassification Report:\n')
cat('  Precision: ', precision, '\n')
cat('  Recall: ', recall, '\n')
cat('  F1-Score: ', f1_score, '\n')


Accuracy Score: 0.8809947 
[1] "Confusion Matrix:"
      logreg_prediction
y_test FALSE TRUE
     0  1363   53
     1   148  125

Classification Report:
  Precision:  0.7022472 
  Recall:  0.4578755 
  F1-Score:  0.5543237 
