In [127]:
# ---- Input / output locations ----
working_directory <- "C:/Users/huniv/jnotebook/data-mining/62-62_Data_Mining/tp02/"
datasets_directory <- "../datasets/"
dataset_filename <- "HR_prediction-all.csv"
output_folder <- "output"

# ---- Column roles ----
# All positions are 1-based and follow the raw CSV layout, whose first column
# is the row Id (e.g. position 8 is `left`, positions 2, 3 and 5 are the
# numeric columns).
target_variables_idx <- 8
quantitatives_attributes_col_idx <- c(2, 3, 5)
qualitatives_attributes_col_idx <- c(4, 6, 7, 9, 10, 11)
# Positions 4 and 6 are also listed among the qualitative attributes above.
discretized_quantitatives_attributes_col_idx <- c(4, 6)

In [128]:
# Make sure every package used by the notebook is installed, then attach it.
#   e1071     - provides the Naive Bayes classifier
#   latex2exp - LaTeX parsing of equations
#   colorspace
# requireNamespace() is used for the availability check because it only tests
# whether the package can be loaded; unlike require(), it does not attach the
# package as a side effect. Attaching is done explicitly by library() below,
# which fails loudly if a package is still missing.
required_packages <- c("e1071", "latex2exp", "colorspace")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(e1071)
library(latex2exp)
library(colorspace)

In [135]:
# Read the data set.
# colClasses fixes the type of every column explicitly instead of letting
# read.table() guess it. The values that matter most here are:
#   "factor"  - qualitative variables
#   "numeric" - quantitative variables
#   "NULL"    - the column is skipped and does not appear in the result
# The path is built from the configuration variables so that changing
# datasets_directory / dataset_filename actually changes the file being read
# (datasets_directory already ends with a "/", hence paste0()).
myData <- read.table(
  paste0(datasets_directory, dataset_filename),
  header = TRUE,
  sep = ",",
  colClasses = c(
    "NULL",     # Id - skipped at load time (not a predictor)
    "numeric",  # satisfaction_level
    "numeric",  # last_evaluation
    "factor",   # number_project
    "integer",  # average_montly_hours
    "factor",   # time_spend_company
    "factor",   # Work_accident (categorical: 0 or 1)
    "factor",   # left (target variable: 0 or 1)
    "factor",   # promotion_last_5years (categorical: 0 or 1)
    "factor",   # department
    "factor"    # salary
  )
)

# The 'Id' column is already dropped by the "NULL" entry above, so no further
# column removal is needed. As a consequence, column positions in myData are
# shifted by one compared with the raw CSV file.

In [136]:
# Uncomment the next line to make the random train/test split reproducible.
# set.seed(42)

# Extract the target variable name.
# target_variables_idx refers to the raw CSV layout, which still contains the
# leading Id column. That column is skipped when the file is loaded
# (colClasses "NULL"), so every column of myData sits one position to the left
# of its raw position. Without this correction the index selects
# `promotion_last_5years` instead of `left`.
dropped_leading_columns <- 1L
target_name <- colnames(myData)[target_variables_idx - dropped_leading_columns]
stopifnot(!is.na(target_name))

# Split the data: a random 2/3 of the rows for training, the rest for testing.
# seq_len() is used instead of 1:nrow() so an empty data set yields an empty
# index vector rather than c(1, 0).
train_idx <- sample(seq_len(nrow(myData)), size = floor(2 / 3 * nrow(myData)))
train_data <- myData[train_idx, ]
test_data <- myData[-train_idx, ]

In [137]:
# Preview the training partition: a label, its first rows, then its dimensions.
print("Train data")
head(train_data, n = 6L)
dim(train_data)

[1] "Train data"


Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
Unnamed: 0_level_1,<dbl>,<dbl>,<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
9064,0.51,0.95,4,169,3,1,0,0,sales,low
18,0.98,0.6,4,160,3,0,0,0,technical,low
2436,0.67,0.57,2,234,4,0,0,0,management,low
5940,0.86,0.51,4,211,2,1,0,0,technical,low
5277,0.44,0.49,2,145,3,0,1,0,sales,low
6760,0.5,0.51,3,169,4,0,0,0,marketing,low


In [138]:
# Preview the test partition: a label, its first rows, then its dimensions.
print("Test data")
head(test_data, n = 6L)
dim(test_data)

[1] "Test data"


Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
Unnamed: 0_level_1,<dbl>,<dbl>,<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>
1,0.83,0.8,4,158,4,0,0,0,marketing,medium
3,0.97,0.91,3,246,2,0,0,0,accounting,low
6,0.51,0.71,2,180,3,0,0,0,product_mng,low
8,0.79,0.86,5,235,5,0,1,0,support,medium
10,0.74,0.37,2,171,4,0,0,0,support,low
15,0.63,0.79,5,215,2,1,0,0,accounting,medium


In [139]:
# Fit a Naive Bayes classifier on the training partition (train_data).
# The formula `left ~ .` declares `left` as the target/class variable and uses
# every other column of train_data as a predictor.
nb_model <- naiveBayes(left ~ ., data = train_data)

# Inspect the fitted model (a-priori and conditional probabilities).
nb_model


Naive Bayes Classifier for Discrete Predictors

Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)

A-priori probabilities:
Y
        0         1 
0.7650765 0.2349235 

Conditional probabilities:
   satisfaction_level
Y        [,1]      [,2]
  0 0.6689824 0.2159486
  1 0.4404151 0.2620707

   last_evaluation
Y        [,1]      [,2]
  0 0.7179412 0.1624121
  1 0.7145466 0.1990714

   number_project
Y            2          3          4          5          6          7
  0 0.07529412 0.34725490 0.34705882 0.18490196 0.04549020 0.00000000
  1 0.45274585 0.01724138 0.10791826 0.17369093 0.17879949 0.06960409

   average_montly_hours
Y       [,1]     [,2]
  0 198.9867 45.40391
  1 205.2063 60.37473

   time_spend_company
Y           10          2          3          4          5          6
  0 0.01882353 0.28294118 0.42960784 0.14039216 0.05509804 0.04352941
  1 0.00000000 0.01660281 0.45593870 0.24457216 0.22413793 0.05874840
   time_spend_company
Y            7          8
  0 0.015

In [142]:
# Classify the held-out rows with the fitted model.
predictions <- predict(nb_model, newdata = test_data)

# Accuracy: share of test rows whose predicted class equals the value stored
# in the target column (the mean of a logical vector is the proportion of
# TRUE values).
accuracy <- mean(predictions == test_data[[target_name]])
accuracy