In [1]:
source("./aux.R")

Loading required package: NLP

Loading required package: koRpus.lang.en

Loading required package: koRpus

Loading required package: sylly

For information on available language packages for 'koRpus', run

  available.koRpus.lang()

and see ?install.koRpus.lang()



Attaching package: ‘koRpus’


The following object is masked from ‘package:tm’:

    readTagged



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Load the corpus; the three CSV columns are named Label, Text and Tag.
dataset <- read.csv(
  file = "six_label_dataset.csv",
  col.names = c("Label", "Text", "Tag")
)

# Recode the raw labels with change_labels() (helper not defined in this file).
dataset[["Label"]] <- change_labels(dataset[["Label"]])

# Distinct label values, sorted ascending and coerced to integer.
classes <- as.integer(sort(unique(dataset[["Label"]])))

# Distinct comma-separated tokens found in the Tag column, sorted.
args <- sort(unique(unlist(strsplit(dataset[["Tag"]], split = ","))))

In [3]:
# Vocabulary size on the text as loaded, i.e. before clean() is applied.
# occ_threshold is handed to get_vocabulary(); presumably a minimum
# term-occurrence cutoff -- TODO confirm against the helper's definition.
occ_threshold <- 1
vocabulary <- get_vocabulary(dataset[["Text"]], occ_threshold)
length(vocabulary)

In [4]:
# Run the Text column through clean(), then let clean_empty_rows() filter the
# data frame (both helpers are defined outside this file).
dataset[["Text"]] <- clean(dataset[["Text"]])
dataset <- clean_empty_rows(dataset)

In [5]:
# Recompute the vocabulary size, this time on the text returned by clean().
# occ_threshold is handed to get_vocabulary(); presumably a minimum
# term-occurrence cutoff -- TODO confirm against the helper's definition.
occ_threshold <- 1
vocabulary <- get_vocabulary(dataset[["Text"]], occ_threshold)
length(vocabulary)

In [6]:
# 70 / 15 / 15 split into training, validation and testing sets.
# Rows are taken in their current order; nothing is shuffled in this cell.
n <- nrow(dataset)
seventy_percent <- floor(n * 0.7)
eightyfive_percent <- floor(n * 0.85)

# Index vectors are built with seq_len() rather than `a:b`: when a slice is
# empty (very small datasets) `a:b` counts backwards and would select the
# wrong rows, whereas seq_len(0) correctly selects none.
training_set <- dataset[seq_len(seventy_percent), ]
validation_set <- dataset[
  seventy_percent + seq_len(eightyfive_percent - seventy_percent),
]
testing_set <- dataset[
  eightyfive_percent + seq_len(n - eightyfive_percent),
]

In [7]:
# Train the model on the training split via train_multinomial_nb(), a helper
# that is not defined in this file. The returned object is later read through
# model$vocab, model$prior and model$condprob.
# occ_threshold is forwarded as-is; presumably a minimum term-occurrence
# cutoff for the vocabulary -- TODO confirm against the helper's definition.
occ_threshold <- 1
model <- train_multinomial_nb(classes, training_set, occ_threshold)

In [8]:
# Predict one label per validation document with the trained model.
docs <- validation_set$Text

# USE.NAMES = FALSE: sapply() over a character vector would otherwise name
# every prediction with the full text of its document, which bloats the result
# and makes it unreadable when printed. The values are unchanged.
pred_labels <- sapply(docs, function(doc) {
  apply_multinomial_nb(classes, model$vocab, model$prior, model$condprob, doc)
}, USE.NAMES = FALSE)

In [9]:
# Validation accuracy: share of documents whose predicted label equals the
# true label.
total_predictions <- nrow(validation_set)
correct_predictions <- sum(pred_labels == validation_set$Label)
accuracy <- correct_predictions / total_predictions

cat("Accuracy:", accuracy)

Accuracy: 0.2259115

In [10]:
# Shuffle the rows, then make an 80 / 20 training / testing split.
# The RNG seed is fixed so that the shuffle -- and therefore every accuracy
# figure and confusion matrix computed from these splits -- is reproducible.
set.seed(42)

n <- nrow(dataset)
eighty_percent <- floor(n * 0.8)

dataset <- dataset[sample(n), ]

# seq_len() keeps a slice empty when it should be empty; `a:b` would count
# backwards and select the wrong rows.
training_set <- dataset[seq_len(eighty_percent), ]
testing_set <- dataset[eighty_percent + seq_len(n - eighty_percent), ]

In [11]:
# 5-fold cross-validation on the training split over occurrence thresholds
# 1 to 20. The printed result has one row per threshold with its mean accuracy.
crossval_results <- kfold_cross_validation(
  training_set,
  k = 5,
  occ_thresholds = seq_len(20)
)
print(crossval_results)

   occ_threshold mean_accuracy
1              1     0.2254123
2              2     0.2276115
3              3     0.2257789
4              4     0.2272450
5              5     0.2252902
6              6     0.2238241
7              7     0.2233354
8              8     0.2224801
9              9     0.2208919
10            10     0.2193036
11            11     0.2188149
12            12     0.2196701
13            13     0.2186927
14            14     0.2189371
15            15     0.2208919
16            16     0.2210141
17            17     0.2211362
18            18     0.2201588
19            19     0.2226023
20            20     0.2221136


In [12]:
# Pick the threshold with the highest mean cross-validated accuracy (the first
# one on ties, per which.max) and retrain on the whole training split.
best_threshold <- with(
  crossval_results,
  occ_threshold[which.max(mean_accuracy)]
)
model <- train_multinomial_nb(classes, training_set, best_threshold)

In [13]:
# Predict one label per held-out test document with the retrained model.
# USE.NAMES = FALSE: sapply() over a character vector would otherwise name
# every prediction with the full text of its document, which bloats the result
# and makes it unreadable when printed. The values are unchanged.
pred_labels <- sapply(testing_set$Text, function(doc) {
  apply_multinomial_nb(classes, model$vocab, model$prior, model$condprob, doc)
}, USE.NAMES = FALSE)

In [14]:
# Test accuracy: share of documents whose predicted label equals the true
# label.
total_predictions <- nrow(testing_set)
correct_predictions <- sum(pred_labels == testing_set$Label)
accuracy <- correct_predictions / total_predictions

cat("Accuracy:", accuracy)

Accuracy: 0.2226562

In [15]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
confusion_matrix <- table(
  testing_set$Label,
  pred_labels,
  dnn = c("True", "Predicted")
)
cat("Confusion Matrix:\n")
print(confusion_matrix)

Confusion Matrix:
    Predicted
True   0   1   2   3   4   5
   0  15  48  25  46  23  16
   1  28 106  62 100  62  41
   2  12  79  61  87  61  37
   3  19  61  59 124 112  44
   4   8  60  43 105 100  62
   5  13  56  34  74 115  50


In [16]:
# 5-fold cross-validation on the training split over occurrence thresholds
# 1 to 20, this time through kfold_cross_validation_tags() (helper not defined
# in this file). The printed result has one row per threshold with its mean
# accuracy.
crossval_results <- kfold_cross_validation_tags(
  training_set,
  k = 5,
  occ_thresholds = seq_len(20)
)
print(crossval_results)

   occ_threshold mean_accuracy
1              1     0.2145388
2              2     0.2145388
3              3     0.2142944
4              4     0.2193036
5              5     0.2205254
6              6     0.2196701
7              7     0.2189371
8              8     0.2205254
9              9     0.2240684
10            10     0.2213806
11            11     0.2169823
12            12     0.2185706
13            13     0.2180819
14            14     0.2183262
15            15     0.2174710
16            16     0.2175932
17            17     0.2200367
18            18     0.2182040
19            19     0.2174710
20            20     0.2164936


In [17]:
# Pick the threshold with the highest mean cross-validated accuracy (the first
# one on ties, per which.max) and retrain on the whole training split.
# NOTE(review): the threshold was tuned with kfold_cross_validation_tags(),
# yet the final model is trained with the same train_multinomial_nb() call as
# the text-only run -- confirm whether a tag-aware training step is intended.
best_threshold <- with(
  crossval_results,
  occ_threshold[which.max(mean_accuracy)]
)
model <- train_multinomial_nb(classes, training_set, best_threshold)

In [18]:
# Predict one label per held-out test document with the retrained model.
# NOTE(review): only the Text column is scored here; the Tag column is not
# passed to apply_multinomial_nb() -- confirm that this is intended.
# USE.NAMES = FALSE: sapply() over a character vector would otherwise name
# every prediction with the full text of its document, which bloats the result
# and makes it unreadable when printed. The values are unchanged.
pred_labels <- sapply(testing_set$Text, function(doc) {
  apply_multinomial_nb(classes, model$vocab, model$prior, model$condprob, doc)
}, USE.NAMES = FALSE)

In [19]:
# Test accuracy: share of documents whose predicted label equals the true
# label.
total_predictions <- nrow(testing_set)
correct_predictions <- sum(pred_labels == testing_set$Label)
accuracy <- correct_predictions / total_predictions

cat("Accuracy:", accuracy)

Accuracy: 0.2211914

In [20]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
confusion_matrix <- table(
  testing_set$Label,
  pred_labels,
  dnn = c("True", "Predicted")
)
cat("Confusion Matrix:\n")
print(confusion_matrix)

Confusion Matrix:
    Predicted
True   0   1   2   3   4   5
   0  23  47  25  41  23  14
   1  28 110  61  90  64  46
   2  21  91  63  78  46  38
   3  20  75  57 110 110  47
   4  19  64  41 102  87  65
   5  16  66  31  63 106  60
