In [145]:
library(tidyverse)
library(magrittr)
library(text2vec)
library(tokenizers)
library(glmnet)
library(doParallel)
library(tm)
library(caret)
library(utiml)
registerDoParallel(4)

## Data import

In [105]:
data_set <- read.csv("DATA/train.csv")
targets <- c("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")

In [106]:
print(dim(data_set))
#print(head(data_set))
print(names(data_set))

[1] 159571      8
[1] "id"            "comment_text"  "toxic"         "severe_toxic" 
[5] "obscene"       "threat"        "insult"        "identity_hate"


## Adding new feature to the data set

In [107]:
data <- data_set %>% 
  select(-one_of(targets)) %>% 
  mutate(length = str_length(comment_text), ncap = str_count(comment_text, "[A-Z]"), ncap_len = ncap / length, nexcl = str_count(comment_text, fixed("!")), nquest = str_count(comment_text, fixed("?")), npunct = str_count(comment_text, "[[:punct:]]"), nword = str_count(comment_text, "\\w+"), nsymb = str_count(comment_text, "&|@|#|\\$|%|\\*|\\^")) %>% 
  select(-id)

## Preprocessing and comments tokenization  

In [108]:
comment_tokens <- data %$%
  str_to_lower(comment_text) %>%
  str_replace_all("[^[:alpha:]]", " ") %>%
  str_replace_all("\\s+", " ") %>%
  gsub("\\b\\w{1}\\s","", .) %>% #Remove words with lengh less than 2
  itoken(tokenizer = tokenize_word_stems)

##  Token vectorization as vocabulary for document term matrix

In [109]:
vectorizer <- create_vocabulary(it, ngram = c(1, 1), stopwords = stopwords("en")) %>%
  prune_vocabulary(term_count_min = 3, doc_proportion_max = 0.5, vocab_term_max = 4000) %>%
  vocab_vectorizer()



## TFIDF class and fit in the document term matrix of the tokens

In [110]:
m_tfidf <- TfIdf$new(norm = "l2", sublinear_tf = T)
tfidf <- create_dtm(comment_tokens, vectorizer) %>%
  fit_transform(m_tfidf)

In [111]:
print(names(data))

[1] "comment_text" "length"       "ncap"         "ncap_len"     "nexcl"       
[6] "nquest"       "npunct"       "nword"        "nsymb"       


# Generate the final dataset

In [112]:
finaldata <- data %>%
    select(-comment_text) %>%
    sparse.model.matrix(~ . - 1, .) %>%
    cbind(tfidf)

In [113]:
set.seed(42)
smp_size <- floor(0.80 * nrow(data_set))
train_ind <- sample(seq_len(nrow(data_set)), size = smp_size)
test <- finaldata[-train_ind, ]
train <- finaldata[train_ind, ]

# Working on the Logistic regression

In [178]:
prediction <- select(data_set[-train_ind, ], -comment_text)
train_label <- select(data_set[train_ind, ], -comment_text)
test_label <- prediction

In [179]:
for (label in targets){
    y_train <- train_label[[label]]
    model <- cv.glmnet(train,  factor(y_train), family="binomial", nfolds = 4, parallel = T,  nlambda = 100, alpha=0)
    prediction[[label]] <- predict(model, test, type = "response", s = "lambda.min")
}

In [180]:
prediction[1:5,]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,0000997932d777bf,0.0189705568,0.0019188034,0.0047913665,0.0007309999,0.0037624869,0.001927884
5,0001d958c54c6e35,0.0286691487,0.0009923556,0.0323233664,0.0004283634,0.0255175482,0.0040011252
7,0002bcb3da6cb337,0.9965078771,0.0758724728,0.9635744116,0.0022519545,0.594512517,0.0099592776
11,0005300084f90edc,0.0002839812,0.0004026656,0.0009164614,0.0001453301,0.0003700149,0.0002089342
15,00070ef96486d6f9,0.0755905024,0.0018826387,0.0129517935,0.0008172589,0.0129216651,0.0050152329


In [181]:
thresh = 0.5
ss2 <- sweep(as.matrix(prediction[,-1]),MARGIN=2,STATS=thresh,
             FUN=function(x,y) ifelse(x<y,0,1))

In [182]:
prediction <- data.frame(id=prediction$id,ss2)

In [183]:
prediction[1:5,]
test_label[1:5,]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,0000997932d777bf,0,0,0,0,0,0
5,0001d958c54c6e35,0,0,0,0,0,0
7,0002bcb3da6cb337,1,0,1,0,1,0
11,0005300084f90edc,0,0,0,0,0,0
15,00070ef96486d6f9,0,0,0,0,0,0


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
1,0000997932d777bf,0,0,0,0,0,0
5,0001d958c54c6e35,0,0,0,0,0,0
7,0002bcb3da6cb337,1,1,1,0,1,0
11,0005300084f90edc,0,0,0,0,0,0
15,00070ef96486d6f9,0,0,0,0,0,0


In [184]:
prediction_w_id <- select(prediction, -id)
test_label_w_id <- select(test_label, -id)

In [187]:
for (label in targets) {
    cat("\n\n", label, "\n\n")
    print(confusionMatrix(test_label_w_id[[label]], prediction_w_id[[label]]))
}



 toxic 

Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 28613   279
         1  1100  1923
                                         
               Accuracy : 0.9568         
                 95% CI : (0.9545, 0.959)
    No Information Rate : 0.931          
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.7132         
 Mcnemar's Test P-Value : < 2.2e-16      
                                         
            Sensitivity : 0.9630         
            Specificity : 0.8733         
         Pos Pred Value : 0.9903         
         Neg Pred Value : 0.6361         
             Prevalence : 0.9310         
         Detection Rate : 0.8965         
   Detection Prevalence : 0.9053         
      Balanced Accuracy : 0.9181         
                                         
       'Positive' Class : 0              
                                         


 severe_toxic 

Confusion