Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
23 changed files
with
1,453 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
data-raw | ||
^\.travis\.yml$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r | ||
|
||
language: R | ||
sudo: false | ||
cache: packages |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
Package: NLoN | ||
Type: Package | ||
Title: Natural Language or Not | ||
Version: 0.1.0 | ||
Date: 2018-03-08 | ||
Author: Mika Mäntylä <mika.mantyla@oulu.fi>, Fabio Calefato | ||
<fabio.calefato@uniba.it>, Maëlick Claes <himself@maelick.net> | ||
Maintainer: Maëlick Claes <himself@maelick.net> | ||
Description: Identify whether text lines are natural language or not | ||
using machine learning. | ||
Depends: data.table, R (>= 3.0) | ||
Imports: text2vec, tokenizers, glmnet, Matrix, modules, stats | ||
License: GPL-3 | ||
Encoding: UTF-8 | ||
Remotes: M3SOulu/NLoN | ||
Suggests: testthat, plyr | ||
RoxygenNote: 6.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(Character3Grams) | ||
export(FeatureExtraction) | ||
export(NLoN) | ||
export(NLoNModel) | ||
export(NLoNPredict) | ||
export(TriGramsAndFeatures) | ||
export(features) | ||
import(data.table) | ||
importFrom(stats,predict) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#' NLoN: Natural Language or Not | ||
#' | ||
#' NLoN identifies whether text lines are natural language or not | ||
#' using a glmnet model with simple text features and character | ||
#' 3-grams. | ||
#' | ||
#' @examples | ||
#' | ||
#' ## Training data provided in the package. | ||
#' data(nlon.data) | ||
#' | ||
#' ## Build a model with glmnet | ||
#' model <- with(nlon.data, NLoNModel(text, rater2, TriGramsAndFeatures)) | ||
#' | ||
#' ## Use the model to predict new data.
#' topredict <- c("This is natural language.", "not(natural, language);") | ||
#' NLoNPredict(model, topredict, 0.1, features=FeatureExtraction) | ||
#' | ||
#' ## Train and predict in a single function call. | ||
#' NLoN(rbind(nlon.data[, list(text, response=rater2)], | ||
#' list(text=topredict), fill=TRUE), | ||
#' 0.1, features=FeatureExtraction) | ||
#' | ||
#' @docType package | ||
#' @name NLoN | ||
#' @import data.table | ||
#' @importFrom stats predict | ||
NULL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#' Training data used in the NLoN paper. | ||
#' | ||
#' A dataset containing 2000 lines of text from Mozilla Firefox, | ||
#' Lucene and Kubertenes datasets alongside two response variables | ||
#' from two different raters. | ||
#' | ||
#' @format A data frame with 6000 rows and 4 columns: | ||
#' \describe{ | ||
#' \item{source}{source from the text (mozilla, kubertenes or lucene).} | ||
#' \item{text}{line of text.} | ||
#' \item{rater1}{response from the first rater.}
#' \item{rater2}{response from the second rater.}
#' } | ||
#' @source \url{https://bugzilla.mozilla.org/} | ||
#' \url{http://www.kubertenes/} | ||
#' \url{http://lucene} | ||
"nlon.data" |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,255 @@ | ||
## Locate the bundled stopword list (MySQL stopwords with code-like
## words removed) shipped in inst/extdata; mustWork errors early if
## the package is installed without it.
mysql.stopwords <- system.file("extdata", "mysql_sw_wo_code_words.txt",
                               package="NLoN", mustWork=TRUE)
## One word per line, no header: V1 is the single unnamed column, so
## this yields a character vector used by the features module below.
mysql.stopwords <- read.csv(mysql.stopwords, stringsAsFactors=FALSE,
                            header=FALSE)$V1

## NOTE(review): emoji data appears to have been loaded at some point
## but is currently unused; kept commented out for reference.
## emojis <- system.file("extdata", "emojis.csv",
##                       package="NLoN", mustWork=TRUE)
## emojis <- "data/emojis.csv"
## emojis <- fread(emojis)
|
||
ConvertFeatures <- function(data) {
  ## Coerce feature data into a numeric matrix or Matrix object.
  ## A list is first turned into a data.table (all elements must have
  ## equal length), a data.frame into a plain matrix. Anything that is
  ## not a numeric matrix/Matrix at the end is rejected.
  if (is.list(data)) {
    value.lengths <- sapply(data, length)
    if (length(unique(value.lengths)) != 1) {
      stop("feature values don't have the same length")
    }
    data <- as.data.table(data)
  }
  if (is.data.frame(data)) {
    data <- as.matrix(data)
  }
  is.valid <- (is.matrix(data) && is.numeric(data)) || inherits(data, "Matrix")
  if (!is.valid) {
    stop("feature values are not a numeric matrix")
  }
  data
}
|
||
ComputeFeatures <- function(text, features) {
  ## Compute feature values for a character vector of text lines.
  ##
  ## If features is a function, it is applied directly to the text and
  ## must return a list, data.frame, matrix or Matrix of numeric
  ## values. If features is a list of functions, each one is applied
  ## to the text and must return a numeric vector of the same length
  ## as text; an unnamed list triggers a warning since the resulting
  ## columns would be unnamed.
  ##
  ## Returns the computed feature values (shape depends on features).
  if (is.function(features)) {
    data <- features(text)
  } else if (is.list(features) && all(sapply(features, is.function))) {
    if (is.null(names(features))) {
      warning("features is a list of functions without names")
    }
    data <- sapply(features, function(f) f(text))
  } else stop("features must be a function or a list of functions")
  ## Return explicitly: previously the function's value was the
  ## (invisible) result of an assignment, so results did not
  ## auto-print at top level.
  data
}
|
||
#' Feature extraction.
#'
#' Computes a set of simple text-based features.
#'
#' The features computed are the following:
#' \describe{
#'   \item{\code{ratio.caps}}{The ratio of uppercase letters.}
#'   \item{\code{ratio.specials}}{The ratio of special characters.}
#'   \item{\code{ratio.numbers}}{The ratio of number characters.}
#'   \item{\code{length.words}}{The average word length.}
#'   \item{\code{stopwords}}{The ratio of English stopwords (using first
#'     tokenizer).}
#'   \item{\code{stopwords2}}{The ratio of English stopwords (using second
#'     tokenizer).}
#'   \item{\code{last.char.nl}}{Boolean for the use of NL character at the
#'     end of the text.}
#'   \item{\code{last.char.code}}{Boolean for the use of code character at
#'     the end of text.}
#'   \item{\code{first.3.chars.letters}}{Number of letters in the three
#'     first characters.}
#'   \item{\code{emoticons}}{Number of emoticons.}
#'   \item{\code{first.char.at}}{Boolean for the use of @ character at
#'     the beginning of the line.}
#' }
#'
#' @param text The text.
#' @return A data.table with values of the 11 features.
#' @export
FeatureExtraction <- function(text) {
  ## Map output column names to the individual feature functions from
  ## the 'features' module. The local list is named 'feats' (the
  ## previous name 'features' shadowed the module it reads from), and
  ## the unused 'data <- data.table(text)' line has been removed.
  feats <- list(ratio.caps=features$CapsRatio,
                ratio.specials=features$SpecialCharsRatio,
                ratio.numbers=features$NumbersRatio,
                length.words=features$AverageWordLength,
                stopwords=features$StopwordsRatio1,
                stopwords2=features$StopwordsRatio2,
                last.char.code=features$LastCharCode,
                last.char.nl=features$LastCharNL,
                ## Fixed lookup: the module defines First3CharsLetters
                ## (plural); the singular spelling relied on partial
                ## matching at best and could yield NULL.
                first.3.chars.letters=features$First3CharsLetters,
                emoticons=features$Emoticons,
                first.char.at=features$StartWithAt)
  as.data.table(ComputeFeatures(text, feats))
}
|
||
#' Character 3-gram extraction.
#'
#' Computes the document term matrix of character 3-grams.
#'
#' @param text The text.
#' @return A document term matrix (sparse Matrix).
#' @export
Character3Grams <- function(text) {
  ## Normalize digits to 0 and strip the \032 (substitute) character
  ## before tokenization.
  CleanText <- function(x) {
    gsub("[0-9]", "0", gsub("\\\032", "", x))
  }
  ## Overlapping, lowercased character 3-grams, keeping
  ## non-alphanumeric characters.
  ShingleTokenizer <- function(x) {
    tokenizers::tokenize_character_shingles(x, n=3, strip_non_alphanum=FALSE,
                                            lowercase=TRUE)
  }
  tokens <- text2vec::itoken(text, tokenizer=ShingleTokenizer,
                             preprocessor=CleanText,
                             progressbar=TRUE)
  vocabulary <- text2vec::create_vocabulary(tokens)
  text2vec::create_dtm(tokens, text2vec::vocab_vectorizer(vocabulary))
}
|
||
#' 3-grams and feature extraction.
#'
#' Computes both character 3-grams and simple text features.
#'
#' @param text The text.
#' @return A sparse Matrix with text features and 3-gram.
#' @seealso \code{\link{Character3Grams}}
#' @seealso \code{\link{FeatureExtraction}}
#' @export
TriGramsAndFeatures <- function(text) {
  ## Compute each feature set, then bind them column-wise: the sparse
  ## 3-gram document term matrix first, then the dense simple features.
  trigrams <- Character3Grams(text)
  simple <- as.matrix(FeatureExtraction(text))
  cbind(trigrams, simple)
}
|
||
#' Features.
#'
#' Module containing functions for individual simple text feature
#' extraction.
#'
#' Most functions have a single \code{text} parameter. The module
#' contains the following functions:
#'
#' \describe{
#'   \item{\code{Stopwords}}{Number of stopwords. Uses two optional
#'     parameters: \code{Tokenize} which is the word tokenizer to use
#'     and \code{stopwords} which is the list of stopwords to use.}
#'   \item{\code{Tokenize1}}{First tokenizer available for
#'     \code{Stopwords}.}
#'   \item{\code{Tokenize2}}{Second tokenizer available for
#'     \code{Stopwords}.}
#'   \item{\code{StopwordsRatio1}}{Ratio of stopwords using \code{Tokenize1}.}
#'   \item{\code{StopwordsRatio2}}{Ratio of stopwords using \code{Tokenize2}.}
#'   \item{\code{Caps}}{Number of uppercase letters.}
#'   \item{\code{CapsRatio}}{Ratio of uppercase letters.}
#'   \item{\code{SpecialChars}}{Number of special characters.}
#'   \item{\code{SpecialCharsRatio}}{Ratio of special characters.}
#'   \item{\code{Numbers}}{Number of digit characters.}
#'   \item{\code{NumbersRatio}}{Ratio of digit characters.}
#'   \item{\code{Words}}{Number of words.}
#'   \item{\code{AverageWordLength}}{Average word length.}
#'   \item{\code{LastCharCode}}{Boolean for the use of a code character at the
#'     end of the text.}
#'   \item{\code{LastCharNL}}{Boolean for the use of natural language
#'     punctuation at the end of the text.}
#'   \item{\code{First3Chars}}{Returns the first three non white characters.}
#'   \item{\code{First3CharsLetters}}{The number of the three first non white
#'     characters that are letters.}
#'   \item{\code{Emoticons}}{The number of emoticons.}
#'   \item{\code{StartWithAt}}{Boolean for the use of @ at the start of
#'     the text.}
#' }
#'
#' @export
features <- modules::module({
  ## Export every name that does not start with a dot.
  modules::export("^[^.]")

  .CountRegexMatches <- function(text, re) {
    ## Count the number of matches of a regex in each element of text.
    ## str_match_all returns one matrix per element; with no capture
    ## groups each matrix has a single column, so its length equals
    ## the number of matches.
    sapply(stringr::str_match_all(text, re), length)
  }

  Tokenize1 <- function(text) {
    ## Whitespace tokenizer on lowercased text.
    ## TODO (from original): add punctuation as separator (at least ".").
    lapply(tolower(text), tokenizers::tokenize_regex,
           pattern="\\s+", simplify=TRUE)
  }

  Tokenize2 <- function(text) {
    ## Word tokenizer from the tokenizers package.
    tokenizers::tokenize_words(text, simplify=TRUE)
  }

  Stopwords <- function(text, Tokenize=Tokenize1, stopwords=mysql.stopwords) {
    ## Number of stopwords present in text based on a given Tokenize
    ## function. Bug fix: the default was Tokenize=Tokenize, a
    ## self-referential default argument that fails with "promise
    ## already under evaluation" whenever the default is relied upon;
    ## it now defaults to Tokenize1.
    sapply(Tokenize(text), function(words) sum(words %in% stopwords))
  }

  Caps <- function(text) {
    ## Number of uppercase (ASCII) letters.
    .CountRegexMatches(text, "[A-Z]")
  }

  SpecialChars <- function(text) {
    ## Number of special characters (neither letter, digit nor space).
    .CountRegexMatches(text, "[^a-zA-Z\\d\\s]")
  }

  Numbers <- function(text) {
    ## Number of digits.
    .CountRegexMatches(text, "[\\d]")
  }

  ## Ratios of the counts above relative to text length in characters.
  CapsRatio <- function(text) Caps(text) / nchar(text)
  SpecialCharsRatio <- function(text) SpecialChars(text) / nchar(text)
  NumbersRatio <- function(text) Numbers(text) / nchar(text)
  ## Stopword ratios relative to the number of words, one per tokenizer.
  StopwordsRatio1 <- function(text) Stopwords(text, Tokenize1) / Words(text)
  StopwordsRatio2 <- function(text) Stopwords(text, Tokenize2) / Words(text)

  Words <- function(text) {
    ## Number of words, approximated as whitespace separators + 1.
    ## NOTE(review): inside a character class, [\s+] also matches a
    ## literal "+" — possibly unintended, but kept as-is so features
    ## stay compatible with models trained on them.
    ## .CountRegexMatches(text, "\\w")
    .CountRegexMatches(text, "[\\s+]") + 1
  }

  AverageWordLength <- function(text) {
    ## Average word length in characters.
    nchar(text) / Words(text)
  }

  LastCharCode <- function(text) {
    ## Boolean whether the last character looks like code: ")", "{"
    ## or ";". Lines ending with an emoticon such as :-) are excluded
    ## first, since R's regexes here lack lookahead which would
    ## otherwise solve this in one expression.
    (!grepl("(:-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\()$", text) &
     grepl("[){;]$", text))
  }

  LastCharNL <- function(text) {
    ## Last character is natural-language punctuation (. ! ? : ,).
    grepl("\\.$|\\!$|\\?$|:$|,$", text)
  }

  First3Chars <- function(text) {
    ## First three characters (after stripping all whitespace).
    substr(gsub("\\s", "", text), 1, 3)
  }

  First3CharsLetters <- function(text) {
    ## Number of letters among the first three non-white characters.
    .CountRegexMatches(First3Chars(text), "[a-zA-Z]")
  }

  Emoticons <- function(text) {
    ## Number of emoticons. Using a larger list of emoticons (e.g. one
    ## built for SentiStrength) might cause more false positives as
    ## some of them are similar to elements that appear in code.
    .CountRegexMatches(text, ":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\(")
  }

  StartWithAt <- function(text) {
    ## TRUE if text starts with the @ symbol.
    grepl("^@", text)
  }
})
Oops, something went wrong.