diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..4051aec --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@
+^data-raw$
+^\.travis\.yml$
diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8d139ac --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@
+# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
+
+language: R
+sudo: false
+cache: packages
diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..c9caf8a --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,17 @@
+Package: NLoN
+Type: Package
+Title: Natural Language or Not
+Version: 0.1.0
+Date: 2018-03-08
+Author: Mika Mäntylä, Fabio Calefato, Maëlick Claes
+Maintainer: Maëlick Claes
+Description: Identify whether text lines are natural language or not
+    using machine learning.
+Depends: data.table, R (>= 3.0)
+Imports: text2vec, tokenizers, glmnet, Matrix, modules, stats, stringr
+License: GPL-3
+Encoding: UTF-8
+Remotes: M3SOulu/NLoN
+Suggests: testthat, plyr
+RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..7e19881 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,11 @@
+# Generated by roxygen2: do not edit by hand
+
+export(Character3Grams)
+export(FeatureExtraction)
+export(NLoN)
+export(NLoNModel)
+export(NLoNPredict)
+export(TriGramsAndFeatures)
+export(features)
+import(data.table)
+importFrom(stats,predict)
diff --git a/R/NLoN.R b/R/NLoN.R new file mode 100644 index 0000000..44aad28 --- /dev/null +++ b/R/NLoN.R @@ -0,0 +1,28 @@
+#' NLoN: Natural Language or Not
+#'
+#' NLoN identifies whether text lines are natural language or not
+#' using a glmnet model with simple text features and character
+#' 3-grams.
+#'
+#' @examples
+#'
+#' ## Training data provided in the package.
+#' data(nlon.data)
+#'
+#' ## Build a model with glmnet.
+#' model <- with(nlon.data, NLoNModel(text, rater2, TriGramsAndFeatures))
+#'
+#' ## Use the model to predict new data.
+#' topredict <- c("This is natural language.", "not(natural, language);")
+#' NLoNPredict(model, topredict, 0.1, features=FeatureExtraction)
+#'
+#' ## Train and predict in a single function call.
+#' NLoN(rbind(nlon.data[, list(text, response=rater2)],
+#'            list(text=topredict), fill=TRUE),
+#'      0.1, features=FeatureExtraction)
+#'
+#' @docType package
+#' @name NLoN
+#' @import data.table
+#' @importFrom stats predict
+NULL
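A quick way to sanity-check the bundled training data against the documentation in R/data.R that follows. This is an illustrative sketch; the 2000-lines-per-source split is implied by the docs rather than asserted by any test:

library(NLoN)
data(nlon.data)
str(nlon.data)                          # columns: source, text, rater1, rater2
table(nlon.data$source)                 # expect 2000 lines per source
with(nlon.data, table(rater1, rater2))  # inter-rater agreement counts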
diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..3134a5a --- /dev/null +++ b/R/data.R @@ -0,0 +1,17 @@
+#' Training data used in the NLoN paper.
+#'
+#' A dataset containing 6000 lines of text (2000 each from the Mozilla
+#' Firefox, Kubernetes and Lucene datasets) alongside two response
+#' variables from two different raters.
+#'
+#' @format A data frame with 6000 rows and 4 columns:
+#' \describe{
+#'   \item{source}{source of the text (mozilla, kubernetes or lucene).}
+#'   \item{text}{line of text.}
+#'   \item{rater1}{response from the first rater.}
+#'   \item{rater2}{response from the second rater.}
+#' }
+#' @source \url{https://bugzilla.mozilla.org/}
+#' \url{https://kubernetes.io/}
+#' \url{https://lucene.apache.org/}
+"nlon.data"
diff --git a/R/evaluation.R b/R/evaluation.R new file mode 100644 index 0000000..e69de29
diff --git a/R/features.R b/R/features.R new file mode 100644 index 0000000..5e8dd47 --- /dev/null +++ b/R/features.R @@ -0,0 +1,255 @@
+mysql.stopwords <- system.file("extdata", "mysql_sw_wo_code_words.txt",
+                               package="NLoN", mustWork=TRUE)
+mysql.stopwords <- read.csv(mysql.stopwords, stringsAsFactors=FALSE,
+                            header=FALSE)$V1
+
+## emojis <- system.file("extdata", "emojis.csv",
+##                       package="NLoN", mustWork=TRUE)
+## emojis <- "data/emojis.csv"
+## emojis <- fread(emojis)
+
+ConvertFeatures <- function(data) {
+  ## Make sure that the feature data is a matrix or Matrix object.
+  ## Converts a list into a data.table and a data.frame into a matrix.
+  if (is.list(data)) {
+    if (length(unique(sapply(data, length))) == 1) {
+      data <- as.data.table(data)
+    } else stop("feature values don't have the same length")
+  }
+  if (is.data.frame(data)) {
+    data <- as.matrix(data)
+  }
+  if ((is.matrix(data) && is.numeric(data)) || inherits(data, "Matrix")) {
+    data
+  } else stop("feature values are not a numeric matrix")
+}
+
+ComputeFeatures <- function(text, features) {
+  ## Compute features. If features is a function, it is simply applied
+  ## to the text (and must return a list, data.frame, matrix or Matrix
+  ## of numeric values). If features is a list of functions, sapply the
+  ## functions, each of which must return a numeric vector of the same
+  ## length as text.
+  if (is.function(features)) {
+    data <- features(text)
+  } else if (is.list(features) && all(sapply(features, is.function))) {
+    if (is.null(names(features))) {
+      warning("features is a list of functions without names")
+    }
+    data <- sapply(features, function(f) f(text))
+    ## sapply simplifies to a plain vector when text has length one;
+    ## keep the one-row matrix shape in that case.
+    if (!is.matrix(data)) {
+      data <- matrix(data, nrow=1, dimnames=list(NULL, names(data)))
+    }
+  } else stop("features must be a function or a list of functions")
+  data
+}
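ComputeFeatures and ConvertFeatures are internal, but their contract matters to anyone passing a custom features= argument to the exported functions: either one function returning a matrix-like object, or a named list of per-feature functions each returning one numeric vector per line of text. A minimal sketch (using ::: since neither helper is exported):

library(NLoN)
text <- c("Hello world.", "x <- 1; y <- 2")
feats <- list(caps=features$Caps, words=features$Words)
m <- NLoN:::ConvertFeatures(NLoN:::ComputeFeatures(text, feats))
dim(m)  # 2 rows (one per line of text), 2 columns (one per feature)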
+#' Feature extraction.
+#'
+#' Computes a set of simple text-based features.
+#'
+#' The features computed are the following:
+#' \describe{
+#'   \item{\code{ratio.caps}}{The ratio of uppercase letters.}
+#'   \item{\code{ratio.specials}}{The ratio of special characters.}
+#'   \item{\code{ratio.numbers}}{The ratio of number characters.}
+#'   \item{\code{length.words}}{The average word length.}
+#'   \item{\code{stopwords}}{The ratio of English stopwords (using the first
+#'   tokenizer).}
+#'   \item{\code{stopwords2}}{The ratio of English stopwords (using the second
+#'   tokenizer).}
+#'   \item{\code{last.char.code}}{Boolean for the use of a code character at
+#'   the end of the text.}
+#'   \item{\code{last.char.nl}}{Boolean for the use of a natural language
+#'   (punctuation) character at the end of the text.}
+#'   \item{\code{first.3.chars.letters}}{Number of letters among the first
+#'   three non-whitespace characters.}
+#'   \item{\code{emoticons}}{Number of emoticons.}
+#'   \item{\code{first.char.at}}{Boolean for the use of the @ character at
+#'   the beginning of the line.}
+#' }
+#'
+#' @param text The text.
+#' @return A data.table with values of the 11 features.
+#' @export
+FeatureExtraction <- function(text) {
+  ## Named list of feature functions; called flist to avoid shadowing
+  ## the features module it draws from.
+  flist <- list(ratio.caps=features$CapsRatio,
+                ratio.specials=features$SpecialCharsRatio,
+                ratio.numbers=features$NumbersRatio,
+                length.words=features$AverageWordLength,
+                stopwords=features$StopwordsRatio1,
+                stopwords2=features$StopwordsRatio2,
+                last.char.code=features$LastCharCode,
+                last.char.nl=features$LastCharNL,
+                first.3.chars.letters=features$First3CharsLetters,
+                emoticons=features$Emoticons,
+                first.char.at=features$StartWithAt)
+  as.data.table(ComputeFeatures(text, flist))
+}
+
+#' Character 3-gram extraction.
+#'
+#' Computes the document term matrix of character 3-grams.
+#'
+#' @param text The text.
+#' @return A document term matrix (sparse Matrix).
+#' @export
+Character3Grams <- function(text) {
+  Preprocessor <- function(x) {
+    ## Map every digit to 0 and strip the \032 (substitute) control character.
+    gsub("[0-9]", "0", gsub("\\\032", "", x))
+  }
+  Tokenizer <- function(x) {
+    tokenizers::tokenize_character_shingles(x, n=3, strip_non_alphanum=FALSE,
+                                            lowercase=TRUE)
+  }
+  it <- text2vec::itoken(text, tokenizer=Tokenizer,
+                         preprocessor=Preprocessor,
+                         progressbar=TRUE)
+  vocab <- text2vec::create_vocabulary(it)
+  vectorizer <- text2vec::vocab_vectorizer(vocab)
+  text2vec::create_dtm(it, vectorizer)
+}
+
+#' 3-grams and feature extraction.
+#'
+#' Computes both character 3-grams and simple text features.
+#'
+#' @param text The text.
+#' @return A sparse Matrix with text features and 3-grams.
+#' @seealso \code{\link{Character3Grams}}
+#' @seealso \code{\link{FeatureExtraction}}
+#' @export
+TriGramsAndFeatures <- function(text) {
+  cbind(Character3Grams(text), as.matrix(FeatureExtraction(text)))
+}
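A quick way to see what Character3Grams produces. The exact vocabulary and column ordering are determined by text2vec, so treat the column set as illustrative:

library(NLoN)
dtm <- Character3Grams(c("abc abd", "abc!"))
dim(dtm)    # 2 documents x number of distinct 3-grams
class(dtm)  # sparse dgCMatrix from the Matrix package
x <- TriGramsAndFeatures("for (i in 1:10) print(i)")
ncol(x)     # 3-gram columns plus the 11 hand-crafted features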
+#' Features.
+#'
+#' Module containing functions for individual simple text feature
+#' extraction.
+#'
+#' Most functions have a single \code{text} parameter. The module
+#' contains the following functions:
+#'
+#' \describe{
+#'   \item{\code{Stopwords}}{Number of stopwords. Uses two optional
+#'   parameters: \code{Tokenize}, the word tokenizer to use, and
+#'   \code{stopwords}, the list of stopwords to use.}
+#'   \item{\code{Tokenize1}}{First tokenizer available for
+#'   \code{Stopwords}.}
+#'   \item{\code{Tokenize2}}{Second tokenizer available for
+#'   \code{Stopwords}.}
+#'   \item{\code{StopwordsRatio1}}{Ratio of stopwords using \code{Tokenize1}.}
+#'   \item{\code{StopwordsRatio2}}{Ratio of stopwords using \code{Tokenize2}.}
+#'   \item{\code{Caps}}{Number of uppercase letters.}
+#'   \item{\code{CapsRatio}}{Ratio of uppercase letters.}
+#'   \item{\code{SpecialChars}}{Number of special characters.}
+#'   \item{\code{SpecialCharsRatio}}{Ratio of special characters.}
+#'   \item{\code{Numbers}}{Number of digit characters.}
+#'   \item{\code{NumbersRatio}}{Ratio of digit characters.}
+#'   \item{\code{Words}}{Number of words.}
+#'   \item{\code{AverageWordLength}}{Average word length.}
+#'   \item{\code{LastCharCode}}{Boolean for the use of a code character at the
+#'   end of the text.}
+#'   \item{\code{LastCharNL}}{Boolean for the use of a natural language
+#'   (punctuation) character at the end of the text.}
+#'   \item{\code{First3Chars}}{Returns the first three non-whitespace
+#'   characters.}
+#'   \item{\code{First3CharsLetters}}{Number of letters among the first three
+#'   non-whitespace characters.}
+#'   \item{\code{Emoticons}}{The number of emoticons.}
+#'   \item{\code{StartWithAt}}{Boolean for the use of @ at the start of
+#'   the text.}
+#' }
+#'
+#' @export
+features <- modules::module({
+  modules::export("^[^.]")
+
+  .CountRegexMatches <- function(text, re) {
+    ## Count the number of matches of a regex.
+    sapply(stringr::str_match_all(text, re), length)
+  }
+
+  Tokenize1 <- function(text) {
+    ## Needs fixing: also split on punctuation (at least ".").
+    lapply(tolower(text), tokenizers::tokenize_regex,
+           pattern="\\s+", simplify=TRUE)
+  }
+
+  Tokenize2 <- function(text) {
+    tokenizers::tokenize_words(text, simplify=TRUE)
+  }
+
+  Stopwords <- function(text, Tokenize=Tokenize1, stopwords=mysql.stopwords) {
+    ## Compute the number of stopwords present in the text using the
+    ## given Tokenize function.
+    sapply(Tokenize(text), function(words) sum(words %in% stopwords))
+  }
+
+  Caps <- function(text) {
+    ## Number of uppercase characters.
+    .CountRegexMatches(text, "[A-Z]")
+  }
+
+  SpecialChars <- function(text) {
+    ## Number of special characters.
+    .CountRegexMatches(text, "[^a-zA-Z\\d\\s]")
+  }
+
+  Numbers <- function(text) {
+    ## Number of digits.
+    .CountRegexMatches(text, "[\\d]")
+  }
+
+  CapsRatio <- function(text) Caps(text) / nchar(text)
+  SpecialCharsRatio <- function(text) SpecialChars(text) / nchar(text)
+  NumbersRatio <- function(text) Numbers(text) / nchar(text)
+  StopwordsRatio1 <- function(text) Stopwords(text, Tokenize1) / Words(text)
+  StopwordsRatio2 <- function(text) Stopwords(text, Tokenize2) / Words(text)
+
+  Words <- function(text) {
+    ## Number of words, counted as whitespace separators plus one (kept
+    ## as-is to stay consistent with the original NLoN features).
+    .CountRegexMatches(text, "[\\s+]") + 1
+  }
+
+  AverageWordLength <- function(text) {
+    nchar(text) / Words(text)
+  }
+
+  LastCharCode <- function(text) {
+    ## TRUE when the last character looks like code. Lines ending with
+    ## an emoticon are excluded first :-) (a lookahead regex with
+    ## perl=TRUE could express this in a single pattern).
+    (!grepl("(:-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\()$", text) &
+     grepl("[){;]$", text))
+  }
+
+  LastCharNL <- function(text) {
+    ## Last character is natural language punctuation.
+    grepl("\\.$|\\!$|\\?$|:$|,$", text)
+  }
+
+  First3Chars <- function(text) {
+    ## First three characters (after stripping whitespace).
+    substr(gsub("\\s", "", text), 1, 3)
+  }
+
+  First3CharsLetters <- function(text) {
+    ## Number of letters among the first three characters.
+    .CountRegexMatches(First3Chars(text), "[a-zA-Z]")
+  }
+
+  Emoticons <- function(text) {
+    ## Number of emoticons. Using a larger list of emoticons, e.g. one
+    ## built for SentiStrength, might cause more false positives as
+    ## some of them resemble elements that appear in code.
+    .CountRegexMatches(text, ":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\(")
+  }
+
+  StartWithAt <- function(text) {
+    ## TRUE if the text starts with the @ symbol.
+    grepl("^@", text)
+  }
+})
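All of the module functions are vectorized over text, so they are easy to spot-check interactively. The expected values below follow from the regexes in the module above (a sketch, not output from the package's test suite):

library(NLoN)
features$CapsRatio(c("ABC", "abcd"))     # 1.0 0.0
features$LastCharCode("foo(bar);")       # TRUE
features$LastCharCode("I am happy :-)")  # FALSE: emoticon, not code
features$Emoticons(":) :( :-)")          # 3
features$StartWithAt("@reviewer ping")   # TRUE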
diff --git a/R/model.R b/R/model.R new file mode 100644 index 0000000..7fc94d5 --- /dev/null +++ b/R/model.R @@ -0,0 +1,84 @@
+#' NLoN Model.
+#'
+#' Train a NLoN model using glmnet.
+#'
+#' @param text A character vector containing the training text.
+#' @param response A factor response with levels NL (for natural
+#' language) and Not (for not natural language).
+#' @param features A function computing the feature values (a matrix,
+#' list of numeric vectors or data.frame) or a list of functions
+#' computing individual feature values.
+#' @param alpha The elasticnet mixing parameter used by
+#' \code{\link[glmnet]{glmnet}}.
+#' @param ... Additional parameters to pass to
+#' \code{\link[glmnet]{glmnet}}.
+#' @return A \code{\link[glmnet]{glmnet}} trained model.
+#' @seealso \code{\link[glmnet]{glmnet}}
+#' @export
+NLoNModel <- function(text, response, features=TriGramsAndFeatures,
+                      alpha=1, ...) {
+  data <- ComputeFeatures(text, features)
+  glmnet::glmnet(x=ConvertFeatures(data), y=response, family="binomial",
+                 alpha=alpha, ...)
+}
+
+#' NLoN Prediction.
+#'
+#' Predict whether text lines are natural language with a trained
+#' NLoN model.
+#'
+#' @param model A glmnet model as returned by \code{\link{NLoNModel}}.
+#' @param text A character vector containing the text to predict.
+#' @param lambda Lambda parameter to pass to
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param type Type of prediction made by
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param features A function computing the feature values (a matrix,
+#' list of numeric vectors or data.frame) or a list of functions
+#' computing individual feature values. Must match the features used
+#' to train \code{model}.
+#' @return The output of \code{\link[glmnet]{predict.glmnet}}.
+#' @seealso \code{\link[glmnet]{predict.glmnet}}
+#' @export
+NLoNPredict <- function(model, text, lambda=NULL, type="class",
+                        features=TriGramsAndFeatures) {
+  data <- ConvertFeatures(ComputeFeatures(text, features))
+  ## Add all-zero columns for 3-grams seen during training but absent
+  ## from the new text, then reorder the columns to match the model.
+  missing <- setdiff(rownames(model$beta), colnames(data))
+  if (length(missing)) {
+    data <- cbind(Matrix::sparseMatrix(i=integer(0), j=integer(0),
+                                       dims=c(nrow(data), length(missing)),
+                                       dimnames=list(NULL, missing)), data)
+  }
+  data <- data[, rownames(model$beta)]
+  predict(model, data, s=lambda, type=type)
+}
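NLoNModel is a thin wrapper around glmnet, so standard glmnet tooling applies. In particular, the lambda=0.1 used in the examples is illustrative; cross-validation is the principled way to choose it. A sketch using cv.glmnet with its default parameters (the internal helpers need ::: since they are not exported):

library(NLoN)
data(nlon.data)
model <- with(nlon.data, NLoNModel(text, rater2))
x <- NLoN:::ConvertFeatures(NLoN:::ComputeFeatures(nlon.data$text,
                                                   TriGramsAndFeatures))
cv <- glmnet::cv.glmnet(x, nlon.data$rater2, family="binomial")
NLoNPredict(model, c("Thanks, merged!", "stack <- c(stack, x)"),
            lambda=cv$lambda.min)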
+#' NLoN Training and Prediction.
+#'
+#' Trains a NLoN model and returns predictions for the data without a
+#' response.
+#'
+#' The data.frame must contain a column \code{text} with both training
+#' and test data and a column \code{response} with the response value
+#' (factor with levels NL and Not). The response is NA for test data.
+#'
+#' @param data A data.frame containing the training and test data.
+#' @param lambda Lambda parameter to pass to
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param type Type of prediction made by
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param features A function computing the feature values (a matrix,
+#' list of numeric vectors or data.frame) or a list of functions
+#' computing individual feature values.
+#' @param ... Additional parameters to pass to \code{NLoNModel}.
+#' @return A vector of length \code{sum(is.na(data$response))} with
+#' the prediction of the test data.
+#' @seealso \code{\link{NLoNModel}}
+#' @seealso \code{\link{NLoNPredict}}
+#' @seealso \code{\link[glmnet]{glmnet}}
+#' @seealso \code{\link[glmnet]{predict.glmnet}}
+#' @export
+NLoN <- function(data, lambda=NULL, type="class",
+                 features=TriGramsAndFeatures, ...) {
+  data <- as.data.table(data)
+  model <- data[!is.na(response), NLoNModel(text, response, features, ...)]
+  NLoNPredict(model, data[is.na(response), text], lambda, type, features)
+}
diff --git a/data/nlon.data.rda b/data/nlon.data.rda new file mode 100644 index 0000000..f9bd106 Binary files /dev/null and b/data/nlon.data.rda differ
diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..a742283 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,20 @@
+citHeader("To cite NLoN in publications use:")
+
+citEntry(entry = "InProceedings",
+         title = "Natural Language or Not (NLoN) - A Package for Software Engineering Text Analysis Pipeline",
+         author = personList(as.person("Mika Mäntylä"),
+                             as.person("Fabio Calefato"),
+                             as.person("Maëlick Claes")),
+         booktitle = "Proceedings of the International Conference on Mining Software Repositories (MSR)",
+         year = "2018",
+         url = "http://TODO",
+
+         textVersion =
+         paste("Mika Mäntylä, Fabio Calefato, Maëlick Claes (2018).",
+               "Natural Language or Not (NLoN) - A Package for",
+               "Software Engineering Text Analysis Pipeline.",
+               "In Proceedings of the International Conference on",
+               "Mining Software Repositories (MSR).",
+               "URL http://TODO"))
diff --git a/inst/extdata/mysql_sw_wo_code_words.txt b/inst/extdata/mysql_sw_wo_code_words.txt new file mode 100644 index 0000000..8e7585b --- /dev/null +++ b/inst/extdata/mysql_sw_wo_code_words.txt @@ -0,0 +1,531 @@ +a's +able +about +above +according +accordingly +across +actually +after +afterwards +again +against +ain't +all +allow +allows +almost +alone +along +already +also +although +always +am +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +aside +ask +asking +associated +at +available +away +awfully +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +c'mon +came +can +can't +cannot +cant +cause +causes +certain +certainly +changes +clearly +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +currently +definitely +described +despite +did +didn't +different +do +does +doesn't +doing +don't +done +down +downwards +during +each +edu +eg +eight +either +else +elsewhere +enough +entirely +especially +et +etc +even +ever +every +everybody +everyone +everything +everywhere +exactly +example +except +far +few +fifth +first +five +followed +following +follows +former +formerly +forth +four +from +further +furthermore +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +happens
+hardly +has +hasn't +have +haven't +having +he +he's +hello +help +hence +her +here +here's +hereafter +hereby +herein +hereupon +hers +herself +hi +him +himself +his +hither +hopefully +how +howbeit +however +i'd +i'll +i'm +i've +ie +if +ignored +immediate +in +inasmuch +inc +indeed +indicate +indicated +indicates +inner +insofar +instead +into +inward +is +isn't +it +it'd +it'll +it's +its +itself +just +keep +keeps +kept +know +known +knows +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +little +look +looking +looks +ltd +mainly +many +may +maybe +me +mean +meanwhile +merely +might +more +moreover +most +mostly +much +must +my +myself +name +namely +near +nearly +necessary +need +needs +neither +never +nevertheless +new +next +nine +no +nobody +non +none +noone +nor +normally +not +nothing +novel +now +nowhere +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +per +perhaps +placed +please +plus +possible +presumably +probably +provides +que +quite +rather +really +reasonably +regarding +regardless +regards +relatively +respectively +right +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +she +should +shouldn't +since +six +so +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +t's +take +taken +tell +tends +than +thank +thanks +thanx +that +that's +thats +the +their +theirs +them +themselves +then +thence +there +there's +thereafter +thereby +therefore +therein +theres +thereupon +these +they +they'd +they'll +they're +they've +think +third +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +to +together +too +took +toward +towards +tried +tries +truly +trying +twice +two +under +unfortunately +unless +unlikely +until +unto +up +upon +us +use +used +useful +uses +using +usually +value +various +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +we'll +we're +we've +welcome +well +went +were +weren't +what +what's +whatever +when +whence +whenever +where +where's +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +whither +who +who's +whoever +whole +whom +whose +why +will +willing +wish +with +within +without +won't +wonder +would +wouldn't +yes +yet +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves +zero diff --git a/man/Character3Grams.Rd b/man/Character3Grams.Rd new file mode 100644 index 0000000..1ac70c5 --- /dev/null +++ b/man/Character3Grams.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/features.R +\name{Character3Grams} +\alias{Character3Grams} +\title{Character 3-gram extraction.} +\usage{ +Character3Grams(text) +} +\arguments{ +\item{text}{The text.} +} +\value{ +A document term matrix (sparse Matrix). +} +\description{ +Computes the document term matrix of character 3-gram. 
+}
diff --git a/man/FeatureExtraction.Rd b/man/FeatureExtraction.Rd new file mode 100644 index 0000000..eb0541f --- /dev/null +++ b/man/FeatureExtraction.Rd @@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/features.R
+\name{FeatureExtraction}
+\alias{FeatureExtraction}
+\title{Feature extraction.}
+\usage{
+FeatureExtraction(text)
+}
+\arguments{
+\item{text}{The text.}
+}
+\value{
+A data.table with values of the 11 features.
+}
+\description{
+Computes a set of simple text-based features.
+}
+\details{
+The features computed are the following:
+\describe{
+  \item{\code{ratio.caps}}{The ratio of uppercase letters.}
+  \item{\code{ratio.specials}}{The ratio of special characters.}
+  \item{\code{ratio.numbers}}{The ratio of number characters.}
+  \item{\code{length.words}}{The average word length.}
+  \item{\code{stopwords}}{The ratio of English stopwords (using the first
+  tokenizer).}
+  \item{\code{stopwords2}}{The ratio of English stopwords (using the second
+  tokenizer).}
+  \item{\code{last.char.code}}{Boolean for the use of a code character at
+  the end of the text.}
+  \item{\code{last.char.nl}}{Boolean for the use of a natural language
+  (punctuation) character at the end of the text.}
+  \item{\code{first.3.chars.letters}}{Number of letters among the first
+  three non-whitespace characters.}
+  \item{\code{emoticons}}{Number of emoticons.}
+  \item{\code{first.char.at}}{Boolean for the use of the @ character at
+  the beginning of the line.}
+}
+}
diff --git a/man/NLoN.Rd b/man/NLoN.Rd new file mode 100644 index 0000000..f039d3a --- /dev/null +++ b/man/NLoN.Rd @@ -0,0 +1,71 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/NLoN.R, R/model.R
+\docType{package}
+\name{NLoN}
+\alias{NLoN}
+\alias{NLoN-package}
+\title{NLoN: Natural Language or Not}
+\usage{
+NLoN(data, lambda = NULL, type = "class", features = TriGramsAndFeatures,
+  ...)
+}
+\arguments{
+\item{data}{A data.frame containing the training and test data.}
+
+\item{lambda}{Lambda parameter to pass to
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{type}{Type of prediction made by
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{features}{A function computing the feature values (a matrix,
+list of numeric vectors or data.frame) or a list of functions
+computing individual feature values.}
+
+\item{...}{Additional parameters to pass to \code{NLoNModel}.}
+}
+\value{
+A vector of length \code{sum(is.na(data$response))} with
  the prediction of the test data.
+}
+\description{
+NLoN identifies whether text lines are natural language or not
+using a glmnet model with simple text features and character
+3-grams.
+
+Trains a NLoN model and returns predictions for the data without a
+response.
+}
+\details{
+The data.frame must contain a column \code{text} with both training
+and test data and a column \code{response} with the response value
+(factor with levels NL and Not). The response is NA for test data.
+}
+\examples{
+
+## Training data provided in the package.
+data(nlon.data)
+
+## Build a model with glmnet.
+model <- with(nlon.data, NLoNModel(text, rater2, TriGramsAndFeatures))
+
+## Use the model to predict new data.
+topredict <- c("This is natural language.", "not(natural, language);")
+NLoNPredict(model, topredict, 0.1, features=FeatureExtraction)
+
+## Train and predict in a single function call.
+NLoN(rbind(nlon.data[, list(text, response=rater2)],
+           list(text=topredict), fill=TRUE),
+     0.1, features=FeatureExtraction)
+
+}
+\seealso{
+\code{\link{NLoNModel}}
+
+\code{\link{NLoNPredict}}
+
+\code{\link[glmnet]{glmnet}}
+
+\code{\link[glmnet]{predict.glmnet}}
+}
diff --git a/man/NLoNModel.Rd b/man/NLoNModel.Rd new file mode 100644 index 0000000..af75cb0 --- /dev/null +++ b/man/NLoNModel.Rd @@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/model.R
+\name{NLoNModel}
+\alias{NLoNModel}
+\title{NLoN Model.}
+\usage{
+NLoNModel(text, response, features = TriGramsAndFeatures, alpha = 1, ...)
+}
+\arguments{
+\item{text}{A character vector containing the training text.}
+
+\item{response}{A factor response with levels NL (for natural
+language) and Not (for not natural language).}
+
+\item{features}{A function computing the feature values (a matrix,
+list of numeric vectors or data.frame) or a list of functions
+computing individual feature values.}
+
+\item{alpha}{The elasticnet mixing parameter used by
+\code{\link[glmnet]{glmnet}}.}
+
+\item{...}{Additional parameters to pass to
+\code{\link[glmnet]{glmnet}}.}
+}
+\value{
+A \code{\link[glmnet]{glmnet}} trained model.
+}
+\description{
+Train a NLoN model using glmnet.
+}
+\seealso{
+\code{\link[glmnet]{glmnet}}
+}
diff --git a/man/NLoNPredict.Rd b/man/NLoNPredict.Rd new file mode 100644 index 0000000..d429070 --- /dev/null +++ b/man/NLoNPredict.Rd @@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/model.R
+\name{NLoNPredict}
+\alias{NLoNPredict}
+\title{NLoN Prediction.}
+\usage{
+NLoNPredict(model, text, lambda = NULL, type = "class",
+  features = TriGramsAndFeatures)
+}
+\arguments{
+\item{model}{A glmnet model as returned by \code{\link{NLoNModel}}.}
+
+\item{text}{A character vector containing the text to predict.}
+
+\item{lambda}{Lambda parameter to pass to
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{type}{Type of prediction made by
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{features}{A function computing the feature values (a matrix,
+list of numeric vectors or data.frame) or a list of functions
+computing individual feature values.}
+}
+\value{
+The output of \code{\link[glmnet]{predict.glmnet}}.
+}
+\description{
+Predict whether text lines are natural language with a trained
+NLoN model.
+}
+\seealso{
+\code{\link[glmnet]{predict.glmnet}}
+}
diff --git a/man/TriGramsAndFeatures.Rd b/man/TriGramsAndFeatures.Rd new file mode 100644 index 0000000..f17f9cc --- /dev/null +++ b/man/TriGramsAndFeatures.Rd @@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/features.R
+\name{TriGramsAndFeatures}
+\alias{TriGramsAndFeatures}
+\title{3-grams and feature extraction.}
+\usage{
+TriGramsAndFeatures(text)
+}
+\arguments{
+\item{text}{The text.}
+}
+\value{
+A sparse Matrix with text features and 3-grams.
+}
+\description{
+Computes both character 3-grams and simple text features.
+}
+\seealso{
+\code{\link{Character3Grams}}
+
+\code{\link{FeatureExtraction}}
+}
diff --git a/man/features.Rd b/man/features.Rd new file mode 100644 index 0000000..7d6410a --- /dev/null +++ b/man/features.Rd @@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/features.R
+\docType{data}
+\name{features}
+\alias{features}
+\title{Features.}
+\format{An object of class \code{module} (inherits from \code{list}) of length 19.}
+\usage{
+features
+}
+\description{
+Module containing functions for individual simple text feature
+extraction.
+}
+\details{
+Most functions have a single \code{text} parameter. The module
+contains the following functions:
+
+\describe{
+  \item{\code{Stopwords}}{Number of stopwords. Uses two optional
+  parameters: \code{Tokenize}, the word tokenizer to use, and
+  \code{stopwords}, the list of stopwords to use.}
+  \item{\code{Tokenize1}}{First tokenizer available for
+  \code{Stopwords}.}
+  \item{\code{Tokenize2}}{Second tokenizer available for
+  \code{Stopwords}.}
+  \item{\code{StopwordsRatio1}}{Ratio of stopwords using \code{Tokenize1}.}
+  \item{\code{StopwordsRatio2}}{Ratio of stopwords using \code{Tokenize2}.}
+  \item{\code{Caps}}{Number of uppercase letters.}
+  \item{\code{CapsRatio}}{Ratio of uppercase letters.}
+  \item{\code{SpecialChars}}{Number of special characters.}
+  \item{\code{SpecialCharsRatio}}{Ratio of special characters.}
+  \item{\code{Numbers}}{Number of digit characters.}
+  \item{\code{NumbersRatio}}{Ratio of digit characters.}
+  \item{\code{Words}}{Number of words.}
+  \item{\code{AverageWordLength}}{Average word length.}
+  \item{\code{LastCharCode}}{Boolean for the use of a code character at the
+  end of the text.}
+  \item{\code{LastCharNL}}{Boolean for the use of a natural language
+  (punctuation) character at the end of the text.}
+  \item{\code{First3Chars}}{Returns the first three non-whitespace
+  characters.}
+  \item{\code{First3CharsLetters}}{Number of letters among the first three
+  non-whitespace characters.}
+  \item{\code{Emoticons}}{The number of emoticons.}
+  \item{\code{StartWithAt}}{Boolean for the use of @ at the start of
+  the text.}
+}
+}
+\keyword{datasets}
diff --git a/man/nlon.data.Rd b/man/nlon.data.Rd new file mode 100644 index 0000000..5187a3e --- /dev/null +++ b/man/nlon.data.Rd @@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{nlon.data}
+\alias{nlon.data}
+\title{Training data used in the NLoN paper.}
+\format{A data frame with 6000 rows and 4 columns:
+\describe{
+  \item{source}{source of the text (mozilla, kubernetes or lucene).}
+  \item{text}{line of text.}
+  \item{rater1}{response from the first rater.}
+  \item{rater2}{response from the second rater.}
+}}
+\source{
+\url{https://bugzilla.mozilla.org/}
+\url{https://kubernetes.io/}
+\url{https://lucene.apache.org/}
+}
+\usage{
+nlon.data
+}
+\description{
+A dataset containing 6000 lines of text (2000 each from the Mozilla
+Firefox, Kubernetes and Lucene datasets) alongside two response
+variables from two different raters.
+} +\keyword{datasets} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..7fae518 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(NLoN) + +test_check("NLoN") diff --git a/tests/testthat/test-features.R b/tests/testthat/test-features.R new file mode 100644 index 0000000..5304b55 --- /dev/null +++ b/tests/testthat/test-features.R @@ -0,0 +1,137 @@ +context("features") + +test_that("Stopwords works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "This isn't some text.", "This is.") + expect_equal(features$Stopwords(text, features$Tokenize1), + c(0, 0, 0, 0, 3, 3, 1)) + expect_equal(features$Stopwords(text, features$Tokenize2), + c(0, 0, 0, 0, 3, 3, 2)) +}) + +test_that("Caps works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "A", "ABC", "aaABCaa") + expect_equal(features$Caps(text), + c(0, 0, 0, 0, 1, 1, 3, 3)) +}) + +test_that("SpecialChars works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "x-y", "test;", "just some text") + expect_equal(features$SpecialChars(text), + c(0, 0, 0, 4, 1, 1, 1, 0)) +}) + +test_that("Numbers works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "There is 1 number.") + expect_equal(features$Numbers(text), + c(0, 0, 3, 0, 0, 1)) +}) + +test_that("Words works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "one-word.", "abc!def") + expect_equal(features$Words(text), + c(1, 1, 1, 1, 4, 1, 1)) +}) + +test_that("AverageWordLength works", { + text <- c("", "123", "123 123", "1", "!2c$", "abc def!", "1 234") + expect_equal(features$AverageWordLength(text), + c(0, 3, 3.5, 1, 4, 4, 2.5)) +}) + +test_that("LastCharCode works", { + text <- c("", "This is text.", "func(x, y);", "if (true) {", + "func()", ":-)", "Hello ;-)", ":)", ":-(", ":(", ":-))") + expect_equal(features$LastCharCode(text), + c(FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE)) +}) + +test_that("LastCharNL works", { + text <- c("", "abc", "1", ".", "!", "?", "? ", ":", ",", "Hello!!") + expect_equal(features$LastCharNL(text), + c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, + TRUE, TRUE)) +}) + +test_that("LastCharCode and LastCharNL are not both true", { + text <- c("", "This is text.", "func(x, y);", "if (true) {", + "func()", ":-)", "Hello ;-)", ":)", ":-(", ":(", ":-))", + "", "abc", "1", ".", "!", "?", "? 
", ":", ",", "Hello!!") + expect_true(all(!features$LastCharCode(text) | !features$LastCharNL(text))) +}) + +test_that("First3Chars works", { + text <- c("", "1", "1 2345", " 12345", " 12345") + expect_equal(features$First3Chars(text), + c("", "1", "123", "123", "123")) +}) + +test_that("First3CharsLetters works", { + text <- c("", "a", " a", " abc", "!@#abc", " a2#d", "Hello", + "123", "!@#", "H3ll0") + expect_equal(features$First3CharsLetters(text), + c(0, 1, 1, 3, 0, 1, 3, 0, 0, 2)) +}) + +test_that("Emoticons works", { + text <- c("", "123", "abc", ":--)", ":-)", "Hello ;-)", "\":-)\";", + ":)", ";)", ":-(", ":(", ":(:)", ":) :) :)", ":):(:") + expect_equal(features$Emoticons(text), + c(0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2)) +}) + +test_that("StartWithAt works", { + text <- c("", "abc", "123", "!@#", "@abc", "@", " @") + expect_equal(features$StartWithAt(text), + c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE)) +}) + +test_that("FeatureExtraction works", { + text <- c("", "abc", "123", "!@#") + expect_equal(names(FeatureExtraction(text)), + c("ratio.caps", "ratio.specials", "ratio.numbers", + "length.words", "stopwords", "stopwords2", + "last.char.code", "last.char.nl", + "first.3.chars.letters", "emoticons", "first.char.at")) +}) + +test_that("Character3Grams works", { + text <- c("", "abcd", "1234", "!@#$", "1234abcd") + res <- Character3Grams(text) + expect_equal(dim(res), c(5, 7)) + expect_equal(colnames(res), + c("0ab", "@#$", "00a", "!@#", "bcd", "abc", "000")) +}) + +test_that("ConvertFeatures works", { + text <- c("", "abcd", "1234", "!@#$", "1234abcd") + expect_true(inherits(ConvertFeatures(Character3Grams(text)), "Matrix")) + expect_true(inherits(ConvertFeatures(TriGramsAndFeatures(text)), "Matrix")) + expect_true(is.matrix(ConvertFeatures(as.list(FeatureExtraction(text))))) + expect_true(is.matrix(ConvertFeatures(FeatureExtraction(text)))) + expect_error(ConvertFeatures(list(1:9, 1:10))) + expect_error(ConvertFeatures(data.table(1:26, letters))) +}) + +test_that("ComputeFeatures works", { + text <- c("", "abcd", "1234", "!@#$", "1234abcd") + expect_true(inherits(ComputeFeatures(text, Character3Grams), "Matrix")) + expect_true(inherits(ComputeFeatures(text, TriGramsAndFeatures), "Matrix")) + expect_true(is.data.frame(ComputeFeatures(text, FeatureExtraction))) + + res <- ComputeFeatures(text, list(caps=features$Caps, + special.chars=features$SpecialChars, + numbers=features$Numbers, + words=features$Words)) + expect_true(is.matrix(res)) + expect_warning(ComputeFeatures(text, list(features$Caps, + features$SpecialChars, + features$Numbers, + features$Words))) + expect_error(ComputeFeatures(text, list(1, 2, 3))) +}) diff --git a/tests/testthat/test-old-features.R b/tests/testthat/test-old-features.R new file mode 100644 index 0000000..909b196 --- /dev/null +++ b/tests/testthat/test-old-features.R @@ -0,0 +1,51 @@ +context("Regression test for feature extraction") + +FeatureExtractionOld <- function(labeled) { + labeled <- copy(labeled) + myslq_sw <- system.file("extdata", "mysql_sw_wo_code_words.txt", + package="NLoN", mustWork=TRUE) + myslq_sw <- read.csv(myslq_sw, stringsAsFactors=FALSE, header=FALSE) + #Not same encoding as in cubernets. not used. 
+  labeled$length <- nchar(labeled$text)
+  labeled$ratio.caps <- plyr::ldply(stringr::str_match_all(labeled$text,"[A-Z]"),length)/labeled$length
+  labeled$ratio.specials <- plyr::ldply(stringr::str_match_all(labeled$text,"[^a-zA-Z\\d\\s]"),length)/labeled$length
+  labeled$ratio.numbers <- plyr::ldply(stringr::str_match_all(labeled$text,"[\\d]"),length)/labeled$length
+
+  labeled$length.words <- labeled$length /(plyr::ldply(stringr::str_match_all(labeled$text,"[\\s+]"),length) +1)
+  labeled$words <- plyr::ldply(stringr::str_match_all(labeled$text,"[\\s+]"),length) +1
+  # We count stopwords twice with two different tokenizers.
+  labeled[,stopwords:= sapply(text, function(x) sum(tolower(tokenizers::tokenize_regex(x, pattern = "\\s+", simplify = TRUE)) %in% mysql_sw$V1))/ labeled$words]
+  labeled[,stopwords2:= sapply(text, function(x) sum(tokenizers::tokenize_words(x, simplify = TRUE) %in% mysql_sw$V1))/ labeled$words]
+  # If the line ends with an emoticon this is not code :-) so we
+  # subtract. A lookahead regex (perl=TRUE) would also solve this.
+  labeled[,last.char.code:= sapply(text, function(x) max(c(sum(grep("\\)$|\\{$|;$", x)) - sum(grep(":-\\)$|;-\\)|:\\)$|;\\)$|:-\\($|:\\($", x)), 0)))]
+
+  labeled[,last.char.nl:= sapply(text, function(x) sum(grep("\\.$|\\!$|\\?$|:$|,$", x)))]
+  labeled$first.3.chars.letters <- plyr::ldply(stringr::str_match_all(substr(gsub("\\s", "", labeled$text), 1,3), "[a-zA-Z]"),length)
+  labeled$emoticons <- plyr::ldply(stringr::str_match_all(labeled$text,":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\("),length)
+  # Sanity check: the following should be empty.
+  #labeled[last.char.nl==1 & last.char.code==1]$text
+
+  # New features from Kubernetes.
+  # Starts with @ sign.
+  labeled[,first.char.at:= sapply(text, function(x) sum(grep("^@", x)))]
+  labeled
+}
+
+test_that("FeatureExtraction gives same results as FeatureExtractionOld", {
+  skip_on_cran()
+  data(nlon.data)
+  res <- FeatureExtraction(nlon.data$text)
+  res.old <- FeatureExtractionOld(nlon.data)
+
+  expect_equal(res$ratio.caps, res.old$ratio.caps)
+  expect_equal(res$ratio.specials, res.old$ratio.specials)
+  expect_equal(res$ratio.numbers, res.old$ratio.numbers)
+  expect_equal(res$length.words, res.old$length.words)
+  expect_equal(res$stopwords, res.old$stopwords)
+  expect_equal(res$stopwords2, res.old$stopwords2)
+  expect_equal(res$last.char.code, res.old$last.char.code)
+  expect_equal(res$last.char.nl, res.old$last.char.nl)
+  expect_equal(res$first.3.chars.letters, res.old$first.3.chars.letters)
+  expect_equal(res$emoticons, res.old$emoticons)
+  expect_equal(res$first.char.at, res.old$first.char.at)
+})
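R/evaluation.R is added empty in this change. Until it is filled in, a minimal hold-out evaluation of the shipped pipeline can be sketched as follows; the 80/20 split, the seed, and lambda=0.1 are arbitrary illustrative choices, not values from the NLoN paper:

library(NLoN)
data(nlon.data)
set.seed(1)
idx <- sample(nrow(nlon.data), 0.8 * nrow(nlon.data))
model <- with(nlon.data[idx], NLoNModel(text, rater2))
pred <- NLoNPredict(model, nlon.data[-idx, text], lambda=0.1)
mean(as.vector(pred) == nlon.data[-idx, rater2])  # hold-out accuracy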