First release 0.1

maelick committed Mar 8, 2018
1 parent 01d2ff5 commit 5a18130
Showing 23 changed files with 1,453 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,2 @@
data-raw
^\.travis\.yml$
5 changes: 5 additions & 0 deletions .travis.yml
@@ -0,0 +1,5 @@
# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r

language: R
sudo: false
cache: packages
17 changes: 17 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,17 @@
Package: NLoN
Type: Package
Title: Natural Language or Not
Version: 0.1.0
Date: 2018-03-08
Author: Mika Mäntylä <mika.mantyla@oulu.fi>, Fabio Calefato
<fabio.calefato@uniba.it>, Maëlick Claes <himself@maelick.net>
Maintainer: Maëlick Claes <himself@maelick.net>
Description: Identify whether text lines are natural language or not
using machine learning.
Depends: data.table, R (>= 3.0)
Imports: text2vec, tokenizers, glmnet, Matrix, modules, stats, stringr
License: GPL-3
Encoding: UTF-8
Remotes: M3SOulu/NLoN
Suggests: testthat, plyr
RoxygenNote: 6.0.1
11 changes: 11 additions & 0 deletions NAMESPACE
@@ -0,0 +1,11 @@
# Generated by roxygen2: do not edit by hand

export(Character3Grams)
export(FeatureExtraction)
export(NLoN)
export(NLoNModel)
export(NLoNPredict)
export(TriGramsAndFeatures)
export(features)
import(data.table)
importFrom(stats,predict)
28 changes: 28 additions & 0 deletions R/NLoN.R
@@ -0,0 +1,28 @@
#' NLoN: Natural Language or Not
#'
#' NLoN identifies whether text lines are natural language or not
#' using a glmnet model with simple text features and character
#' 3-grams.
#'
#' @examples
#'
#' ## Training data provided in the package.
#' data(nlon.data)
#'
#' ## Build a model with glmnet
#' model <- with(nlon.data, NLoNModel(text, rater2, TriGramsAndFeatures))
#'
#' ## Use the model to predict new data.
#' topredict <- c("This is natural language.", "not(natural, language);")
#' NLoNPredict(model, topredict, 0.1, features=FeatureExtraction)
#'
#' ## Train and predict in a single function call.
#' NLoN(rbind(nlon.data[, list(text, response=rater2)],
#' list(text=topredict), fill=TRUE),
#' 0.1, features=FeatureExtraction)
#'
#' @docType package
#' @name NLoN
#' @import data.table
#' @importFrom stats predict
NULL
17 changes: 17 additions & 0 deletions R/data.R
@@ -0,0 +1,17 @@
#' Training data used in the NLoN paper.
#'
#' A dataset containing 2000 lines of text each from the Mozilla
#' Firefox, Lucene and Kubernetes projects, alongside two response
#' variables from two different raters.
#'
#' @format A data frame with 6000 rows and 4 columns:
#' \describe{
#' \item{source}{source of the text (mozilla, kubernetes or lucene).}
#' \item{text}{line of text.}
#' \item{rater1}{response from the first rater.}
#' \item{rater2}{response from the second rater.}
#' }
#' @source \url{https://bugzilla.mozilla.org/}
#' \url{https://kubernetes.io/}
#' \url{https://lucene.apache.org/}
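#' @examples
#' ## A quick look at the data (minimal sketch):
#' data(nlon.data)
#' str(nlon.data)
#' table(nlon.data$source)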
"nlon.data"
Empty file added R/evaluation.R
Empty file.
255 changes: 255 additions & 0 deletions R/features.R
@@ -0,0 +1,255 @@
mysql.stopwords <- system.file("extdata", "mysql_sw_wo_code_words.txt",
package="NLoN", mustWork=TRUE)
mysql.stopwords <- read.csv(mysql.stopwords, stringsAsFactors=FALSE,
header=FALSE)$V1

## emojis <- system.file("extdata", "emojis.csv",
## package="NLoN", mustWork=TRUE)
## emojis <- "data/emojis.csv"
## emojis <- fread(emojis)

ConvertFeatures <- function(data) {
## Make sure that the feature data is a matrix or Matrix object.
## Converts a list into a data.table and then any data.frame into a matrix.
if (is.list(data)) {
if (length(unique(sapply(data, length))) == 1) {
data <- as.data.table(data)
} else stop("feature values don't have the same length")
}
if (is.data.frame(data)) {
data <- as.matrix(data)
}
if ((is.matrix(data) && is.numeric(data)) || inherits(data, "Matrix")) {
data
} else stop("feature values are not a numeric matrix")
}
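
## For example, ConvertFeatures(list(a=c(1, 0), b=c(0, 1))) yields a
## 2x2 numeric matrix; feature vectors of unequal length or
## non-numeric values raise an error.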

ComputeFeatures <- function(text, features) {
## Compute features. If features is a function, it will simply be
## applied on the text (and must return a list, data.frame, matrix
## or Matrix of numeric values). If features is a list of functions,
## do a sapply of the functions which must all return a numeric
## vector of the same length as text.
if (is.function(features)) {
data <- features(text)
} else if (is.list(features) && all(sapply(features, is.function))) {
if (is.null(names(features))) {
warning("features is a list of functions without names")
}
data <- sapply(features, function(f) f(text))
} else stop("features must be a function or a list of functions")
}
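
## For example, with a named list of single-feature functions:
##   ComputeFeatures(c("abc", "DEF"),
##                   list(caps=function(x) nchar(gsub("[^A-Z]", "", x))))
## returns a two-row matrix with a single "caps" column (values 0 and 3).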

#' Feature extraction.
#'
#' Computes a set of simple text-based features.
#'
#' The features computed are the following:
#' \describe{
#' \item{\code{ratio.caps}}{The ratio of uppercase letters.}
#' \item{\code{ratio.specials}}{The ratio of special characters.}
#' \item{\code{ratio.numbers}}{The ratio of number characters.}
#' \item{\code{length.words}}{The average word length.}
#' \item{\code{stopwords}}{The ratio of English stopwords (using the
#' first tokenizer).}
#' \item{\code{stopwords2}}{The ratio of English stopwords (using the
#' second tokenizer).}
#' \item{\code{last.char.nl}}{Boolean for the presence of a natural
#' language punctuation character at the end of the text.}
#' \item{\code{last.char.code}}{Boolean for the presence of a code
#' character at the end of the text.}
#' \item{\code{first.3.chars.letters}}{Number of letters among the
#' first three characters.}
#' \item{\code{emoticons}}{Number of emoticons.}
#' \item{\code{first.char.at}}{Boolean for the use of the @ character
#' at the beginning of the line.}
#' }
#'
#' @param text The text.
#' @return A data.table with values of the 11 features.
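#'
#' @examples
#' ## A minimal sketch of the output (two input lines, 11 columns):
#' FeatureExtraction(c("This is natural language.", "not(natural, language);"))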
#' @export
FeatureExtraction <- function(text) {
data <- data.table(text)
features <- list(ratio.caps=features$CapsRatio,
ratio.specials=features$SpecialCharsRatio,
ratio.numbers=features$NumbersRatio,
length.words=features$AverageWordLength,
stopwords=features$StopwordsRatio1,
stopwords2=features$StopwordsRatio2,
last.char.code=features$LastCharCode,
last.char.nl=features$LastCharNL,
first.3.chars.letters=features$First3CharsLetters,
emoticons=features$Emoticons,
first.char.at=features$StartWithAt)
as.data.table(ComputeFeatures(text, features))
}

#' Character 3-gram extraction.
#'
#' Computes the document term matrix of character 3-gram.
#'
#' @param text The text.
#' @return A document term matrix (sparse Matrix).
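#'
#' @examples
#' ## A minimal sketch: one row per input line, one column per 3-gram.
#' dtm <- Character3Grams(c("hello world", "foo(bar);"))
#' dim(dtm)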
#' @export
Character3Grams <- function(text) {
Preprocessor <- function(x) {
gsub("[0-9]", "0", gsub("\\\032", "", x))
}
Tokenizer <- function(x) {
tokenizers::tokenize_character_shingles(x, n=3, strip_non_alphanum=FALSE,
lowercase=TRUE)
}
it <- text2vec::itoken(text, tokenizer=Tokenizer,
preprocessor=Preprocessor,
progressbar=TRUE)
vocab <- text2vec::create_vocabulary(it)
vectorizer <- text2vec::vocab_vectorizer(vocab)
text2vec::create_dtm(it, vectorizer)
}

#' 3-grams and feature extraction.
#'
#' Computes both 3-gram and simple text features.
#'
#' @param text The text.
#' @return A sparse Matrix with text features and 3-gram.
#' @seealso \code{\link{Character3Grams}}
#' @seealso \code{\link{FeatureExtraction}}
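#'
#' @examples
#' ## A minimal sketch: 3-gram columns followed by the 11 simple features.
#' x <- TriGramsAndFeatures(c("This is natural language.", "not(natural, language);"))
#' dim(x)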
#' @export
TriGramsAndFeatures <- function(text) {
cbind(Character3Grams(text), as.matrix(FeatureExtraction(text)))
}

#' Features.
#'
#' Module containing functions for individual simple text feature
#' extraction.
#'
#' Most functions have a single \code{text} parameter. The module
#' contains the following functions:
#'
#' \describe{
#' \item{\code{Stopwords}}{Number of stopwords. Uses two optional
#' parameters: \code{Tokenize} which is the word tokenizer to use
#' and \code{stopwords} which is the list of stopwords to use.}
#' \item{\code{Tokenize1}}{First tokenizer available for
#' \code{Stopwords}.}
#' \item{\code{Tokenize2}}{Second tokenizer available for
#' \code{Stopwords}.}
#' \item{\code{StopwordsRatio1}}{Ratio of stopwords using \code{Tokenize1}.}
#' \item{\code{StopwordsRatio2}}{Ratio of stopwords using \code{Tokenize2}.}
#' \item{\code{Caps}}{Number of uppercase letters.}
#' \item{\code{CapsRatio}}{Ratio of uppercase letters.}
#' \item{\code{SpecialChars}}{Number of special characters.}
#' \item{\code{SpecialCharsRatio}}{Ratio of special characters.}
#' \item{\code{Numbers}}{Number of digit characters.}
#' \item{\code{NumbersRatio}}{Ratio of digit characters.}
#' \item{\code{Words}}{Number of words.}
#' \item{\code{AverageWordLength}}{Average word length.}
#' \item{\code{LastCharCode}}{Boolean for the use of a code character at the
#' end of the text.}
#' \item{\code{LastCharNL}}{Boolean for the use of a natural language
#' punctuation character at the end of the text.}
#' \item{\code{First3Chars}}{Returns the first three non-whitespace
#' characters.}
#' \item{\code{First3CharsLetters}}{The number of letters among the
#' first three non-whitespace characters.}
#' \item{\code{Emoticons}}{The number of emoticons.}
#' \item{\code{StartWithAt}}{Boolean for the use of @ at the start of
#' the text.}
#' }
#'
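#' @examples
#' ## A minimal sketch of calling individual functions of the module:
#' features$Caps("Hello World")        ## 2 uppercase letters
#' features$LastCharNL("Is this NL?")  ## TRUE: ends with punctuation
#'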
#' @export
features <- modules::module({
modules::export("^[^.]")

.CountRegexMatches <- function(text, re) {
## Count the number of matches of a regex.
sapply(stringr::str_match_all(text, re), length)
}

Tokenize1 <- function(text) {
## Needs to be fixed: add punctuation as separator (at least ".").
## lapply(tolower(text), tokenize_regex, pattern="\\s+", simplify=TRUE)
lapply(tolower(text), tokenizers::tokenize_regex,
pattern="\\s+", simplify=TRUE)
}

Tokenize2 <- function(text) {
tokenizers::tokenize_words(text, simplify=TRUE)
}

Stopwords <- function(text, Tokenize=Tokenize1, stopwords=mysql.stopwords) {
## Computes the number of stopwords present in text based on a given
## Tokenize function
sapply(Tokenize(text), function(words) sum(words %in% stopwords))
}

Caps <- function(text) {
## Number of uppercase characters
.CountRegexMatches(text, "[A-Z]")
}

SpecialChars <- function(text) {
## Number of special characters
.CountRegexMatches(text, "[^a-zA-Z\\d\\s]")
}

Numbers <- function(text) {
## Number of digits
.CountRegexMatches(text, "[\\d]")
}

CapsRatio <- function(text) Caps(text) / nchar(text)
SpecialCharsRatio <- function(text) SpecialChars(text) / nchar(text)
NumbersRatio <- function(text) Numbers(text) / nchar(text)
StopwordsRatio1 <- function(text) Stopwords(text, Tokenize1) / Words(text)
StopwordsRatio2 <- function(text) Stopwords(text, Tokenize2) / Words(text)

Words <- function(text) {
## Approximate number of words: one more than the number of
## whitespace (or literal "+") characters matched by the class below.
## .CountRegexMatches(text, "\\w")
.CountRegexMatches(text, "[\\s+]") + 1
}

AverageWordLength <- function(text) {
nchar(text) / Words(text)
}

LastCharCode <- function(text) {
## Boolean whether the last character is a code character.
## If the line ends with an emoticon, the closing parenthesis is not
## code, so those cases are excluded. Base R's default regex engine
## does not support lookahead, which would also solve this.
(!grepl("(:-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\()$", text) &
grepl("[){;]$", text))
}
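
## For example, LastCharCode("foo();") is TRUE (ends with ";"),
## while LastCharCode("nice :-)") is FALSE because the closing
## parenthesis belongs to an emoticon.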

LastCharNL <- function(text) {
## Last character is related to natural language (punctuation)
grepl("\\.$|\\!$|\\?$|:$|,$", text)
}

First3Chars <- function(text) {
## First three characters (after stripping whitespace).
substr(gsub("\\s", "", text), 1, 3)
}

First3CharsLetters <- function(text) {
## Number of letters among the first three non-whitespace characters.
.CountRegexMatches(First3Chars(text), "[a-zA-Z]")
}

Emoticons <- function(text) {
## Number of emoticons
## Using a larger list of emoticons, e.g. one built for
## SentiStrength, might cause more false positives as some of them
## are similar to elements that appear in code.
.CountRegexMatches(text, ":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\(")
}

StartWithAt <- function(text) {
## TRUE if text starts with @ symbol
grepl("^@", text)
## sapply(text, function(x) sum(grep("^@", x)))
}
})
