diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..4051aec --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@
+^data-raw$
+^\.travis\.yml$
diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8d139ac --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@
+# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
+
+language: R
+sudo: false
+cache: packages
diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..c9caf8a --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,17 @@
+Package: NLoN
+Type: Package
+Title: Natural Language or Not
+Version: 0.1.0
+Date: 2018-03-08
+Author: Mika Mäntylä, Fabio Calefato, Maëlick Claes
+Maintainer: Maëlick Claes
+Description: Identify whether text lines are natural language or not
+    using machine learning.
+Depends: data.table, R (>= 3.0)
+Imports: text2vec, tokenizers, glmnet, Matrix, modules, stats, stringr
+License: GPL-3
+Encoding: UTF-8
+Remotes: M3SOulu/NLoN
+Suggests: testthat, plyr
+RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..7e19881 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,11 @@
+# Generated by roxygen2: do not edit by hand
+
+export(Character3Grams)
+export(FeatureExtraction)
+export(NLoN)
+export(NLoNModel)
+export(NLoNPredict)
+export(TriGramsAndFeatures)
+export(features)
+import(data.table)
+importFrom(stats,predict)
diff --git a/R/NLoN.R b/R/NLoN.R new file mode 100644 index 0000000..44aad28 --- /dev/null +++ b/R/NLoN.R @@ -0,0 +1,28 @@
+#' NLoN: Natural Language or Not
+#'
+#' NLoN identifies whether text lines are natural language or not
+#' using a glmnet model with simple text features and character
+#' 3-grams.
+#'
+#' @examples
+#'
+#' ## Training data provided in the package.
+#' data(nlon.data)
+#'
+#' ## Build a model with glmnet.
+#' model <- with(nlon.data, NLoNModel(text, rater2, TriGramsAndFeatures))
+#'
+#' ## Use the model to predict new data.
+#' topredict <- c("This is natural language.", "not(natural, language);")
+#' NLoNPredict(model, topredict, 0.1, features=FeatureExtraction)
+#'
+#' ## Train and predict in a single function call.
+#' NLoN(rbind(nlon.data[, list(text, response=rater2)],
+#'            list(text=topredict), fill=TRUE),
+#'      0.1, features=FeatureExtraction)
+#'
+#' @docType package
+#' @name NLoN
+#' @import data.table
+#' @importFrom stats predict
+NULL
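A quick way to sanity-check the bundled training data against the documentation in R/data.R that follows. This is an illustrative sketch; the 2000-lines-per-source split is implied by the docs rather than asserted by any test:

library(NLoN)
data(nlon.data)
str(nlon.data)                          # columns: source, text, rater1, rater2
table(nlon.data$source)                 # expect 2000 lines per source
with(nlon.data, table(rater1, rater2))  # inter-rater agreement counts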
diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..3134a5a --- /dev/null +++ b/R/data.R @@ -0,0 +1,17 @@
+#' Training data used in the NLoN paper.
+#'
+#' A dataset containing 6000 lines of text (2000 each from the Mozilla
+#' Firefox, Kubernetes and Lucene datasets) alongside two response
+#' variables from two different raters.
+#'
+#' @format A data frame with 6000 rows and 4 columns:
+#' \describe{
+#'   \item{source}{source of the text (mozilla, kubernetes or lucene).}
+#'   \item{text}{line of text.}
+#'   \item{rater1}{response from the first rater.}
+#'   \item{rater2}{response from the second rater.}
+#' }
+#' @source \url{https://bugzilla.mozilla.org/}
+#' \url{https://kubernetes.io/}
+#' \url{https://lucene.apache.org/}
+"nlon.data"
diff --git a/R/evaluation.R b/R/evaluation.R new file mode 100644 index 0000000..e69de29
diff --git a/R/features.R b/R/features.R new file mode 100644 index 0000000..5e8dd47 --- /dev/null +++ b/R/features.R @@ -0,0 +1,255 @@
+mysql.stopwords <- system.file("extdata", "mysql_sw_wo_code_words.txt",
+                               package="NLoN", mustWork=TRUE)
+mysql.stopwords <- read.csv(mysql.stopwords, stringsAsFactors=FALSE,
+                            header=FALSE)$V1
+
+## emojis <- system.file("extdata", "emojis.csv",
+##                       package="NLoN", mustWork=TRUE)
+## emojis <- "data/emojis.csv"
+## emojis <- fread(emojis)
+
+ConvertFeatures <- function(data) {
+  ## Make sure that the feature data is a matrix or Matrix object.
+  ## Converts a list into a data.table and a data.frame into a matrix.
+  if (is.list(data)) {
+    if (length(unique(sapply(data, length))) == 1) {
+      data <- as.data.table(data)
+    } else stop("feature values don't have the same length")
+  }
+  if (is.data.frame(data)) {
+    data <- as.matrix(data)
+  }
+  if ((is.matrix(data) && is.numeric(data)) || inherits(data, "Matrix")) {
+    data
+  } else stop("feature values are not a numeric matrix")
+}
+
+ComputeFeatures <- function(text, features) {
+  ## Compute features. If features is a function, it is simply applied
+  ## to the text (and must return a list, data.frame, matrix or Matrix
+  ## of numeric values). If features is a list of functions, sapply the
+  ## functions, each of which must return a numeric vector of the same
+  ## length as text.
+  if (is.function(features)) {
+    data <- features(text)
+  } else if (is.list(features) && all(sapply(features, is.function))) {
+    if (is.null(names(features))) {
+      warning("features is a list of functions without names")
+    }
+    data <- sapply(features, function(f) f(text))
+    ## sapply simplifies to a plain vector when text has length one;
+    ## keep the one-row matrix shape in that case.
+    if (!is.matrix(data)) {
+      data <- matrix(data, nrow=1, dimnames=list(NULL, names(data)))
+    }
+  } else stop("features must be a function or a list of functions")
+  data
+}
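ComputeFeatures and ConvertFeatures are internal, but their contract matters to anyone passing a custom features= argument to the exported functions: either one function returning a matrix-like object, or a named list of per-feature functions each returning one numeric vector per line of text. A minimal sketch (using ::: since neither helper is exported):

library(NLoN)
text <- c("Hello world.", "x <- 1; y <- 2")
feats <- list(caps=features$Caps, words=features$Words)
m <- NLoN:::ConvertFeatures(NLoN:::ComputeFeatures(text, feats))
dim(m)  # 2 rows (one per line of text), 2 columns (one per feature)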
+#' Feature extraction.
+#'
+#' Computes a set of simple text-based features.
+#'
+#' The features computed are the following:
+#' \describe{
+#'   \item{\code{ratio.caps}}{The ratio of uppercase letters.}
+#'   \item{\code{ratio.specials}}{The ratio of special characters.}
+#'   \item{\code{ratio.numbers}}{The ratio of number characters.}
+#'   \item{\code{length.words}}{The average word length.}
+#'   \item{\code{stopwords}}{The ratio of English stopwords (using the first
+#'   tokenizer).}
+#'   \item{\code{stopwords2}}{The ratio of English stopwords (using the second
+#'   tokenizer).}
+#'   \item{\code{last.char.code}}{Boolean for the use of a code character at
+#'   the end of the text.}
+#'   \item{\code{last.char.nl}}{Boolean for the use of a natural language
+#'   (punctuation) character at the end of the text.}
+#'   \item{\code{first.3.chars.letters}}{Number of letters among the first
+#'   three non-whitespace characters.}
+#'   \item{\code{emoticons}}{Number of emoticons.}
+#'   \item{\code{first.char.at}}{Boolean for the use of the @ character at
+#'   the beginning of the line.}
+#' }
+#'
+#' @param text The text.
+#' @return A data.table with values of the 11 features.
+#' @export
+FeatureExtraction <- function(text) {
+  ## Named list of feature functions; called flist to avoid shadowing
+  ## the features module it draws from.
+  flist <- list(ratio.caps=features$CapsRatio,
+                ratio.specials=features$SpecialCharsRatio,
+                ratio.numbers=features$NumbersRatio,
+                length.words=features$AverageWordLength,
+                stopwords=features$StopwordsRatio1,
+                stopwords2=features$StopwordsRatio2,
+                last.char.code=features$LastCharCode,
+                last.char.nl=features$LastCharNL,
+                first.3.chars.letters=features$First3CharsLetters,
+                emoticons=features$Emoticons,
+                first.char.at=features$StartWithAt)
+  as.data.table(ComputeFeatures(text, flist))
+}
+
+#' Character 3-gram extraction.
+#'
+#' Computes the document term matrix of character 3-grams.
+#'
+#' @param text The text.
+#' @return A document term matrix (sparse Matrix).
+#' @export
+Character3Grams <- function(text) {
+  Preprocessor <- function(x) {
+    ## Map every digit to 0 and strip the \032 (substitute) control character.
+    gsub("[0-9]", "0", gsub("\\\032", "", x))
+  }
+  Tokenizer <- function(x) {
+    tokenizers::tokenize_character_shingles(x, n=3, strip_non_alphanum=FALSE,
+                                            lowercase=TRUE)
+  }
+  it <- text2vec::itoken(text, tokenizer=Tokenizer,
+                         preprocessor=Preprocessor,
+                         progressbar=TRUE)
+  vocab <- text2vec::create_vocabulary(it)
+  vectorizer <- text2vec::vocab_vectorizer(vocab)
+  text2vec::create_dtm(it, vectorizer)
+}
+
+#' 3-grams and feature extraction.
+#'
+#' Computes both character 3-grams and simple text features.
+#'
+#' @param text The text.
+#' @return A sparse Matrix with text features and 3-grams.
+#' @seealso \code{\link{Character3Grams}}
+#' @seealso \code{\link{FeatureExtraction}}
+#' @export
+TriGramsAndFeatures <- function(text) {
+  cbind(Character3Grams(text), as.matrix(FeatureExtraction(text)))
+}
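A quick way to see what Character3Grams produces. The exact vocabulary and column ordering are determined by text2vec, so treat the column set as illustrative:

library(NLoN)
dtm <- Character3Grams(c("abc abd", "abc!"))
dim(dtm)    # 2 documents x number of distinct 3-grams
class(dtm)  # sparse dgCMatrix from the Matrix package
x <- TriGramsAndFeatures("for (i in 1:10) print(i)")
ncol(x)     # 3-gram columns plus the 11 hand-crafted features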
+#' Features.
+#'
+#' Module containing functions for individual simple text feature
+#' extraction.
+#'
+#' Most functions have a single \code{text} parameter. The module
+#' contains the following functions:
+#'
+#' \describe{
+#'   \item{\code{Stopwords}}{Number of stopwords. Uses two optional
+#'   parameters: \code{Tokenize}, the word tokenizer to use, and
+#'   \code{stopwords}, the list of stopwords to use.}
+#'   \item{\code{Tokenize1}}{First tokenizer available for
+#'   \code{Stopwords}.}
+#'   \item{\code{Tokenize2}}{Second tokenizer available for
+#'   \code{Stopwords}.}
+#'   \item{\code{StopwordsRatio1}}{Ratio of stopwords using \code{Tokenize1}.}
+#'   \item{\code{StopwordsRatio2}}{Ratio of stopwords using \code{Tokenize2}.}
+#'   \item{\code{Caps}}{Number of uppercase letters.}
+#'   \item{\code{CapsRatio}}{Ratio of uppercase letters.}
+#'   \item{\code{SpecialChars}}{Number of special characters.}
+#'   \item{\code{SpecialCharsRatio}}{Ratio of special characters.}
+#'   \item{\code{Numbers}}{Number of digit characters.}
+#'   \item{\code{NumbersRatio}}{Ratio of digit characters.}
+#'   \item{\code{Words}}{Number of words.}
+#'   \item{\code{AverageWordLength}}{Average word length.}
+#'   \item{\code{LastCharCode}}{Boolean for the use of a code character at the
+#'   end of the text.}
+#'   \item{\code{LastCharNL}}{Boolean for the use of a natural language
+#'   (punctuation) character at the end of the text.}
+#'   \item{\code{First3Chars}}{Returns the first three non-whitespace
+#'   characters.}
+#'   \item{\code{First3CharsLetters}}{Number of letters among the first three
+#'   non-whitespace characters.}
+#'   \item{\code{Emoticons}}{The number of emoticons.}
+#'   \item{\code{StartWithAt}}{Boolean for the use of @ at the start of
+#'   the text.}
+#' }
+#'
+#' @export
+features <- modules::module({
+  modules::export("^[^.]")
+
+  .CountRegexMatches <- function(text, re) {
+    ## Count the number of matches of a regex.
+    sapply(stringr::str_match_all(text, re), length)
+  }
+
+  Tokenize1 <- function(text) {
+    ## Needs fixing: also split on punctuation (at least ".").
+    lapply(tolower(text), tokenizers::tokenize_regex,
+           pattern="\\s+", simplify=TRUE)
+  }
+
+  Tokenize2 <- function(text) {
+    tokenizers::tokenize_words(text, simplify=TRUE)
+  }
+
+  Stopwords <- function(text, Tokenize=Tokenize1, stopwords=mysql.stopwords) {
+    ## Compute the number of stopwords present in the text using the
+    ## given Tokenize function.
+    sapply(Tokenize(text), function(words) sum(words %in% stopwords))
+  }
+
+  Caps <- function(text) {
+    ## Number of uppercase characters.
+    .CountRegexMatches(text, "[A-Z]")
+  }
+
+  SpecialChars <- function(text) {
+    ## Number of special characters.
+    .CountRegexMatches(text, "[^a-zA-Z\\d\\s]")
+  }
+
+  Numbers <- function(text) {
+    ## Number of digits.
+    .CountRegexMatches(text, "[\\d]")
+  }
+
+  CapsRatio <- function(text) Caps(text) / nchar(text)
+  SpecialCharsRatio <- function(text) SpecialChars(text) / nchar(text)
+  NumbersRatio <- function(text) Numbers(text) / nchar(text)
+  StopwordsRatio1 <- function(text) Stopwords(text, Tokenize1) / Words(text)
+  StopwordsRatio2 <- function(text) Stopwords(text, Tokenize2) / Words(text)
+
+  Words <- function(text) {
+    ## Number of words, counted as whitespace separators plus one (kept
+    ## as-is to stay consistent with the original NLoN features).
+    .CountRegexMatches(text, "[\\s+]") + 1
+  }
+
+  AverageWordLength <- function(text) {
+    nchar(text) / Words(text)
+  }
+
+  LastCharCode <- function(text) {
+    ## TRUE when the last character looks like code. Lines ending with
+    ## an emoticon are excluded first :-) (a lookahead regex with
+    ## perl=TRUE could express this in a single pattern).
+    (!grepl("(:-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\()$", text) &
+     grepl("[){;]$", text))
+  }
+
+  LastCharNL <- function(text) {
+    ## Last character is natural language punctuation.
+    grepl("\\.$|\\!$|\\?$|:$|,$", text)
+  }
+
+  First3Chars <- function(text) {
+    ## First three characters (after stripping whitespace).
+    substr(gsub("\\s", "", text), 1, 3)
+  }
+
+  First3CharsLetters <- function(text) {
+    ## Number of letters among the first three characters.
+    .CountRegexMatches(First3Chars(text), "[a-zA-Z]")
+  }
+
+  Emoticons <- function(text) {
+    ## Number of emoticons. Using a larger list of emoticons, e.g. one
+    ## built for SentiStrength, might cause more false positives as
+    ## some of them resemble elements that appear in code.
+    .CountRegexMatches(text, ":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\(")
+  }
+
+  StartWithAt <- function(text) {
+    ## TRUE if the text starts with the @ symbol.
+    grepl("^@", text)
+  }
+})
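All of the module functions are vectorized over text, so they are easy to spot-check interactively. The expected values below follow from the regexes in the module above (a sketch, not output from the package's test suite):

library(NLoN)
features$CapsRatio(c("ABC", "abcd"))     # 1.0 0.0
features$LastCharCode("foo(bar);")       # TRUE
features$LastCharCode("I am happy :-)")  # FALSE: emoticon, not code
features$Emoticons(":) :( :-)")          # 3
features$StartWithAt("@reviewer ping")   # TRUE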
diff --git a/R/model.R b/R/model.R new file mode 100644 index 0000000..7fc94d5 --- /dev/null +++ b/R/model.R @@ -0,0 +1,84 @@
+#' NLoN Model.
+#'
+#' Train a NLoN model using glmnet.
+#'
+#' @param text A character vector containing the training text.
+#' @param response A factor response with levels NL (for natural
+#' language) and Not (for not natural language).
+#' @param features A function computing the feature values (a matrix,
+#' list of numeric vectors or data.frame) or a list of functions
+#' computing individual feature values.
+#' @param alpha The elasticnet mixing parameter used by
+#' \code{\link[glmnet]{glmnet}}.
+#' @param ... Additional parameters to pass to
+#' \code{\link[glmnet]{glmnet}}.
+#' @return A \code{\link[glmnet]{glmnet}} trained model.
+#' @seealso \code{\link[glmnet]{glmnet}}
+#' @export
+NLoNModel <- function(text, response, features=TriGramsAndFeatures,
+                      alpha=1, ...) {
+  data <- ComputeFeatures(text, features)
+  glmnet::glmnet(x=ConvertFeatures(data), y=response, family="binomial",
+                 alpha=alpha, ...)
+}
+
+#' NLoN Prediction.
+#'
+#' Predict whether text lines are natural language with a trained
+#' NLoN model.
+#'
+#' @param model A glmnet model as returned by \code{\link{NLoNModel}}.
+#' @param text A character vector containing the text to predict.
+#' @param lambda Lambda parameter to pass to
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param type Type of prediction made by
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param features A function computing the feature values (a matrix,
+#' list of numeric vectors or data.frame) or a list of functions
+#' computing individual feature values. Must match the features used
+#' to train \code{model}.
+#' @return The output of \code{\link[glmnet]{predict.glmnet}}.
+#' @seealso \code{\link[glmnet]{predict.glmnet}}
+#' @export
+NLoNPredict <- function(model, text, lambda=NULL, type="class",
+                        features=TriGramsAndFeatures) {
+  data <- ConvertFeatures(ComputeFeatures(text, features))
+  ## Add all-zero columns for 3-grams seen during training but absent
+  ## from the new text, then reorder the columns to match the model.
+  missing <- setdiff(rownames(model$beta), colnames(data))
+  if (length(missing)) {
+    data <- cbind(Matrix::sparseMatrix(i=integer(0), j=integer(0),
+                                       dims=c(nrow(data), length(missing)),
+                                       dimnames=list(NULL, missing)), data)
+  }
+  data <- data[, rownames(model$beta)]
+  predict(model, data, s=lambda, type=type)
+}
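NLoNModel is a thin wrapper around glmnet, so standard glmnet tooling applies. In particular, the lambda=0.1 used in the examples is illustrative; cross-validation is the principled way to choose it. A sketch using cv.glmnet with its default parameters (the internal helpers need ::: since they are not exported):

library(NLoN)
data(nlon.data)
model <- with(nlon.data, NLoNModel(text, rater2))
x <- NLoN:::ConvertFeatures(NLoN:::ComputeFeatures(nlon.data$text,
                                                   TriGramsAndFeatures))
cv <- glmnet::cv.glmnet(x, nlon.data$rater2, family="binomial")
NLoNPredict(model, c("Thanks, merged!", "stack <- c(stack, x)"),
            lambda=cv$lambda.min)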
+#' NLoN Training and Prediction.
+#'
+#' Trains a NLoN model and returns predictions for the data without a
+#' response.
+#'
+#' The data.frame must contain a column \code{text} with both training
+#' and test data and a column \code{response} with the response value
+#' (factor with levels NL and Not). The response is NA for test data.
+#'
+#' @param data A data.frame containing the training and test data.
+#' @param lambda Lambda parameter to pass to
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param type Type of prediction made by
+#' \code{\link[glmnet]{predict.glmnet}}.
+#' @param features A function computing the feature values (a matrix,
+#' list of numeric vectors or data.frame) or a list of functions
+#' computing individual feature values.
+#' @param ... Additional parameters to pass to \code{NLoNModel}.
+#' @return A vector of length \code{sum(is.na(data$response))} with
+#' the prediction of the test data.
+#' @seealso \code{\link{NLoNModel}}
+#' @seealso \code{\link{NLoNPredict}}
+#' @seealso \code{\link[glmnet]{glmnet}}
+#' @seealso \code{\link[glmnet]{predict.glmnet}}
+#' @export
+NLoN <- function(data, lambda=NULL, type="class",
+                 features=TriGramsAndFeatures, ...) {
+  data <- as.data.table(data)
+  model <- data[!is.na(response), NLoNModel(text, response, features, ...)]
+  NLoNPredict(model, data[is.na(response), text], lambda, type, features)
+}
diff --git a/data/nlon.data.rda b/data/nlon.data.rda new file mode 100644 index 0000000..f9bd106 Binary files /dev/null and b/data/nlon.data.rda differ
diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..a742283 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,20 @@
+citHeader("To cite NLoN in publications use:")
+
+citEntry(entry = "InProceedings",
+         title = "Natural Language or Not (NLoN) - A Package for Software Engineering Text Analysis Pipeline",
+         author = personList(as.person("Mika Mäntylä"),
+                             as.person("Fabio Calefato"),
+                             as.person("Maëlick Claes")),
+         booktitle = "Proceedings of the International Conference on Mining Software Repositories (MSR)",
+         year = "2018",
+         url = "http://TODO",
+
+         textVersion =
+         paste("Mika Mäntylä, Fabio Calefato, Maëlick Claes (2018).",
+               "Natural Language or Not (NLoN) - A Package for",
+               "Software Engineering Text Analysis Pipeline.",
+               "In Proceedings of the International Conference on",
+               "Mining Software Repositories (MSR).",
+               "URL http://TODO"))
diff --git a/inst/extdata/mysql_sw_wo_code_words.txt b/inst/extdata/mysql_sw_wo_code_words.txt new file mode 100644 index 0000000..8e7585b --- /dev/null +++ b/inst/extdata/mysql_sw_wo_code_words.txt @@ -0,0 +1,531 @@ +a's +able +about +above +according +accordingly +across +actually +after +afterwards +again +against +ain't +all +allow +allows +almost +alone +along +already +also +although +always +am +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +aside +ask +asking +associated +at +available +away +awfully +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +c'mon +came +can +can't +cannot +cant +cause +causes +certain +certainly +changes +clearly +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +currently +definitely +described +despite +did +didn't +different +do +does +doesn't +doing +don't +done +down +downwards +during +each +edu +eg +eight +either +else +elsewhere +enough +entirely +especially +et +etc +even +ever +every +everybody +everyone +everything +everywhere +exactly +example +except +far +few +fifth +first +five +followed +following +follows +former +formerly +forth +four +from +further +furthermore +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +happens
+hardly +has +hasn't +have +haven't +having +he +he's +hello +help +hence +her +here +here's +hereafter +hereby +herein +hereupon +hers +herself +hi +him +himself +his +hither +hopefully +how +howbeit +however +i'd +i'll +i'm +i've +ie +if +ignored +immediate +in +inasmuch +inc +indeed +indicate +indicated +indicates +inner +insofar +instead +into +inward +is +isn't +it +it'd +it'll +it's +its +itself +just +keep +keeps +kept +know +known +knows +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +little +look +looking +looks +ltd +mainly +many +may +maybe +me +mean +meanwhile +merely +might +more +moreover +most +mostly +much +must +my +myself +name +namely +near +nearly +necessary +need +needs +neither +never +nevertheless +new +next +nine +no +nobody +non +none +noone +nor +normally +not +nothing +novel +now +nowhere +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +per +perhaps +placed +please +plus +possible +presumably +probably +provides +que +quite +rather +really +reasonably +regarding +regardless +regards +relatively +respectively +right +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +she +should +shouldn't +since +six +so +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +t's +take +taken +tell +tends +than +thank +thanks +thanx +that +that's +thats +the +their +theirs +them +themselves +then +thence +there +there's +thereafter +thereby +therefore +therein +theres +thereupon +these +they +they'd +they'll +they're +they've +think +third +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +to +together +too +took +toward +towards +tried +tries +truly +trying +twice +two +under +unfortunately +unless +unlikely +until +unto +up +upon +us +use +used +useful +uses +using +usually +value +various +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +we'll +we're +we've +welcome +well +went +were +weren't +what +what's +whatever +when +whence +whenever +where +where's +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +whither +who +who's +whoever +whole +whom +whose +why +will +willing +wish +with +within +without +won't +wonder +would +wouldn't +yes +yet +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves +zero diff --git a/man/Character3Grams.Rd b/man/Character3Grams.Rd new file mode 100644 index 0000000..1ac70c5 --- /dev/null +++ b/man/Character3Grams.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/features.R +\name{Character3Grams} +\alias{Character3Grams} +\title{Character 3-gram extraction.} +\usage{ +Character3Grams(text) +} +\arguments{ +\item{text}{The text.} +} +\value{ +A document term matrix (sparse Matrix). +} +\description{ +Computes the document term matrix of character 3-gram. 
+}
diff --git a/man/FeatureExtraction.Rd b/man/FeatureExtraction.Rd new file mode 100644 index 0000000..eb0541f --- /dev/null +++ b/man/FeatureExtraction.Rd @@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/features.R
+\name{FeatureExtraction}
+\alias{FeatureExtraction}
+\title{Feature extraction.}
+\usage{
+FeatureExtraction(text)
+}
+\arguments{
+\item{text}{The text.}
+}
+\value{
+A data.table with values of the 11 features.
+}
+\description{
+Computes a set of simple text-based features.
+}
+\details{
+The features computed are the following:
+\describe{
+  \item{\code{ratio.caps}}{The ratio of uppercase letters.}
+  \item{\code{ratio.specials}}{The ratio of special characters.}
+  \item{\code{ratio.numbers}}{The ratio of number characters.}
+  \item{\code{length.words}}{The average word length.}
+  \item{\code{stopwords}}{The ratio of English stopwords (using the first
+  tokenizer).}
+  \item{\code{stopwords2}}{The ratio of English stopwords (using the second
+  tokenizer).}
+  \item{\code{last.char.code}}{Boolean for the use of a code character at
+  the end of the text.}
+  \item{\code{last.char.nl}}{Boolean for the use of a natural language
+  (punctuation) character at the end of the text.}
+  \item{\code{first.3.chars.letters}}{Number of letters among the first
+  three non-whitespace characters.}
+  \item{\code{emoticons}}{Number of emoticons.}
+  \item{\code{first.char.at}}{Boolean for the use of the @ character at
+  the beginning of the line.}
+}
+}
diff --git a/man/NLoN.Rd b/man/NLoN.Rd new file mode 100644 index 0000000..f039d3a --- /dev/null +++ b/man/NLoN.Rd @@ -0,0 +1,71 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/NLoN.R, R/model.R
+\docType{package}
+\name{NLoN}
+\alias{NLoN}
+\alias{NLoN-package}
+\title{NLoN: Natural Language or Not}
+\usage{
+NLoN(data, lambda = NULL, type = "class", features = TriGramsAndFeatures,
+  ...)
+}
+\arguments{
+\item{data}{A data.frame containing the training and test data.}
+
+\item{lambda}{Lambda parameter to pass to
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{type}{Type of prediction made by
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{features}{A function computing the feature values (a matrix,
+list of numeric vectors or data.frame) or a list of functions
+computing individual feature values.}
+
+\item{...}{Additional parameters to pass to \code{NLoNModel}.}
+}
+\value{
+A vector of length \code{sum(is.na(data$response))} with
  the prediction of the test data.
+}
+\description{
+NLoN identifies whether text lines are natural language or not
+using a glmnet model with simple text features and character
+3-grams.
+
+Trains a NLoN model and returns predictions for the data without a
+response.
+}
+\details{
+The data.frame must contain a column \code{text} with both training
+and test data and a column \code{response} with the response value
+(factor with levels NL and Not). The response is NA for test data.
+}
+\examples{
+
+## Training data provided in the package.
+data(nlon.data)
+
+## Build a model with glmnet.
+model <- with(nlon.data, NLoNModel(text, rater2, TriGramsAndFeatures))
+
+## Use the model to predict new data.
+topredict <- c("This is natural language.", "not(natural, language);")
+NLoNPredict(model, topredict, 0.1, features=FeatureExtraction)
+
+## Train and predict in a single function call.
+NLoN(rbind(nlon.data[, list(text, response=rater2)],
+           list(text=topredict), fill=TRUE),
+     0.1, features=FeatureExtraction)
+
+}
+\seealso{
+\code{\link{NLoNModel}}
+
+\code{\link{NLoNPredict}}
+
+\code{\link[glmnet]{glmnet}}
+
+\code{\link[glmnet]{predict.glmnet}}
+}
diff --git a/man/NLoNModel.Rd b/man/NLoNModel.Rd new file mode 100644 index 0000000..af75cb0 --- /dev/null +++ b/man/NLoNModel.Rd @@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/model.R
+\name{NLoNModel}
+\alias{NLoNModel}
+\title{NLoN Model.}
+\usage{
+NLoNModel(text, response, features = TriGramsAndFeatures, alpha = 1, ...)
+}
+\arguments{
+\item{text}{A character vector containing the training text.}
+
+\item{response}{A factor response with levels NL (for natural
+language) and Not (for not natural language).}
+
+\item{features}{A function computing the feature values (a matrix,
+list of numeric vectors or data.frame) or a list of functions
+computing individual feature values.}
+
+\item{alpha}{The elasticnet mixing parameter used by
+\code{\link[glmnet]{glmnet}}.}
+
+\item{...}{Additional parameters to pass to
+\code{\link[glmnet]{glmnet}}.}
+}
+\value{
+A \code{\link[glmnet]{glmnet}} trained model.
+}
+\description{
+Train a NLoN model using glmnet.
+}
+\seealso{
+\code{\link[glmnet]{glmnet}}
+}
diff --git a/man/NLoNPredict.Rd b/man/NLoNPredict.Rd new file mode 100644 index 0000000..d429070 --- /dev/null +++ b/man/NLoNPredict.Rd @@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/model.R
+\name{NLoNPredict}
+\alias{NLoNPredict}
+\title{NLoN Prediction.}
+\usage{
+NLoNPredict(model, text, lambda = NULL, type = "class",
+  features = TriGramsAndFeatures)
+}
+\arguments{
+\item{model}{A glmnet model as returned by \code{\link{NLoNModel}}.}
+
+\item{text}{A character vector containing the text to predict.}
+
+\item{lambda}{Lambda parameter to pass to
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{type}{Type of prediction made by
+\code{\link[glmnet]{predict.glmnet}}.}
+
+\item{features}{A function computing the feature values (a matrix,
+list of numeric vectors or data.frame) or a list of functions
+computing individual feature values.}
+}
+\value{
+The output of \code{\link[glmnet]{predict.glmnet}}.
+}
+\description{
+Predict whether text lines are natural language with a trained
+NLoN model.
+}
+\seealso{
+\code{\link[glmnet]{predict.glmnet}}
+}
diff --git a/man/TriGramsAndFeatures.Rd b/man/TriGramsAndFeatures.Rd new file mode 100644 index 0000000..f17f9cc --- /dev/null +++ b/man/TriGramsAndFeatures.Rd @@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/features.R
+\name{TriGramsAndFeatures}
+\alias{TriGramsAndFeatures}
+\title{3-grams and feature extraction.}
+\usage{
+TriGramsAndFeatures(text)
+}
+\arguments{
+\item{text}{The text.}
+}
+\value{
+A sparse Matrix with text features and 3-grams.
+}
+\description{
+Computes both character 3-grams and simple text features.
+}
+\seealso{
+\code{\link{Character3Grams}}
+
+\code{\link{FeatureExtraction}}
+}
diff --git a/man/features.Rd b/man/features.Rd new file mode 100644 index 0000000..7d6410a --- /dev/null +++ b/man/features.Rd @@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/features.R
+\docType{data}
+\name{features}
+\alias{features}
+\title{Features.}
+\format{An object of class \code{module} (inherits from \code{list}) of length 19.}
+\usage{
+features
+}
+\description{
+Module containing functions for individual simple text feature
+extraction.
+}
+\details{
+Most functions have a single \code{text} parameter. The module
+contains the following functions:
+
+\describe{
+  \item{\code{Stopwords}}{Number of stopwords. Uses two optional
+  parameters: \code{Tokenize}, the word tokenizer to use, and
+  \code{stopwords}, the list of stopwords to use.}
+  \item{\code{Tokenize1}}{First tokenizer available for
+  \code{Stopwords}.}
+  \item{\code{Tokenize2}}{Second tokenizer available for
+  \code{Stopwords}.}
+  \item{\code{StopwordsRatio1}}{Ratio of stopwords using \code{Tokenize1}.}
+  \item{\code{StopwordsRatio2}}{Ratio of stopwords using \code{Tokenize2}.}
+  \item{\code{Caps}}{Number of uppercase letters.}
+  \item{\code{CapsRatio}}{Ratio of uppercase letters.}
+  \item{\code{SpecialChars}}{Number of special characters.}
+  \item{\code{SpecialCharsRatio}}{Ratio of special characters.}
+  \item{\code{Numbers}}{Number of digit characters.}
+  \item{\code{NumbersRatio}}{Ratio of digit characters.}
+  \item{\code{Words}}{Number of words.}
+  \item{\code{AverageWordLength}}{Average word length.}
+  \item{\code{LastCharCode}}{Boolean for the use of a code character at the
+  end of the text.}
+  \item{\code{LastCharNL}}{Boolean for the use of a natural language
+  (punctuation) character at the end of the text.}
+  \item{\code{First3Chars}}{Returns the first three non-whitespace
+  characters.}
+  \item{\code{First3CharsLetters}}{Number of letters among the first three
+  non-whitespace characters.}
+  \item{\code{Emoticons}}{The number of emoticons.}
+  \item{\code{StartWithAt}}{Boolean for the use of @ at the start of
+  the text.}
+}
+}
+\keyword{datasets}
diff --git a/man/nlon.data.Rd b/man/nlon.data.Rd new file mode 100644 index 0000000..5187a3e --- /dev/null +++ b/man/nlon.data.Rd @@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{nlon.data}
+\alias{nlon.data}
+\title{Training data used in the NLoN paper.}
+\format{A data frame with 6000 rows and 4 columns:
+\describe{
+  \item{source}{source of the text (mozilla, kubernetes or lucene).}
+  \item{text}{line of text.}
+  \item{rater1}{response from the first rater.}
+  \item{rater2}{response from the second rater.}
+}}
+\source{
+\url{https://bugzilla.mozilla.org/}
+\url{https://kubernetes.io/}
+\url{https://lucene.apache.org/}
+}
+\usage{
+nlon.data
+}
+\description{
+A dataset containing 6000 lines of text (2000 each from the Mozilla
+Firefox, Kubernetes and Lucene datasets) alongside two response
+variables from two different raters.
+} +\keyword{datasets} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..7fae518 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(NLoN) + +test_check("NLoN") diff --git a/tests/testthat/test-features.R b/tests/testthat/test-features.R new file mode 100644 index 0000000..5304b55 --- /dev/null +++ b/tests/testthat/test-features.R @@ -0,0 +1,137 @@ +context("features") + +test_that("Stopwords works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "This isn't some text.", "This is.") + expect_equal(features$Stopwords(text, features$Tokenize1), + c(0, 0, 0, 0, 3, 3, 1)) + expect_equal(features$Stopwords(text, features$Tokenize2), + c(0, 0, 0, 0, 3, 3, 2)) +}) + +test_that("Caps works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "A", "ABC", "aaABCaa") + expect_equal(features$Caps(text), + c(0, 0, 0, 0, 1, 1, 3, 3)) +}) + +test_that("SpecialChars works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "x-y", "test;", "just some text") + expect_equal(features$SpecialChars(text), + c(0, 0, 0, 4, 1, 1, 1, 0)) +}) + +test_that("Numbers works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "There is 1 number.") + expect_equal(features$Numbers(text), + c(0, 0, 3, 0, 0, 1)) +}) + +test_that("Words works", { + text <- c("", "text", "123", "!@#$", "This is some text.", + "one-word.", "abc!def") + expect_equal(features$Words(text), + c(1, 1, 1, 1, 4, 1, 1)) +}) + +test_that("AverageWordLength works", { + text <- c("", "123", "123 123", "1", "!2c$", "abc def!", "1 234") + expect_equal(features$AverageWordLength(text), + c(0, 3, 3.5, 1, 4, 4, 2.5)) +}) + +test_that("LastCharCode works", { + text <- c("", "This is text.", "func(x, y);", "if (true) {", + "func()", ":-)", "Hello ;-)", ":)", ":-(", ":(", ":-))") + expect_equal(features$LastCharCode(text), + c(FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE)) +}) + +test_that("LastCharNL works", { + text <- c("", "abc", "1", ".", "!", "?", "? ", ":", ",", "Hello!!") + expect_equal(features$LastCharNL(text), + c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, + TRUE, TRUE)) +}) + +test_that("LastCharCode and LastCharNL are not both true", { + text <- c("", "This is text.", "func(x, y);", "if (true) {", + "func()", ":-)", "Hello ;-)", ":)", ":-(", ":(", ":-))", + "", "abc", "1", ".", "!", "?", "? 
", ":", ",", "Hello!!") + expect_true(all(!features$LastCharCode(text) | !features$LastCharNL(text))) +}) + +test_that("First3Chars works", { + text <- c("", "1", "1 2345", " 12345", " 12345") + expect_equal(features$First3Chars(text), + c("", "1", "123", "123", "123")) +}) + +test_that("First3CharsLetters works", { + text <- c("", "a", " a", " abc", "!@#abc", " a2#d", "Hello", + "123", "!@#", "H3ll0") + expect_equal(features$First3CharsLetters(text), + c(0, 1, 1, 3, 0, 1, 3, 0, 0, 2)) +}) + +test_that("Emoticons works", { + text <- c("", "123", "abc", ":--)", ":-)", "Hello ;-)", "\":-)\";", + ":)", ";)", ":-(", ":(", ":(:)", ":) :) :)", ":):(:") + expect_equal(features$Emoticons(text), + c(0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2)) +}) + +test_that("StartWithAt works", { + text <- c("", "abc", "123", "!@#", "@abc", "@", " @") + expect_equal(features$StartWithAt(text), + c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE)) +}) + +test_that("FeatureExtraction works", { + text <- c("", "abc", "123", "!@#") + expect_equal(names(FeatureExtraction(text)), + c("ratio.caps", "ratio.specials", "ratio.numbers", + "length.words", "stopwords", "stopwords2", + "last.char.code", "last.char.nl", + "first.3.chars.letters", "emoticons", "first.char.at")) +}) + +test_that("Character3Grams works", { + text <- c("", "abcd", "1234", "!@#$", "1234abcd") + res <- Character3Grams(text) + expect_equal(dim(res), c(5, 7)) + expect_equal(colnames(res), + c("0ab", "@#$", "00a", "!@#", "bcd", "abc", "000")) +}) + +test_that("ConvertFeatures works", { + text <- c("", "abcd", "1234", "!@#$", "1234abcd") + expect_true(inherits(ConvertFeatures(Character3Grams(text)), "Matrix")) + expect_true(inherits(ConvertFeatures(TriGramsAndFeatures(text)), "Matrix")) + expect_true(is.matrix(ConvertFeatures(as.list(FeatureExtraction(text))))) + expect_true(is.matrix(ConvertFeatures(FeatureExtraction(text)))) + expect_error(ConvertFeatures(list(1:9, 1:10))) + expect_error(ConvertFeatures(data.table(1:26, letters))) +}) + +test_that("ComputeFeatures works", { + text <- c("", "abcd", "1234", "!@#$", "1234abcd") + expect_true(inherits(ComputeFeatures(text, Character3Grams), "Matrix")) + expect_true(inherits(ComputeFeatures(text, TriGramsAndFeatures), "Matrix")) + expect_true(is.data.frame(ComputeFeatures(text, FeatureExtraction))) + + res <- ComputeFeatures(text, list(caps=features$Caps, + special.chars=features$SpecialChars, + numbers=features$Numbers, + words=features$Words)) + expect_true(is.matrix(res)) + expect_warning(ComputeFeatures(text, list(features$Caps, + features$SpecialChars, + features$Numbers, + features$Words))) + expect_error(ComputeFeatures(text, list(1, 2, 3))) +}) diff --git a/tests/testthat/test-old-features.R b/tests/testthat/test-old-features.R new file mode 100644 index 0000000..909b196 --- /dev/null +++ b/tests/testthat/test-old-features.R @@ -0,0 +1,51 @@ +context("Regression test for feature extraction") + +FeatureExtractionOld <- function(labeled) { + labeled <- copy(labeled) + myslq_sw <- system.file("extdata", "mysql_sw_wo_code_words.txt", + package="NLoN", mustWork=TRUE) + myslq_sw <- read.csv(myslq_sw, stringsAsFactors=FALSE, header=FALSE) + #Not same encoding as in cubernets. not used. 
+  labeled$length <- nchar(labeled$text)
+  labeled$ratio.caps <- plyr::ldply(stringr::str_match_all(labeled$text,"[A-Z]"),length)/labeled$length
+  labeled$ratio.specials <- plyr::ldply(stringr::str_match_all(labeled$text,"[^a-zA-Z\\d\\s]"),length)/labeled$length
+  labeled$ratio.numbers <- plyr::ldply(stringr::str_match_all(labeled$text,"[\\d]"),length)/labeled$length
+
+  labeled$length.words <- labeled$length /(plyr::ldply(stringr::str_match_all(labeled$text,"[\\s+]"),length) +1)
+  labeled$words <- plyr::ldply(stringr::str_match_all(labeled$text,"[\\s+]"),length) +1
+  # We count stopwords twice with two different tokenizers.
+  labeled[,stopwords:= sapply(text, function(x) sum(tolower(tokenizers::tokenize_regex(x, pattern = "\\s+", simplify = TRUE)) %in% mysql_sw$V1))/ labeled$words]
+  labeled[,stopwords2:= sapply(text, function(x) sum(tokenizers::tokenize_words(x, simplify = TRUE) %in% mysql_sw$V1))/ labeled$words]
+  # If the line ends with an emoticon this is not code :-) so we
+  # subtract. A lookahead regex (perl=TRUE) would also solve this.
+  labeled[,last.char.code:= sapply(text, function(x) max(c(sum(grep("\\)$|\\{$|;$", x)) - sum(grep(":-\\)$|;-\\)|:\\)$|;\\)$|:-\\($|:\\($", x)), 0)))]
+
+  labeled[,last.char.nl:= sapply(text, function(x) sum(grep("\\.$|\\!$|\\?$|:$|,$", x)))]
+  labeled$first.3.chars.letters <- plyr::ldply(stringr::str_match_all(substr(gsub("\\s", "", labeled$text), 1,3), "[a-zA-Z]"),length)
+  labeled$emoticons <- plyr::ldply(stringr::str_match_all(labeled$text,":-\\)|;-\\)|:\\)|;\\)|:-\\(|:\\("),length)
+  # Sanity check: the following should be empty.
+  #labeled[last.char.nl==1 & last.char.code==1]$text
+
+  # New features from Kubernetes.
+  # Starts with @ sign.
+  labeled[,first.char.at:= sapply(text, function(x) sum(grep("^@", x)))]
+  labeled
+}
+
+test_that("FeatureExtraction gives same results as FeatureExtractionOld", {
+  skip_on_cran()
+  data(nlon.data)
+  res <- FeatureExtraction(nlon.data$text)
+  res.old <- FeatureExtractionOld(nlon.data)
+
+  expect_equal(res$ratio.caps, res.old$ratio.caps)
+  expect_equal(res$ratio.specials, res.old$ratio.specials)
+  expect_equal(res$ratio.numbers, res.old$ratio.numbers)
+  expect_equal(res$length.words, res.old$length.words)
+  expect_equal(res$stopwords, res.old$stopwords)
+  expect_equal(res$stopwords2, res.old$stopwords2)
+  expect_equal(res$last.char.code, res.old$last.char.code)
+  expect_equal(res$last.char.nl, res.old$last.char.nl)
+  expect_equal(res$first.3.chars.letters, res.old$first.3.chars.letters)
+  expect_equal(res$emoticons, res.old$emoticons)
+  expect_equal(res$first.char.at, res.old$first.char.at)
+})
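R/evaluation.R is added empty in this change. Until it is filled in, a minimal hold-out evaluation of the shipped pipeline can be sketched as follows; the 80/20 split, the seed, and lambda=0.1 are arbitrary illustrative choices, not values from the NLoN paper:

library(NLoN)
data(nlon.data)
set.seed(1)
idx <- sample(nrow(nlon.data), 0.8 * nrow(nlon.data))
model <- with(nlon.data[idx], NLoNModel(text, rater2))
pred <- NLoNPredict(model, nlon.data[-idx, text], lambda=0.1)
mean(as.vector(pred) == nlon.data[-idx, rater2])  # hold-out accuracy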