Documentation for Bayes.jl #151

Merged: 3 commits, May 15, 2019
1 change: 1 addition & 0 deletions docs/make.jl
@@ -14,6 +14,7 @@ makedocs(
"Corpus" => "corpus.md",
"Features" => "features.md",
"Semantic Analysis" => "semantic.md",
"Classifier" => "classify.md",
"Extended Example" => "example.md"
],
)
57 changes: 57 additions & 0 deletions docs/src/classify.md
@@ -0,0 +1,57 @@
# Classifier

TextAnalysis currently offers a Naive Bayes classifier for text classification.

To load the Naive Bayes classifier, run:

    using TextAnalysis: NaiveBayesClassifier, fit!, predict

## Basic Usage

Using it involves three steps.

1. Create an instance of the Naive Bayes classifier model:

        model = NaiveBayesClassifier(dict, classes)


It takes two arguments:

* `classes`: An array of the possible classes the data can belong to.
* `dict`: (optional) An array of possible tokens (words). It is updated automatically whenever a new token is encountered in step 2 or 3.
2. Fit the model weights on the input:

        fit!(model, str, class)

3. Predict the class probabilities for the input:

        predict(model, str)

## Example

```julia
julia> m = NaiveBayesClassifier([:legal, :financial])
NaiveBayesClassifier{Symbol}(String[], Symbol[:legal, :financial], Array{Int64}(0,2))

```

```julia
julia> fit!(m, "this is financial doc", :financial)
NaiveBayesClassifier{Symbol}(["financial", "this", "is", "doc"], Symbol[:legal, :financial], [1 2; 1 2; 1 2; 1 2])

julia> fit!(m, "this is legal doc", :legal)
NaiveBayesClassifier{Symbol}(["financial", "this", "is", "doc", "legal"], Symbol[:legal, :financial], [1 2; 2 2; … ; 2 2; 2 1])

```

```julia
julia> predict(m, "this should be predicted as a legal document")
Dict{Symbol,Float64} with 2 entries:
:legal => 0.666667
:financial => 0.333333

```
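The 2:1 split in the prediction above follows directly from the Laplace-smoothed counts the classifier keeps: every count starts at 1, so no token probability is ever zero. A minimal sketch of the computation in plain Julia, using the weight matrix printed by the second `fit!` call (this reimplements the arithmetic for illustration and does not use the package):

```julia
# Rows correspond to the dictionary ["financial", "this", "is", "doc", "legal"],
# columns to the classes [:legal, :financial]. Every cell started at 1 (Laplace smoothing).
weights = [1 2; 2 2; 2 2; 2 2; 2 1]

# Feature vector for "this should be predicted as a legal document":
# only "this" and "legal" occur in the dictionary ("document" does not match "doc").
x = [0, 1, 0, 0, 1]

probs = weights ./ sum(weights, dims = 1)  # per-class token probabilities
ps = prod(probs .^ x, dims = 1)            # naive Bayes: product over observed tokens
ps = ps ./ sum(ps)                         # normalise across classes
# ps ≈ [0.6667 0.3333], i.e. :legal ≈ 2/3 and :financial ≈ 1/3, matching predict
```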
58 changes: 53 additions & 5 deletions src/bayes.jl
@@ -2,8 +2,11 @@ using WordTokenizers

export NaiveBayesClassifier

spam_tokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))
simpleTokenise(s) = WordTokenizers.tokenize(lowercase(replace(s, "."=>"")))

"""
Create a dict that maps each element of the input array to its frequency.
"""
function frequencies(xs)
frequencies = Dict{eltype(xs),Int}()
for x in xs
@@ -12,6 +15,11 @@ function frequencies(xs)
return frequencies
end
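The middle of the helper above is collapsed in the diff; its intent can be sketched in plain Julia as follows. The local name `counts` is chosen here for clarity and is not the name used in the package source:

```julia
# Count how often each element occurs in the input collection.
function frequencies(xs)
    counts = Dict{eltype(xs),Int}()
    for x in xs
        counts[x] = get(counts, x, 0) + 1  # default to 0 for unseen elements
    end
    return counts
end

frequencies(["this", "is", "this"])  # Dict("this" => 2, "is" => 1)
```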

"""
    features(::AbstractDict, dict)

Compute an `Array` of counts: for each element of `dict`, look up its frequency in the input `AbstractDict`, defaulting to zero if absent.
"""
function features(fs::AbstractDict, dict)
bag = zeros(Int, size(dict))
for i = 1:length(dict)
@@ -20,7 +28,7 @@ function features(fs::AbstractDict, dict)
return bag
end

features(s::AbstractString, dict) = features(frequencies(spam_tokenise(s)), dict)
features(s::AbstractString, dict) = features(frequencies(simpleTokenise(s)), dict)

Features{T<:Integer} = AbstractVector{T}

@@ -30,6 +38,29 @@ mutable struct NaiveBayesClassifier{T}
weights::Matrix{Int}
end

"""
NaiveBayesClassifier([dict, ]classes)

A Naive Bayes Classifier for classifying documents.

# Example
```julia-repl
julia> using TextAnalysis: NaiveBayesClassifier, fit!, predict
julia> m = NaiveBayesClassifier([:spam, :non_spam])
NaiveBayesClassifier{Symbol}(String[], Symbol[:spam, :non_spam], Array{Int64}(0,2))

julia> fit!(m, "this is spam", :spam)
NaiveBayesClassifier{Symbol}(["this", "is", "spam"], Symbol[:spam, :non_spam], [2 1; 2 1; 2 1])

julia> fit!(m, "this is not spam", :non_spam)
NaiveBayesClassifier{Symbol}(["this", "is", "spam", "not"], Symbol[:spam, :non_spam], [2 2; 2 2; 2 2; 1 2])

julia> predict(m, "is this a spam")
Dict{Symbol,Float64} with 2 entries:
:spam => 0.59883
:non_spam => 0.40117
```
"""
NaiveBayesClassifier(dict, classes) =
NaiveBayesClassifier(dict, classes,
ones(Int, length(dict), length(classes)))
@@ -38,26 +69,43 @@ NaiveBayesClassifier(classes) = NaiveBayesClassifier(String[], classes)

probabilities(c::NaiveBayesClassifier) = c.weights ./ sum(c.weights, dims = 1)

function extend!(c::NaiveBayesClassifier, class)
push!(c.dict, class)
"""
extend!(model::NaiveBayesClassifier, dictElement)

Add `dictElement` to the dictionary of the classifier `model` and grow its weight matrix by one row.
"""
function extend!(c::NaiveBayesClassifier, dictElement)
push!(c.dict, dictElement)
c.weights = vcat(c.weights, ones(Int, length(c.classes))')
return c
end

"""
fit!(model::NaiveBayesClassifier, str, class)
fit!(model::NaiveBayesClassifier, ::Features, class)

Fit the weights for the model on the input data.
"""
function fit!(c::NaiveBayesClassifier, x::Features, class)
n = findfirst(==(class), c.classes)
c.weights[:, n] .+= x
return c
end

function fit!(c::NaiveBayesClassifier, s::String, class)
fs = frequencies(spam_tokenise(s))
fs = frequencies(simpleTokenise(s))
for k in keys(fs)
k in c.dict || extend!(c, k)
end
fit!(c, features(s, c.dict), class)
end

"""
predict(::NaiveBayesClassifier, str)
predict(::NaiveBayesClassifier, ::Features)

Predict the probability of each class for the input `Features` vector or `String`.
"""
function predict(c::NaiveBayesClassifier, x::Features)
ps = prod(probabilities(c) .^ x, dims = 1)
ps ./= sum(ps)
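The product computed in `predict` is the standard naive Bayes score under a uniform class prior (consistent with the code, which includes no prior term). As a sketch, with `w_{tc}` the smoothed count of token `t` in class `c` and `x_t` the token counts of the input:

```latex
P(c \mid x) \;\propto\; \prod_{t} P(t \mid c)^{x_t},
\qquad
P(t \mid c) = \frac{w_{tc}}{\sum_{t'} w_{t'c}}
```

The line `ps ./= sum(ps)` then normalises these scores so the returned values sum to one across classes, which is why the documented examples report probabilities such as 0.666667 and 0.333333.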