# Imports 

In [1]:
using Pkg
haskey(Pkg.installed(), "ProgressBars") || Pkg.add("ProgressBars")
haskey(Pkg.installed(), "Languages") || Pkg.add("Languages")
haskey(Pkg.installed(), "RecursiveArrayTools") || Pkg.add("RecursiveArrayTools")

└ @ Pkg /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.5/Pkg/src/Pkg.jl:554
└ @ Pkg /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.5/Pkg/src/Pkg.jl:554
└ @ Pkg /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.5/Pkg/src/Pkg.jl:554


true

In [2]:
using Languages, RecursiveArrayTools
using Random: shuffle!

# Utils

In [3]:
function load(data_path::String)
    reviews = [] # tuples of rating and review
    for file in readdir(data_path)
        if endswith(file, ".txt")
            file_path = joinpath(data_path, file)
            rating = (split(split(file, "_")[2], ".")[1])
            rating = parse(Int, rating)
            f  = open(file_path, "r")
            comment = read(f, String)
            push!(reviews, [rating < 5 ? 1 : 2, split(comment, " ")])
            close(f)
        end
    end
    return reviews
end

w2i!(x) = get!(wdict, x, length(wdict)+1)
w2i(x) = get(wdict, x, wdict["UNK"])

struct NaiveBayes; prior; cnt; ngram; eps; end

# Pre-porcessing

In [4]:
function remove_stop_words!(review)
    review = review[(x -> (x ∉ _stopwords) && (x ∉ puncs)).(review)]
    return review
end

remove_stop_words! (generic function with 1 method)

In [5]:
function clean_words!(lst)
    for review in lst
        review[2] = review[2] |> remove_stop_words! |> (x -> lowercase.(x)) |> (x -> replace.(x, r"[^a-zA-Z\s-]" => ""))
    end
end

clean_words! (generic function with 1 method)

In [6]:
function count_words(lst)
    wcnt = Dict()
    for review in lst
        (x -> wcnt[x] = get(wcnt, x, 0) + 1).(review[2])
    end
    return wcnt
end

count_words (generic function with 1 method)

In [7]:
function convert_to_UNK!(lst)
# convert all less frequent words to UNK then to integer
    wcnt = count_words(dtrn)
    for review in lst
        review[2] = (x -> if get(wcnt, x, 0) < UNK_THRESHOLD; "UNK" ; else; x end).(review[2])
    end
end

convert_to_UNK! (generic function with 1 method)

In [8]:
# hash n consecutive words
hash(x) = ((i, x) -> (x * length(wcnt) ^ (i-1)) % 1e9).(1:length(x), reverse(x)) |> sum

hash (generic function with 1 method)

In [9]:
function cnt_n_grams(lst, ngram)
    # count the words according to review
    tmp = []
    cnt = [Dict() for i in 1:2]
    for review in lst
        for word in review[2]
            length(tmp) >= ngram && deleteat!(tmp, 1)
            push!(tmp, word)
            for j in reverse(1:length(tmp))
                val = join(tmp[j:length(tmp)], "-")
                cnt[review[1]][val] = get(cnt[review[1]], val, 0) + 1
            end
        end
    end
    return cnt
end

cnt_n_grams (generic function with 1 method)

In [54]:
# predict probability according to a weight matrix
function predict(review, nb)
# count the words according to review
    tmp, scores = [], zeros(length(nb.cnt))
    for word in review
        length(tmp) >= nb.ngram && deleteat!(tmp, 1)
        push!(tmp, word)
        for j in reverse(1:length(tmp))
            val = join(tmp[j:length(tmp)], "-")
            scores += [log((get(nb.cnt[i], val, 0) + nb.eps)) for i in 1:length(nb.cnt)]
        end
    end
    scores += log.(nb.prior)
    return argmax(scores)
end

predict (generic function with 1 method)

In [11]:
function accuracy(dtst, nb)
    cnt = 0
    for re in dtst
        p = predict(re[2], nb)
        cnt += (p == re[1])
    end
    return cnt/length(dtst)
end

accuracy (generic function with 1 method)

# Config

In [12]:
train_data_path = "aclImdb/test"
test_data_path = "aclImdb/train"
UNK_THRESHOLD = 10
_stopwords = vcat(stopwords(Languages.English()), prepositions(Languages.English()), pronouns(Languages.English()), articles(Languages.English()))
puncs = [".", ",", "&", "--", ""]
n_gram = 4
nothing

# Prediction

In [13]:
dtrn = vcat(load(joinpath(train_data_path, "pos")), load(joinpath(train_data_path, "neg")))
nothing

In [14]:
dtst = vcat(load(joinpath(test_data_path, "pos")), load(joinpath(test_data_path, "neg")))
nothing

In [15]:
clean_words!(vcat(dtrn, dtst))

In [17]:
convert_to_UNK!(dtrn)

In [18]:
# keep a copy of data
cp_dtrn, cp_dtst = deepcopy.([dtrn, dtst])
nothing

In [19]:
# run for restoring the data
dtrn, dtst = deepcopy.([cp_dtrn, cp_dtst])

2-element Array{Array{Any,1},1}:
 [Any[2, ["movie", "night", "UNK", "friends", "mine", "ill", "admit", "reluctant", "ashton", "kutcher"  …  "grown", "well", "trying", "desperately", "crying", "this", "movie", "great", "suggest", "judge"]], Any[2, ["actor", "director", "bill", "paxton", "follows", "promising", "debut", "UNK", "frailty", "family"  …  "it", "despite", "formulaic", "nature", "nice", "easy", "film", "root", "deserves", "audience"]], Any[2, ["as", "UNK", "UNK", "knowledge", "sports", "history", "pleased", "disneys", "sensitivity", "issues"  …  "viewer", "disney", "the", "ending", "miracle", "disney", "creation", "human", "history", "written"]], Any[2, ["film", "sneak", "preview", "delightful", "the", "cinematography", "unusually", "creative", "acting", "good"  …  "dime", "dozen", "stands", "out", "br", "br", "this", "id", "recommend", "anyone"]], Any[2, ["bill", "paxton", "true", "story", "", "us", "golf", "film", "UNK", "game"  …  "grips", "-", "final", "scene", "affectiona

In [20]:
# convert words to integers
wdict = Dict()
w2i!("UNK")
for review in dtrn
    review[2] = w2i!.(review[2])
end

In [21]:
for review in dtst
    review[2] = w2i.(review[2])
end

In [22]:
# test word mapping
words = Array{String}(undef, length(wdict))
for (str, ind) in wdict; words[ind]=str;end
println.([words[first(dtst)[2]], first(cp_dtst)[2]])
nothing

["UNK", "high", "cartoon", "comedy", "it", "ran", "time", "programs", "school", "life", "teachers", "my", "", "teaching", "profession", "lead", "believe", "UNK", "highs", "satire", "closer", "reality", "teachers", "the", "UNK", "survive", "financially", "insightful", "students", "pathetic", "teachers", "UNK", "UNK", "situation", "remind", "schools", "students", "when", "episode", "student", "repeatedly", "tried", "burn", "school", "immediately", "recalled", "", "", "high", "a", "classic", "line", "inspector", "im", "sack", "teachers", "student", "welcome", "UNK", "high", "expect", "adults", "age", "UNK", "high", "fetched", "what", "pity", "isnt"]
["bromwell", "high", "cartoon", "comedy", "it", "ran", "time", "programs", "school", "life", "teachers", "my", "", "teaching", "profession", "lead", "believe", "bromwell", "highs", "satire", "closer", "reality", "teachers", "the", "scramble", "survive", "financially", "insightful", "students", "pathetic", "teachers", "pomp", "pettiness", "situ

In [55]:
# find class priors
prior = [sum([re[1] == i for re in dtrn]) for i in 1:2]

2-element Array{Int64,1}:
 12500
 12500

In [24]:
cnt = cnt_n_grams(dtrn, n_gram)
nothing

In [25]:
# test
cnt[1][join([w2i("hate"), w2i("movie")], "-")]

37

In [42]:
model = NaiveBayes(prior, cnt, n_gram, 10)
nothing

In [27]:
shuffle!.([dtst, dtrn])
nothing

In [37]:
trn_ac = accuracy(dtrn, model)

0.99644

In [36]:
tst_ac = accuracy(dtst, model)

0.84436

In [38]:
println("Acuraccy on test set: $tst_ac")

Acuraccy on test set: 0.84436


In [39]:
println("Acuraccy on train set: $trn_ac")

Acuraccy on train set: 0.99644
