# Packages

In [6]:
using Pkg
#haskey(Pkg.installed(), "Languages") || Pkg.add("Languages")
using Knet, Plots, Statistics, LinearAlgebra, Random
using Languages

# Constants

In [131]:
#=Citation: Publications Using the Dataset. Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).=#
# Remote location of the gzipped IMDB review archive (see citation above).
DATA_URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Name of the dataset directory expected inside DATA_DIR; `_download_reviews`
# checks for "$DATA_DIR/$FILE_NAME" to decide whether the data is already present.
FILE_NAME = "aclImdb"
# Local directory that holds the dataset; created by `_download_reviews` if missing.
DATA_DIR = "./Data"
# NOTE(review): presumably the maximum n-gram order; it is not referenced by any
# code visible in this part of the file -- confirm against the rest of the notebook.
NGRAM = 5
# Evaluates to `nothing` so the cell's last expression is not a constant value.
nothing

# Download and preprocess data

In [127]:
"""
    review(txt, label)

One labelled review. `txt` holds the review's words (a list of tokens) and
`label` holds the class the review belongs to. Both fields are untyped.
"""
struct review
    txt
    label
end

"""
    _download_reviews()

Make sure the review dataset is available locally and return the path of the
extracted dataset directory (`DATA_DIR/FILE_NAME`).

When that directory does not exist yet, the archive named by `DATA_URL` is
downloaded into `DATA_DIR` (unless a copy is already sitting there), unpacked
with the system `tar`, and the archive is deleted afterwards. When the
directory already exists nothing is downloaded or extracted.

Throws whatever `download` or `run` throw when the transfer or the extraction
fails; in that case the archive is left in place.
"""
function _download_reviews()
    # mkpath also creates missing parents and is a no-op for an existing directory.
    isdir(DATA_DIR) || mkpath(DATA_DIR)
    path = joinpath(DATA_DIR, FILE_NAME)
    if !isdir(path)
        # Keep the archive next to the extracted data, named as on the server.
        archive = joinpath(DATA_DIR, basename(DATA_URL))
        if !isfile(archive)
            println("Downloading the data from the internet...")
            download(DATA_URL, archive)
        end
        println("Extracting the data...")
        # -C takes the *value* of DATA_DIR; tar detects the compression itself.
        # Verbose mode is left off: the archive holds on the order of 100k files.
        run(`tar -xf $archive -C $DATA_DIR`)
        println("Finished.")
        rm(archive)
    end
    return path
end

# Lookup table of English stopwords: every stopword maps to `true`, so a
# membership test is a single dictionary lookup.
_stopwords = Dict(zip(stopwords(Languages.English()), Iterators.repeated(true)))

"""
    clean_text(txt, stops = _stopwords)

Turn the raw review text `txt` into a vector of lowercase word tokens.

Steps:
1. every character that is not an ASCII letter, whitespace or a hyphen is
   replaced by a space;
2. double hyphens are replaced by a space (single hyphens inside words stay);
3. the text is split on whitespace and lowercased;
4. tokens of length 2 or less, and tokens that are keys of `stops`, are dropped.

# Arguments
- `txt`: the review text.
- `stops`: dictionary whose keys are the words to discard. Defaults to the
  global `_stopwords`; passing it explicitly avoids a global lookup per token.

# Returns
A vector of the remaining tokens, in their original order (empty when nothing
survives the filtering).
"""
function clean_text(txt, stops = _stopwords)
    txt = replace(txt, r"[^a-zA-Z\s-]" => " ")
    txt = replace(txt, "--" => " ")
    # split() already collapses runs of whitespace, so no separate squeeze pass.
    words = lowercase.(split(txt))
    return filter(w -> length(w) > 2 && !haskey(stops, w), words)
end

"""
    _extract_reviews(path, label = 0)

Read every file in the directory `path`, clean its contents with `clean_text`
and wrap the resulting tokens in a `review` tagged with `label`.

# Arguments
- `path`: directory whose entries are all read as review text files.
- `label`: class label stored in every returned review. NOTE(review): the
  default `0` is not a valid class index for `map_word`, which indexes its
  count vectors with the label -- callers should pass the label explicitly.

# Returns
A `Vector{review}` with one entry per file, in `readdir` order.
"""
function _extract_reviews(path, label = 0)
    # Typed comprehension: the result is a Vector{review} even for an empty directory.
    return review[
        review(clean_text(read(joinpath(path, f), String)), label) for f in readdir(path)
    ]
end


Finsied extracting and cleaning 25000 training reviews and 25000 testing reviews


# Define an ngram model

In [133]:
"""
    ngram(lst, n)

Return a new vector holding the tokens of `lst` followed by every contiguous
2-gram, then every 3-gram, and so on up to `n`-grams (or up to the full length
of `lst` if that is shorter). The tokens of each n-gram are joined with `_`.
`lst` itself is not modified.
"""
function ngram(lst, n)
    total = length(lst)
    grams = copy(lst)
    for width in 2:min(n, total)
        # Slide a window of `width` tokens across the list, left to right.
        for start in 1:(total - width + 1)
            push!(grams, join(lst[start:(start + width - 1)], "_"))
        end
    end
    return grams
end
#process the data

25000-element Array{review,1}:
 review(["bromwell", "cartoon", "comedy", "ran", "time", "programs", "school", "life", "teachers", "teaching"  …  "classic_line_line_inspector_inspector_sack_sack_teachers_teachers_student", "line_inspector_inspector_sack_sack_teachers_teachers_student_student_welcome", "inspector_sack_sack_teachers_teachers_student_student_welcome_welcome_bromwell", "sack_teachers_teachers_student_student_welcome_welcome_bromwell_bromwell_expect", "teachers_student_student_welcome_welcome_bromwell_bromwell_expect_expect_adults", "student_welcome_welcome_bromwell_bromwell_expect_expect_adults_adults_age", "welcome_bromwell_bromwell_expect_expect_adults_adults_age_age_bromwell", "bromwell_expect_expect_adults_adults_age_age_bromwell_bromwell_fetched", "expect_adults_adults_age_age_bromwell_bromwell_fetched_fetched_pity", "adults_age_age_bromwell_bromwell_fetched_fetched_pity_pity_isn"], 1)                                                                                     

# Training

In [134]:
"""
    NaiveBayes2(class_prob, map_prob)

Container for a Naive Bayes model. `predict` reads `class_prob[c]` as the
weight of class `c` and `map_prob[w][c]` as the per-class value of word `w`.
Both fields are untyped.
"""
struct NaiveBayes2
    class_prob
    map_prob
end

"""
    map_word(dtrn, nclass = length(c2i))

Assign a numeric id to every distinct word in `dtrn` and count how often each
word occurs in reviews of each class.

# Arguments
- `dtrn`: iterable of items with a `txt` field (iterable of words) and a
  `label` field. Labels are used as indices into the count vectors, so they
  must lie in `1:nclass`.
- `nclass`: number of classes, i.e. the length of every count vector. Defaults
  to the size of the global `c2i`, which must then be defined at call time.

# Returns
A tuple `(words_cnt, w2n, n2w)`:
- `words_cnt[word]` is a vector whose entry `c` is the number of occurrences
  of `word` in reviews labelled `c`;
- `w2n[word]` is the word's id; ids are handed out as 1, 2, 3, ... in order of
  first appearance;
- `n2w[id]` is the inverse mapping.
"""
function map_word(dtrn, nclass = length(c2i))
    w2n = Dict()
    n2w = Dict()
    words_cnt = Dict()
    for re in dtrn
        lbl = re.label
        for word in re.txt
            counts = get!(words_cnt, word) do
                # First sighting of this word: give it the next unused id.
                id = length(w2n) + 1
                w2n[word] = id
                n2w[id] = word
                zeros(Int, nclass)
            end
            counts[lbl] += 1
        end
    end
    return words_cnt, w2n, n2w
end
nothing

InterruptException: InterruptException:

# Predict

In [None]:
"""
    predict(txt, nb, eps = 10)

Return the index of the best-scoring class for the word list `txt`.

The score of class `c` is `log(nb.class_prob[c])` plus, for every word `w` in
`txt`, `log(nb.map_prob[w][c] + eps)` when the model knows `w` and `log(eps)`
otherwise. Ties go to the lowest class index.

# Arguments
- `txt`: iterable of words; may be empty, in which case only the class terms
  decide.
- `nb`: model exposing `class_prob` (indexable, one entry per class) and
  `map_prob` (word => per-class values).
- `eps`: additive smoothing term applied to every word value.
"""
function predict(txt, nb, eps = 10)
    # One score per entry of class_prob, so no global class table is needed.
    scores = map(eachindex(nb.class_prob)) do c
        # Explicit accumulator: well-defined for an empty `txt`, unlike summing
        # an empty untyped array.
        score = log(nb.class_prob[c])
        for w in txt
            score += log(haskey(nb.map_prob, w) ? nb.map_prob[w][c] + eps : eps)
        end
        score
    end
    #println("Model prediction is $(i2c[ind])")
    return argmax(scores)
end
# Raw-text convenience method: tokenize the review with `clean_text`, then score
# the resulting word list with the generic `predict(txt, nb, eps)` method.
predict(txt::String, nb) = predict(clean_text(txt), nb)

"""
    accuracy(dtst, nb)

Return the fraction of reviews in `dtst` whose stored `label` equals the class
that `predict` assigns to their `txt` under the model `nb`.
"""
function accuracy(dtst, nb)
    hits = map(dtst) do sample
        sample.label == predict(sample.txt, nb)
    end
    return mean(hits)
end
nothing

# Benchmarks for different NGRAMS models

In [None]:
# Make sure the dataset is on disk (nothing is fetched when it is already extracted).
println("Downloading data..")
path = _download_reviews()
println("Extracting data..")
# Class name <-> class index tables; the indices are also the review labels
# passed to `_extract_reviews` below.
c2i = Dict("Positive" => 1, "Negative" => 2)
i2c = Dict(1 => "Positive", 2 => "Negative")
# Training and test splits: positive reviews first (label 1), then negative (label 2).
dtrn = [_extract_reviews(joinpath(path, "train", "pos"), 1); _extract_reviews(joinpath(path, "train", "neg"), 2)]
dtst = [_extract_reviews(joinpath(path, "test", "pos"), 1); _extract_reviews(joinpath(path, "test", "neg"), 2)]
# Report the size of each split (the test count comes from dtst, not dtrn).
println("Finsied extracting and cleaning $(length(dtrn)) training reviews and $(length(dtst)) testing reviews")
for i in 1:6
    println("modeling data in $i gram model")
    dtrn2 = [review(ngram(re.txt),re.label) for re in dtrn]
    words_cnt, w2n ,n2w = map_word(dtrn)
    #find class probability
    c_p = [sum([re.label == c for re in dtrn]) for c in 1:length(c2i)]
    println("Naive Bayes model with $i gram model prodcued accuracy of $(NaiveBayes(c_p, words_cnt))")