In [1]:
# Mohammed Kazamel

In [2]:
function load_reviews(parent_path)
    println("loading reviews...")
    file_paths = ["/train/pos/", "/train/neg/", "/test/pos/", "/test/neg/"]
    four_sets = []
    for file_path in parent_path .* file_paths # for each directory
        review_list = []
        for file_name in readdir(file_path) # for each file in a directory
            s = open(file_path * file_name) do file 
                read(file, String)
            end
            push!(review_list, s) # append review
        end
        push!(four_sets, review_list)
    end
    print("lengths of the four sets are: ", length.(four_sets))
    return four_sets
end

load_reviews (generic function with 1 method)

In [3]:
parent_path = "./aclImdb"
train_pos, train_neg, test_pos, test_neg = load_reviews(parent_path);

loading reviews...
lengths of the four sets are: [12500, 12500, 12500, 12500]

In [4]:
function clean_and_tokenize(review) # returns the review as an array of words
    review = lowercase(review)
    review = replace(review, r"<.*?>" => "")
    review = [r.match for r in collect(eachmatch(r"[a-zA-Z0-9-']+|([!?])", review))]
end

clean_and_tokenize (generic function with 1 method)

In [5]:
train_pos = clean_and_tokenize.(train_pos);
train_neg = clean_and_tokenize.(train_neg);
test_pos = clean_and_tokenize.(test_pos);
test_neg = clean_and_tokenize.(test_neg);

In [6]:
function train_frequency_calc(train_pos, train_neg)
    num_of_words_pos = 0
    num_of_words_neg = 0
    positive_word_frequencies = Dict() # no. of occurences in positive reviews
    negative_word_frequencies = Dict() # no. of occurences in negative reviews
    total_word_frequencies = Dict() # no. of occurences in all training reviews
    for review in train_pos
        num_of_words_pos += length(review)
        for token in review
            if haskey(positive_word_frequencies, token)
                positive_word_frequencies[token] += 1
            else
                positive_word_frequencies[token] = 1
            end
            
            if haskey(total_word_frequencies, token)
                total_word_frequencies[token] += 1
            else
                total_word_frequencies[token] = 1
            end
        end
    end
    
    for review in train_neg
        num_of_words_neg += length(review)
        for token in review
            if haskey(negative_word_frequencies, token)
                negative_word_frequencies[token] += 1
            else
                negative_word_frequencies[token] = 1
            end
            
            if haskey(total_word_frequencies, token)
                total_word_frequencies[token] += 1
            else
                total_word_frequencies[token] = 1
            end
        end
    end
    
    for (k,v) in total_word_frequencies
        if !haskey(positive_word_frequencies, k)
            positive_word_frequencies[k] = 0
        end
        if !haskey(negative_word_frequencies, k)
            negative_word_frequencies[k] = 0
        end
        # add-one smoothing 
        total_word_frequencies[k] += 1
        positive_word_frequencies[k] += 1
        negative_word_frequencies[k] += 1
    end
    
    
    return (num_of_words_pos, num_of_words_neg,
        positive_word_frequencies, negative_word_frequencies, total_word_frequencies)
end

train_frequency_calc (generic function with 1 method)

In [7]:
(num_of_words_pos, num_of_words_neg, positive_word_frequencies,
    negative_word_frequencies, total_word_frequencies) = train_frequency_calc(train_pos, train_neg);

In [8]:
# P(W|C1), P(W|C2), P(C1), P(C2)
# p(w|C) = num of w in c / num of words in c
# p(C) = num of C examples / num of all examples
function calc_probs(num_of_words_pos, num_of_words_neg, positive_word_frequencies,
        negative_word_frequencies, num_pos_examples, num_neg_examples)
    
    pos_word_probs, neg_word_probs = Dict(), Dict()
    prob_of_pos_class = num_pos_examples/(num_pos_examples + num_neg_examples)
    prob_of_neg_class = num_neg_examples/(num_pos_examples + num_neg_examples)
    
    for (k,v) in positive_word_frequencies
        pos_word_probs[k] = v/num_of_words_pos
    end
    
    for (k,v) in negative_word_frequencies
        neg_word_probs[k] = v/num_of_words_neg
    end
    # adding UNK
    pos_word_probs["UNK"] = 0.5
    neg_word_probs["UNK"] = 0.5
    # 0.5 since we didn't assign anything to it
    
    return pos_word_probs, neg_word_probs, prob_of_pos_class, prob_of_neg_class
end

calc_probs (generic function with 1 method)

In [9]:
pos_word_probs, neg_word_probs, prob_of_pos_class, prob_of_neg_class = calc_probs(
    num_of_words_pos, num_of_words_neg, positive_word_frequencies,
    negative_word_frequencies, length(train_pos), length(train_neg));

In [10]:
function predict_class(review, pos_word_probs, neg_word_probs, prob_of_pos_class, prob_of_neg_class) # cleaned
    pos_log_prob = 0
    neg_log_prob = 0
    for word in review
        pos_log_prob += log(get(pos_word_probs, word, pos_word_probs["UNK"]))
        neg_log_prob += log(get(neg_word_probs, word, neg_word_probs["UNK"]))
    end
    pos_log_prob += log(prob_of_pos_class)
    neg_log_prob += log(prob_of_neg_class)
    return (pos_log_prob > neg_log_prob) ? 1 : 0 # 1 for positive, 0 for negative
end

predict_class (generic function with 1 method)

In [11]:
function calc_acc(test_pos, test_neg, pos_word_probs, neg_word_probs, 
        prob_of_pos_class, prob_of_neg_class, num_pos_examples, num_neg_examples)
    accuracy = 0
    for review in test_pos
        if predict_class(review, pos_word_probs, neg_word_probs, prob_of_pos_class, prob_of_neg_class) == 1
            accuracy += 1
        end
    end
    for review in test_neg
        if predict_class(review, pos_word_probs, neg_word_probs, prob_of_pos_class, prob_of_neg_class) == 0
            accuracy += 1
        end
    end
    accuracy /= (num_pos_examples + num_neg_examples)
    print(accuracy)
    return accuracy
end

calc_acc (generic function with 1 method)

In [12]:
accuracy = calc_acc(test_pos, test_neg, pos_word_probs, neg_word_probs, 
    prob_of_pos_class, prob_of_neg_class, length(train_pos), length(train_neg))

0.81372

0.81372