
# Assignment 3

## Text Classification

In [43]:
import os
import sys
import copy
import math
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import string
from urllib import request
import numpy as np

<hr>
Datasets for this assignment stores at the same directory with this file, into two folders as "train" and "test". There are two predefined classes, which is "spam" and "ham", and also, each class has one folder in both "train" folder and "test" folder. Within classes' folders, there are several text file(.txt) store email content which will be trained and test. <br>

### 0. Run algorithm here

In [909]:
C = ["spam", "ham"]
D = "./train/"
D_test = "./test/"
doc1 = "./test/spam/0073.2003-12-24.GP.spam.txt"
doc2 = "./test/ham/0003.1999-12-14.farmer.ham.txt"
doc3 = "./test/spam/0889.2004-04-19.GP.spam.txt"

# voc, prior, condprob = TrainMultinomialNB(C, D)

# score = ApplyMultinomialNB(C, voc, prior, condprob, doc2)

result, overall = evaluateMultinomialNB(C, D, D_test)
# print(result)
# print("overall accuracy: " + str(overall))


In [911]:
print("----------------------------------------------------------")
print("Multinomial Naive Bayes")
print("----------------------------------------------------------")
print("Words filter:\t\tstop_words")
print("Result: ")
print("  spam: ")
print("\tpositive:\t" + str(result['spam']['positive']) + "\n\tnegative:\t" + str(result['spam']['negative']))
print("\taccuracy: " + "{:.4%}".format(result['spam']['accuracy']))
print("  ham: ")
print("\tpositive:\t" + str(result['ham']['positive']) + "\n\tnegative:\t" + str(result['ham']['negative']))
print("\taccuracy: " + "{:.4%}".format(result['ham']['accuracy']))
print("  overall: ")
print("\taccuracy: " + "{:.4%}".format(overall))
print("----------------------------------------------------------")

----------------------------------------------------------
Multinomial Naive Bayes
----------------------------------------------------------
Words filter:		stop_words
Result: 
  spam: 
	positive:	113
	negative:	17
	accuracy: 86.9231%
  ham: 
	positive:	337
	negative:	11
	accuracy: 96.8391%
  overall: 
	accuracy: 94.1423%
----------------------------------------------------------


<hr>

## 1. Multinomial Naive Bayes
#### ref: http://nlp.stanford.edu/IR-book/pdf/13bayes.pdf

### 1.1 Training the data
The function below implement multinomial Naive Bayes algorithm for text classification. <br><br>
There are three inputs: <br>
<b>C</b>: class tag <br>
<b>D</b>: document path <br>
<b>stop_word_remove</b>: decide whether remove some insignificant terms like punctuations or stop words, there are three possible values: none(default), punctuations, and stop_words <br><br>
And there are three outputs: <br>
<b>vocabulary</b>: read every terms in training dataset with its frequency (number of times appear in the text) <br>
<b>prior</b>: prior probability calculate the probability of each class (number of documents for each class) <br>
<b>condprob</b>: conditional probability calculate the probability of each term's frequency in one class <br>

In [44]:
def TrainMultinomialNB(C, D):
    
    no_remove = ""
    punctuations = list(string.punctuation)
    stop_words = stopwords.words('english')
    stop_words_punc = stopwords.words('english') + list(string.punctuation)
    myFilter = stop_words
    
    
    # create a dictionary to read all documents' full path and make its class as keys
    files = {c: [] for c in C}
    # r: root, d: directories, f: files
    for r, d, f in os.walk(D):
        for file in f:
            for c in C:
                if '.'+c in file:
                    files[c].append(os.path.join(r, file))
    
    
    
    # create a dictionary to read all words from each class
    vocabulary = {}
    for c in C:                                                                           # for each class
        for path in files[c]:                                                             # for each document in the class
            file = open(path, 'r', encoding='utf-8', errors='ignore')                     # ignore some decoding issues (especialy in emails)
            text = ""                                                                     # read into a string: 'text'
#             for line in file:
#                 text = text + line.strip().lower() + " "
            text = file.read().lower().replace('\n', ' ')
            file.close()
            tokens = word_tokenize(text)
            filtered_keys = [i for i in word_tokenize(text) if i not in myFilter]         #

            for k in filtered_keys:                                                       # apply to dictionary
                if k in vocabulary:
                    if c in vocabulary[k]:
                        vocabulary[k][c] += 1
                    else:
                        vocabulary[k][c] = 1
                else:
                    vocabulary[k] = {c: 1}
    # regular vocabulary dict by adding 0 value
    for k in vocabulary:
        for c in C:
            if c not in vocabulary[k]:
                vocabulary[k][c] = 0
    
    
    
    # prior probability of each class
    prior = {}
    totalFiles = 0;
    for c in C:                                                                           # calculate total number of documents
        totalFiles += len(files[c])
    for c in C:
        prior[c] = len(files[c]) / totalFiles
    
    
    
    # calculate prabability of each word/term
    condprob = copy.deepcopy(vocabulary)
    denominator = {}
    for c in C:                                                                           # calculate total number of words/terms
        denominator[c] = 0
        for k in vocabulary:
            denominator[c] += vocabulary[k][c] + 1                                        # apply laplace smoothing by add 1 to each count
    for c in C:
        for k in vocabulary:
            condprob[k][c] = (vocabulary[k][c] + 1) / denominator[c]
            
        
    return vocabulary, prior, condprob

### 1.2 Testing the data

In [45]:
def ApplyMultinomialNB(C, V, prior, condprob, d):
    
    no_remove = ""
    punctuations = list(string.punctuation)
    stop_words = stopwords.words('english')
    stop_words_punc = stopwords.words('english') + list(string.punctuation)
    myFilter = stop_words

    score = {c: math.log(prior[c]) for c in C}
    
    file = open(d, 'r', encoding='utf-8', errors='ignore')
    text = ""
#     for line in file:
#         text = text + line.strip().lower() + " "
    text = file.read().lower().replace('\n', ' ')
    tokens = word_tokenize(text)
    filtered_keys = [i for i in word_tokenize(text) if i not in myFilter]
    
    for c in C:
        for k in filtered_keys:
            if k in V: 
                score[c] += math.log(condprob[k][c])
    
    return score

In [46]:
def evaluateMultinomialNB(C, D, D_test):
    
    voc, prior, condprob = TrainMultinomialNB(C, D)
    
    
    # create a dictionary to read all documents' full path and make its class as keys
    files = {c: [] for c in C}
    # r: root, d: directories, f: files
    for r, d, f in os.walk(D_test):
        for file in f:
            for c in C:
                if '.'+c in file:
                    files[c].append(os.path.join(r, file))
    
    result = {c: {'positive': 0, 'negative': 0, 'accuracy': 0} for c in C}
    for c in C:
        for f in files[c]:
            score = ApplyMultinomialNB(C, voc, prior, condprob, f)
            if score[c] == max(score.values()):
                result[c]['positive'] += 1
            else:
                result[c]['negative'] += 1

    pos = 0
    neg = 0
    for c in C:
        result[c]['accuracy'] = result[c]['positive'] / (result[c]['positive'] + result[c]['negative'])
        pos += result[c]['positive']
        neg += result[c]['negative']
    
    overall = pos / (pos + neg)
    
    return result, overall

<hr>

## 2. MCAP Logistic Regression

In [52]:
def TrainMCAPLogisticRegression(C, D):
    
    # params for filter out stop words
    no_remove = ""
    punctuations = list(string.punctuation)
    stop_words = stopwords.words('english')
    stop_words_punc = stopwords.words('english') + list(string.punctuation)
    
    myFilter = stop_words
    
    # threshold for gradient ascent
    threshold = 50
    
    
    
    # create a dictionary to read all documents' full path
    files = []
    for r, d, f in os.walk(D):
        for file in f:
            if '.txt' in file:
                files.append(os.path.join(r, file))
                
    
    
    # create a dictionary to read all distinct words from training set
    idx = 0
    vocabulary = {}
    for path in files:
        file = open(path, 'r', encoding='utf-8', errors='ignore')
        text = ""
#         for line in file:
#             text = text + line.strip().lower() + " "
        text = file.read().lower().replace('\n', ' ')
        file.close()
        tokens = word_tokenize(text)
        filtered_keys = [i for i in word_tokenize(text) if i not in myFilter]
        for k in filtered_keys:
            if k not in vocabulary:
                vocabulary[k] = idx
                idx += 1
    
    
    
    # now there are:
    # len(vocabulary) == idx distinct words  (size of array X)
    # len(files) documents                   (number of array X)
    # idx+1 weights                          (size of array w)
    # so we generate matrix X, class y, and vector w
    
    w = np.ones(idx+1)
    X = np.zeros(shape=(len(files), idx))
    y = np.zeros(len(files))
    
    # read all files and update X and y
    idx = 0
    for path in files:
        file = open(path, 'r', encoding='utf-8', errors='ignore')
        text = ""
#         for line in file:
#             text = text + line.strip().lower() + " "
        text = file.read().lower().replace('\n', ' ')
        file.close()
        tokens = word_tokenize(text)
        filtered_keys = [i for i in word_tokenize(text) if i not in myFilter]
        # update X
        for k in filtered_keys:
            X[idx][vocabulary[k]] += 1
        # update y only if spam
        if '.spam' in path:                            # y = 1 for spam email
            y[idx] = 1
        idx += 1
    
    
    
    # now we get w, X, and y
    # implement a function for calculate P from w and X[i]
    # set η and λ
    η = 0.007
    λ = 0.005
    
    # when n > 36, exp(36) / (1 + exp(36)) = 1.0
    
    w_prev = w
    trend = copy.deepcopy(w)
    for i in range(threshold):
        # ease the final function
        exp = w[0] + np.dot(X, w[1:])
        exp = np.clip(exp,-36,36)
        numerator = np.exp( exp )                                   # for y predict
        denominator = 1 + numerator                                 #
        y_pred = np.true_divide(numerator, denominator)          # y predict
        y_diff = y - y_pred                                         # y diff
        func = np.transpose(np.transpose(X) * y_diff)               # sum function
        func = func.sum(axis=0)
        
#         w[0] = w[0] - η * λ * w[0]
        w[1:] = w[1:] + η * func - η * λ * w[1:]                    # final function
#         if sum(abs(w_prev[1:] - w[1:])) < 1e-6:
#             break
#         w_prev = w
        trend = np.vstack((trend,copy.deepcopy(w)))
#         w_prev = w
        
    
    return vocabulary, w, trend

In [49]:
def ApplyMCAPLogisticRegression(C, V, w, d):
    
    # params for filter out stop words
    no_remove = ""
    punctuations = list(string.punctuation)
    stop_words = stopwords.words('english')
    stop_words_punc = stopwords.words('english') + list(string.punctuation)
    myFilter = stop_words
    
    
    x = np.zeros(len(w)-1)
    y = 1 if '.spam' in d else 0
    
    
    file = open(d, 'r', encoding='utf-8', errors='ignore')
    text = ""
#         for line in file:
#             text = text + line.strip().lower() + " "
    text = file.read().lower().replace('\n', ' ')
    file.close()
    tokens = word_tokenize(text)
    filtered_keys = [i for i in word_tokenize(text) if i not in myFilter]
    for k in filtered_keys:
        if k in V:
            x[V[k]] += 1
    
    exp = w[0] + np.dot(w[1:], x)
    exp = np.clip(exp,-36,36)
    numerator = np.exp(exp)
    denominator = 1 + numerator
    y_pred = numerator / denominator
    
    return y_pred

In [50]:
def EvaluateMCAPLogisticRegression(C, D, D_test):
    
    voc, w, trend= TrainMCAPLogisticRegression(C, D)
    
    # create a dictionary to read all documents' full path
    files = []
    for r, d, f in os.walk(D):
        for file in f:
            if '.txt' in file:
                files.append(os.path.join(r, file))
    
    
    result = {c: {'positive': 0, 'negative': 0, 'accuracy': 0} for c in C}
    
    for file in files:
        y_pred = ApplyMCAPLogisticRegression(C, voc, w, file)
        y = 1 if '.spam' in file else 0
        if y == 1:
            if y_pred > 0.5:
                result['spam']['positive'] += 1
            else:
                result['spam']['negative'] += 1
        else:
            if y_pred < 0.5:
                result['ham']['positive'] += 1
            else:
                result['ham']['negative'] += 1
    
    pos = 0
    neg = 0
    for c in C:
        result[c]['accuracy'] = result[c]['positive'] / (result[c]['positive'] + result[c]['negative'])
        pos += result[c]['positive']
        neg += result[c]['negative']
    
    overall = pos / (pos + neg)
    
    
    return result, overall;

In [53]:
C = ["spam", "ham"]
D = "./train/"
D_test = "./test/"
doc1 = "./test/spam/0073.2003-12-24.GP.spam.txt"
doc2 = "./test/ham/0003.1999-12-14.farmer.ham.txt"
doc3 = "./test/spam/0889.2004-04-19.GP.spam.txt"
doc4 = "./test/ham/0020.1999-12-15.farmer.ham.txt"


voc, w, trend= TrainMCAPLogisticRegression(C, D)

# y1 = ApplyMCAPLogisticRegression(C, voc, w, doc1)
# y2 = ApplyMCAPLogisticRegression(C, voc, w, doc2)
# y3 = ApplyMCAPLogisticRegression(C, voc, w, doc3)
# y4 = ApplyMCAPLogisticRegression(C, voc, w, doc4)

result, overall = EvaluateMCAPLogisticRegression(C, D, D_test)

# print(trend[:, 1:5])

NameError: name 'numpy' is not defined

In [890]:
print("----------------------------------------------------------")
print("MCAP Logistic Regression")
print("----------------------------------------------------------")
print("No. of iterations:\t50")
print("Words filter:\t\tstop_words")
print("Result: ")
print("  spam: ")
print("\tpositive:\t" + str(result['spam']['positive']) + "\n\tnegative:\t" + str(result['spam']['negative']))
print("\taccuracy: " + "{:.4%}".format(result['spam']['accuracy']))
print("  ham: ")
print("\tpositive:\t" + str(result['ham']['positive']) + "\n\tnegative:\t" + str(result['ham']['negative']))
print("\taccuracy: " + "{:.4%}".format(result['ham']['accuracy']))
print("  overall: ")
print("\taccuracy: " + "{:.4%}".format(overall))
print("----------------------------------------------------------")

----------------------------------------------------------
MCAP Logistic Regression
----------------------------------------------------------
No. of iterations:	50
Words filter:		stop_words
Result: 
  spam: 
	positive:	113
	negative:	10
	accuracy: 91.8699%
  ham: 
	positive:	334
	negative:	6
	accuracy: 98.2353%
  overall: 
	accuracy: 96.5443%
----------------------------------------------------------


In [38]:
path = "./test/spam/0073.2003-12-24.GP.spam.txt"
file = open(path, 'r', encoding='utf-8', errors='ignore')
text = ""
for line in file:
    text = text + line.strip().lower() + " "
file.close()
tokens = word_tokenize(text)
len(tokens)
# tokens[111:120]

189

In [37]:
path = "./test/spam/0073.2003-12-24.GP.spam.txt"
file = open(path, 'r', encoding='utf-8', errors='ignore')
text = ""
# for line in file:
#     text = text + line.strip().lower() + " "
text = file.read().lower().replace('\n', ' ')
file.close()
tokens = word_tokenize(text)
len(tokens)
# tokens[111:120]

189

In [None]:
# TrainMultinomialNB(C, D)
# V <- ExtractVocabularty(D)
# N <- CountDoct(D)
# for each c in C
# do  N_c <- CountDocsInClass(D, c)
#     prior[c] <- N_c/N
#     text_c <- ConcatenateTextOfAllDocsInClass(D, c)
#     for each t in V
#     do  T_ct <- CountTokensOfTerm(text_c, t)
#     for each t in V
#     do  condprob[t][c] <- (T_ct + 1) / SUM_t^(T_ct^ + 1)
# return V, prior, condprob

In [None]:
#     vocabulary = {c: {} for c in C}
#     for c in C:                                                                         # for each class
#         for path in files[c]:                                                           # for each document in the class
#             file = open(path, 'r', encoding="utf-8", errors="ignore")                   # ignore some decoding issues (especialy for emails)
#             text = ""                                                                   # read into a string 'text'
#             for line in file:
#                 text = text + line.strip().lower() + " "
#             tokens = word_tokenize(text)
#             filtered_keys = [i for i in word_tokenize(text) if i not in punctuations]   # remove only punctuations
#             for k in filtered_keys:                                                     # apply to dictionary
#                 if k in vocabulary[c]:
#                     vocabulary[c][k] += 1
#                 else:
#                     vocabulary[c][k] = 1