## Data Preprocessing

### Implement the package needed

In [1]:
import numpy as np
%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import pickle

##Processing dataset *twenty news groups*



### Simplest data processing with no resctriction (Version 1)

In [37]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))


#twenty_train.target_names
#print(len(twenty_train.data))
#print(len(twenty_train.filenames))

#Initialize the model used to transfrom the data from text to vector
#with special condition to increase the accuracy.
vectorizer_v1 = CountVectorizer()
labels_train_twenty = twenty_train.target
labels_test_twenty = twenty_test.target


twenty_train_v1 = vectorizer_v1.fit_transform(twenty_train.data)
twenty_test_v1 = vectorizer_v1.transform(twenty_test.data)


print("shape of labels:" + str(len(labels_train)))
print("shape of vectorizer:" + str(twenty_train_v1.toarray().shape))

shape of labels:11314
shape of vectorizer:(11314, 101631)


### Take care of only stop_words and meaningless combination of number and letters (Version 2)

In [5]:
#Take care of the stop_words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words('english')


# print(sw)

vectorizer_v2 = CountVectorizer(stop_words = sw, token_pattern = '[a-z]+')

twenty_train_v2 = vectorizer_v2.fit_transform(twenty_train.data)
twenty_test_v2 = vectorizer_v2.transform(twenty_test.data)
#print(vectorizer_v2.get_feature_names())
#print(type(twenty_test_v2.toarray()))
print(twenty_train_v2.shape)
print(labels_train.shape)
print(twenty_test_v2.shape)


# print(np.array(twenty_train_v2).shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pjy_t\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(11314, 72790)
(11314,)
(7532, 72790)


### Take care of high frequency words and low frequency words (Version 3)

In [6]:
vectorizer_v3 = CountVectorizer(max_df = 0.5)

twenty_train_v3 = vectorizer_v3.fit_transform(twenty_train.data)
twenty_test_v3 = vectorizer_v3.transform(twenty_test.data)

#print(twenty_train_v3.shape)
# print(vectorizer_v3.get_feature_names())

vectorizer_v4 = CountVectorizer(min_df = 0.001)

twenty_train_v4 = vectorizer_v4.fit_transform(twenty_train.data)
twenty_test_v4 = vectorizer_v4.transform(twenty_test.data)
print(twenty_train_v4.shape)
print(labels_train.shape)
print(twenty_test_v4.shape)


#print(vectorizer_v4.get_feature_names())sdf

(11314, 9349)
(11314,)
(7532, 9349)


### Tfidf data processing on different version of countVectorizer

In [5]:
#Using a new model to eliminate the problem of different weight on words. Using machine instead of manually
tfidf_transformer = TfidfTransformer()

twenty_train_tfidf_v1 = tfidf_transformer.fit_transform(twenty_train_v1)
twenty_test_tfidf_v1 = tfidf_transformer.transform(twenty_test_v1)
#print(twenty_train_tfidf.shape)

twenty_train_tfidf_v2 = tfidf_transformer.fit_transform(twenty_train_v2)
twenty_test_tfidf_v2 = tfidf_transformer.transform(twenty_test_v2)

twenty_train_tfidf_v3 = tfidf_transformer.fit_transform(twenty_train_v3)
twenty_test_tfidf_v3 = tfidf_transformer.transform(twenty_train_v3)

twenty_train_tfidf_v4 = tfidf_transformer.fit_transform(twenty_train_v4)
twenty_test_tfidf_v4 = tfidf_transformer.transform(twenty_train_v4)

## Processing dataset *IMDB review*



In [53]:
def load_imdb_train():
    with open('imdb_train.pickle', 'rb') as handle:
        result = pickle.load(handle)
    return result

def load_imdb_test():
    with open('imdb_test.pickle', 'rb') as handle:
        result = pickle.load(handle)
    return result

In [54]:
#Please download data from http://ai.stanford.edu/~amaas/data/sentiment/ and upload to collab and put under folder "content"

#Uncompress the aclimdb)v1.tar.gz
import os
# os.system('tar zxvf %s' % 'aclImdb_v1.tar.gz' )

#-----------------------------------------------------------------------
#Import data into the code
# from sklearn.datasets import load_files
# imdb_train = load_files('aclImdb/train')
# imdb_test = load_files('aclImdb/test')
#-----------------------------------------------------------------------
# with open('imdb_train.pickle', 'wb') as handle:
#     pickle.dump(imdb_train ,handle, protocol = pickle.HIGHEST_PROTOCOL)
    
# with open('imdb_test.pickle', 'wb') as handle:
#     pickle.dump(imdb_test, handle, protocol = pickle.HIGHEST_PROTOCOL)
#-----------------------------------------------------------------------

# OS takes a long time to load files from disk, therefore we saved the imdb_train and imdb_test set using pickle
imdb_train = load_imdb_train()
imdb_test = load_imdb_test()
labels_train_imdb = imdb_train.target
labels_test_imdb = imdb_test.target

### Simplest data processing with no restriction (Version 1)

In [41]:
#Train the data using the same special vectorize as above
imdb_train_v1 = vectorizer_v1.fit_transform(imdb_train.data)
imdb_test_v1 = vectorizer_v1.transform(imdb_test.data)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 2987)	1
  (0, 3598)	1
  (0, 5618)	3
  (0, 7054)	2
  (0, 7611)	2
  (0, 11008)	1
  (0, 11043)	2
  (0, 18502)	1
  (0, 19732)	1
  (0, 22001)	1
  (0, 23825)	1
  (0, 30916)	1
  (0, 32350)	1
  (0, 41152)	1
  (0, 43231)	1
  (0, 43297)	1
  (0, 46381)	1
  (0, 46419)	1
  (0, 46735)	2
  (0, 49567)	2
  (0, 50002)	2
  (0, 50538)	1
  (0, 51206)	1
  (0, 53345)	1
  (0, 54396)	2
  :	:
  (24999, 110234)	1
  (24999, 110272)	1
  (24999, 110367)	1
  (24999, 110852)	1
  (24999, 110866)	1
  (24999, 111180)	5
  (24999, 111517)	3
  (24999, 113193)	1
  (24999, 113331)	1
  (24999, 113472)	1
  (24999, 113602)	1
  (24999, 115332)	1
  (24999, 116398)	1
  (24999, 117102)	1
  (24999, 117371)	1
  (24999, 118219)	1
  (24999, 119561)	3
  (24999, 119773)	1
  (24999, 120192)	1
  (24999, 120388)	1
  (24999, 121904)	1
  (24999, 121928)	1
  (24999, 122910)	1
  (24999, 123190)	3
  (24999, 123257)	1


### Take care of only stop_words and meaningless combination of number and letters (Version 2)

In [55]:
imdb_train_v2 = vectorizer_v2.fit_transform(imdb_train.data)
imdb_test_v2 = vectorizer_v2.transform(imdb_test.data)

### Take care of high frequency words and low frequency words

In [17]:
imdb_train_v3 = vectorizer_v3.fit_transform(imdb_train.data)
imdb_test_v3 = vectorizer_v3.transform(imdb_test.data)

imdb_train_v4 = vectorizer_v4.fit_transform(imdb_train.data)
imdb_test_v4 = vectorizer_v4.transform(imdb_test.data)

In [75]:
vectorizer_v5 = CountVectorizer(stop_words = sw, token_pattern = '[a-z]+', min_df = 0.001)
imdb_train_v5 = vectorizer_v5.fit_transform(imdb_train.data)
imdb_test_v5 = vectorizer_v5.transform(imdb_test.data)


In [76]:
print(imdb_train_v5.shape)
print(imdb_test_v5.shape)
print(imdb_train_v2.shape)
print(twenty_train_v2.shape)
print(twenty_test_v2.shape)

(75000, 10001)
(25000, 10001)
(75000, 120682)
(11314, 72790)
(7532, 72790)


### Tfidf data processing on different version of countVectorizer

# Task 2: Implement Naive Bayes and k-fold cross validation

## Fitting the data
Below, first we implement the `fit` function that learns the model parameters. We use Laplace smoothing for the class prior using $\alpha=\beta=1$.

In [12]:
# class MultinomialNaiveBayes:
#     def __init__(self):
#         return

#     def fit(self, x, y, sigma=1):
#         N, D = x.shape                          # get the shape from input data
#         C = np.max(y) + 1                       # number of class(y)
#         prob_table = np.zeros((C,D))            # table with occurance of word given class c. 
#                                                 # It goes from y = 1, ... , y = C
#         # From here on the implementation will be different, since we're dealing with multinomial dist.
#         Nc = np.zeros(C)
#         for c in range(C):
#             x_c = x[y == c]                     # slice all elements with a given label y
#             Nc[c] = x_c.shape[0]                # number of elements of class c
#             # calculates probability table with smoothing, default: alpha = beta = 1
#             prob_table[c,:] = (sigma + np.sum(x_c, axis = 0))  / (sigma*C  + np.sum(x_c))
        
#         # Prior believe with smoothing (alpha_y, beta_y)
#         self.pi = (Nc + sigma)/(np.sum(Nc) + sigma*C)
#         self.prob_table = prob_table
#         return self

        
#     def logsumexp(self, Z):                                                # dimension C x N
#         Zmax = np.max(Z,axis=0)                             # max over C
#         log_sum_exp = Zmax + np.log(np.sum(np.exp(Z - Zmax), axis=0))
#         return log_sum_exp

    
#     def predict(self, input):
#         # The input is a numpy array of shape (C, D), it's acquired by fitting the test set
#         # on the vectorizer for training.

#         C, D = self.prob_table.shape            # obtain dimension of probability matrix
#         A = input.shape[0]                      # size of test set
#         # for numerical stability we work in the log domain
#         # we add a dimension because this is added to the log-likelihood matrix 
#         # that assigns a likelihood for each class (C) to each test point, and so it is C x N
#         log_prior = np.log(self.pi)
# #         print("prior = " + str(self.pi))
# #         print(log_prior.shape)
#         result = np.zeros((A, C))   # A x C
#         print(result.shape)
#         for a in range (A):
#             log_prob = np.zeros(C)
#             if a % 1000 == 0:
#                 print(f"Currently processing instance {a}")
#             # loop over all class labels 
#             for c in range(C):
#                 # show progress since it takes a long tim
#                 temp_c = self.prob_table[c]
#                 exist = temp_c[input[a] > 0]
#                 notExist = temp_c[input[a] == 0]
#                 log_prob[c] = np.sum(np.log(exist))+np.sum(np.log(1-notExist)) 
#                 log_posterior = log_prior + log_prob
# #                 print("log_prior.shape: " + str(log_prior.shape))
# #                 print("log_prob.shape: " + str(log_prob.shape))
# #                 print("dimension of log_posterior: " + str(log_posterior.shape))
#                 # normalization involving logsumexp trick
#                 result[a,:] = np.exp(log_posterior - self.logsumexp(log_posterior))
#         return result
        

## Multinomial model for Naive Bayes

In [33]:
class MultinomialNaiveBayes:
    def __init__(self):
        return

    def fit(self, x, y, sigma=1):
        N, D = x.shape                          # get the shape from input data
        C = np.max(y) + 1                       # number of class(y)
        prob_table = np.zeros((C,D))            # table with occurance of word given class c. 
                                                # It goes from y = 1, ... , y = C
        # From here on the implementation will be different, since we're dealing with multinomial dist.
        Nc = np.zeros(C)
        for c in range(C):
            x_c = x[y == c]                     # slice all elements with a given label y
            Nc[c] = x_c.shape[0]                # number of elements of class c
            # calculates probability table with smoothing, default: alpha = beta = 1
            prob_table[c,:] = (sigma + np.sum(x_c, axis = 0))  / (sigma*C  + np.sum(x_c))
        
        # Prior believe with smoothing (alpha_y, beta_y)
        self.pi = (Nc + sigma)/(np.sum(Nc) + sigma*C)
        self.prob_table = prob_table
        return self

        
    def logsumexp(self, Z):                                                # dimension C x N
        Zmax = np.max(Z,axis=0)                             # max over C
        log_sum_exp = Zmax + np.log(np.sum(np.exp(Z - Zmax), axis=0))
        return log_sum_exp

    
    def predict(self, input):
        # The input is a numpy array of shape (C, D), it's acquired by fitting the test set
        # on the vectorizer for training.
        # Sigma being the normalization constant.

        C, D = self.prob_table.shape            # obtain dimension of probability matrix
        A = input.shape[0]                      # size of test set
        # for numerical stability we work in the log domain
        # we add a dimension because this is added to the log-likelihood matrix 
        # that assigns a likelihood for each class (C) to each test point, and so it is C x N
        log_prior = np.log(self.pi)
#         print("prior = " + str(self.pi))
#         print(log_prior.shape)
        result = np.zeros((A, C))   # A x C
        print(result.shape)
        for a in range (A):
            log_prob = np.zeros(C)
            if a % 1000 == 0:
                print(f"Currently processing instance {a}")
            # loop over all class labels 
            for c in range(C):
                # show progress since it takes a long tim
                temp_c = self.prob_table[c]
                exist = temp_c[input[a] > 0]
                notExist = temp_c[input[a] == 0]
                log_prob[c] = np.sum(np.log(exist))+np.sum(np.log(1-notExist)) 
                log_posterior = log_prior + log_prob
#                 print("log_prior.shape: " + str(log_prior.shape))
#                 print("log_prob.shape: " + str(log_prob.shape))
#                 print("dimension of log_posterior: " + str(log_posterior.shape))
                # normalization involving logsumexp trick
                result[a,:] = np.exp(log_posterior - self.logsumexp(log_posterior))
        return result
        

### Evaluate accuracy

In [34]:
def evaluate_acc(y, y_hat):
    accuracy = np.sum(y == y_hat) / y.shape[0]
    return accuracy

## Experiment

In [47]:
model1 = MultinomialNaiveBayes()
model1.fit(twenty_train_v4.toarray(), labels_train_twenty, 1)
y_prob = model1.predict(twenty_test_v4.toarray())
# check_sum =  np.sum(y_prob,axis = 1)
# print(y_prob.shape)
# print(y_prob)
# print(check_sum.T)
y_pred = np.argmax(y_prob, axis = 1)
# print(y_pred.shape)
model1.accuracy = evaluate_acc(y = labels_test, y_hat = y_pred)
print(f'test accuracy: {model1.accuracy}')

(7532, 20)
Currently processing instance 0
Currently processing instance 1000
Currently processing instance 2000
Currently processing instance 3000
Currently processing instance 4000
Currently processing instance 5000
Currently processing instance 6000
Currently processing instance 7000
test accuracy: 0.582979288369623


In [36]:
model2 = MultinomialNaiveBayes()
model2.fit(twenty_train_v2.toarray(), labels_train_twenty,1)
y_prob = model2.predict(twenty_test_v2.toarray())
# check_sum =  np.sum(y_prob,axis = 1)
# print(y_prob.shape)
# print(y_prob)
# print(check_sum.T)
y_pred = np.argmax(y_prob, axis = 1)
# print(y_pred.shape)


accuracy = evaluate_acc(y = labels_test, y_hat = y_pred)
print(f'test accuracy: {accuracy}')

(20, 9349)
(7532, 20)
Currently processing instance 0
Currently processing instance 1000
Currently processing instance 2000
Currently processing instance 3000
Currently processing instance 4000
Currently processing instance 5000
Currently processing instance 6000
Currently processing instance 7000
test accuracy: 0.6018321826872013


In [72]:
model_imdb_v5 = MultinomialNaiveBayes()
model_imdb_v5.fit(imdb_train_v5.toarray(), labels_train_imdb)
y_prob = model_imdb_v5.predict(imdb_test_v5.toarray())
# check_sum =  np.sum(y_prob,axis = 1)
# print(y_prob.shape)
# print(y_prob)
# print(check_sum.T)
y_pred = np.argmax(y_prob, axis = 1)
# print(y_pred.shape)
accuracy = evaluate_acc(y = labels_test_imdb, y_hat = y_pred)
print(f'test accuracy: {accuracy}')

(25000, 3)
Currently processing instance 0
Currently processing instance 1000
Currently processing instance 2000
Currently processing instance 3000
Currently processing instance 4000
Currently processing instance 5000
Currently processing instance 6000
Currently processing instance 7000
Currently processing instance 8000
Currently processing instance 9000
Currently processing instance 10000
Currently processing instance 11000
Currently processing instance 12000
Currently processing instance 13000
Currently processing instance 14000
Currently processing instance 15000
Currently processing instance 16000
Currently processing instance 17000
Currently processing instance 18000
Currently processing instance 19000
Currently processing instance 20000
Currently processing instance 21000
Currently processing instance 22000
Currently processing instance 23000
Currently processing instance 24000
test accuracy: 0.52144


In [74]:
clf = MultinomialNB()
clf.fit(imdb_train_v5, labels_train_imdb)
y_pred_sklearn = clf.predict(imdb_test_v5)
accuracy_sklearn = evaluate_acc(y = labels_test_imdb, y_hat = y_pred_sklearn)
print(accuracy_sklearn)


0.54152


# Task 3: Run experiments and comapre with logistic regression

## Building logistic regression model

In [None]:
#Testing the accuracy
from sklearn.linear_model import LogisticRegression
twenty_train_LR = LogisticRegression(random_state=0,max_iter=200).fit(twenty_train_v2, labels_train)
label_predict = twenty_train_LR.predict(twenty_test_v2)
label_predict.score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AttributeError: ignored

### Use sklearn multinomialNB() to veriify that the model is implemented correctly

In [74]:
clf2 = MultinomialNB()
clf2.fit(twenty_train_v2, labels_train)
y_pred_sklearn = clf2.predict(twenty_test_v2)
accuracy_sklearn = evaluate_acc(y = labels_test, y_hat = y_pred_sklearn)
print(accuracy_sklearn)

0.6322357939458311


In [40]:
type(y_pred)

numpy.ndarray