## Exercise 0 : Preprocessing Text Data

In [44]:
from sklearn.datasets import fetch_20newsgroups
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import numpy as np
import math

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /home/jaisu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Import data

In [35]:
categories = ['sci.med', 'comp.graphics']
train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

#### Preprocess text data

In [36]:
# Initialize vocabulary
vocabulary={}
ind = 0

def preprocess_text(text_data):
    """
    Removes punctuation marks, special characters, lower case text, removes stop words and creates list of words
    Returns: list of words
    """
    text_data = text_data.translate(str.maketrans('', '', string.punctuation))

    text_data = text_data.replace('\n', ' ')

    text_list = text_data.split(' ')
    text_list = list(map(lambda x: x.lower(), text_list))
    
    stop_words = stopwords.words('english')

    text_words = [word for word in text_list if word not in stop_words and word!='']
    
    global ind
    # Insert items in dictionary and put index as the value
    for word in text_words:
        if word not in vocabulary.keys():
            vocabulary[word] = ind
            ind += 1
            
    return text_words

In [37]:
news_items = []
for news in train.data:
    news_items.append(preprocess_text(news))

#### Bag-of-words feature representation 

In [38]:
def bag_of_words(news_item):
    """
    Returns vector representation of the news_item,
    where vector contains frequency of each unique item in vocabulary.
    """
    count_words = {}
    news_vec = np.zeros(len(vocabulary))
    
    for word in news_item:
        if word not in count_words.keys():
            count_words[word] = 1
        else:
            count_words[word] += 1 
    
    for word in news_item:
        news_vec[vocabulary[word]] = count_words[word]
        
    return news_vec 

In [39]:
news_vecs = []
for news in news_items:
    news_vecs.append(bag_of_words(news))

#### TF-IDF feature representation

In [7]:
def idf(news_vecs):
    """
    Returns idf dictionary for the available news corpus
    """
    # For every unique word in vocabulary, stores its IDF value
    idf_vec = {}
    N = len(vocabulary)
    for key, item in vocabulary.items():
        count_docs = 0
        # count news_items which contain the word
        for news in news_vecs:
            if news[item] != 0:
                count_docs += 1
        # Store the IDF value for that word(key)
        idf_vec[key] = math.log10(N/count_docs)
    
    return idf_vec

def tf_idf(news_vecs):
    """
    Returns tf-idf for the news item
    """
    
    idf_vec = idf(news_vecs)
    
    for i, news_item in enumerate(news_vecs):
        indexes = list(news_item.nonzero()[0])
        total_words_in_doc = len(news_items[i])
        for index in indexes:
            key = list(vocabulary.keys())[index]
            # Put tf-idf value for very news_item word
            news_item[index] = (news_item[index]/total_words_in_doc)/idf_vec[key]
        news_vecs[i] = news_item
            
    return

In [8]:
# Driver code for TF-IDF
tf_idf(news_vecs)

#### Splitting data into train, test and valid

In [46]:
X_train, X_test, y_train, y_test = train_test_split(train.data, train.target, test_size=0.2, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

## Exercise 1 : Implementation of Naive Bayes Classifier

In [40]:
class Gaussian_NB:
    def __init__(self):
        pass
                
    def train(self,news,labels):
        """
        Finds prior, likelihood and evidence.
        These values are used in test function for computing posterior probability
        """
        # Computing probabilities of the classes, called prior belief
        labels_0 = [item for item in labels if item==0]
        self.probs_0 = len(labels_0)/len(labels)
        
        labels_1 = [item for item in labels if item==1]
        self.probs_1 = len(labels_1)/len(labels)
        
        ## Separate the news items of each class
        self.news_0 = []
        self.news_1 = []
        
        for i,news_item in enumerate(news):
            if labels[i] == 0:
                self.news_0.append(news_item)
            else:
                self.news_1.append(news_item)
        
        ## Now finding probability of each word given a target class
        ## For class 0
        self.feature_probs_0 = {} ## Will be of length vocabulary, i.e. for each word
        for key, value in vocabulary.items():
            count_news = 0
            total = len(self.news_0)
            for news_item in self.news_0:
                if news_item[value] != 0:
                    count_news += 1
            self.feature_probs_0[key] = count_news/total
            
        
        ## For class 1
        self.feature_probs_1 = {} ## Will be of length vocabulary, i.e. for each word
        for key, value in vocabulary.items():
            count_news = 0
            total = len(self.news_1)
            for news_item in self.news_1:
                if news_item[value] != 0:
                    count_news += 1
            self.feature_probs_1[key] = count_news/total
            
        ## Now finding probabilities for each word
        self.feature_prob = {}
        for key,value in vocabulary.items():
            count = 0
            for news_item in (news):
                if news_item[value] != 0:
                    count += 1
            self.feature_prob[key] = count/len(news)
            
    def test(self, test_news):
        """
        compute posterior probability
        """
        target = []
        for news in test_news:
            prior_0 = 1
            prior_1 = 1
            evidence = 1
            for i, val in enumerate(news):
                key = list(vocabulary.keys())[i]
                if key not in self.feature_prob.keys():
                    continue
                if val != 0:
                    prior_0 *= self.feature_probs_0[key]
                    prior_1 *= self.feature_probs_1[key]
                    evidence *= self.feature_prob[key]

            p_0 = (prior_0 * self.probs_0)/(evidence+1)
            p_1 = (prior_1 * self.probs_1)/(evidence+1)

            target.append(0 if p_0>p_1 else 1)
        
        return target
        

In [41]:
# Driver code for training Naive Bayes

nb = Gaussian_NB()
 
nb.train(news_vecs,train.target)

#### test accuracy below is shown for only 50 test instances (news items)

In [43]:
## Driver code for testing Naive Bayes

test = fetch_20newsgroups(subset = 'test',categories=categories)

test_labels = test.target[:50]

## Preprocess the test_data
test_news_items = []
for news in test.data[:50]:
    test_news_items.append(preprocess_text(news))
    
test_news_vecs = []
for news in test_news_items:
    test_news_vecs.append(bag_of_words(news))

preds = nb.test(test_news_vecs)


test_acc = np.sum(preds==test_labels)/float(test_labels.shape[0]) 

print ("Test Set Accuracy: ",test_acc*100,"%")

Test Set Accuracy:  56.00000000000001 %


#### Naive Bayes test accuracy on dataset size of 100 --

In [50]:
## Driver code for testing Naive Bayes

test = fetch_20newsgroups(subset = 'test',categories=categories)

test_labels = test.target[:100]

## Preprocess the test_data
test_news_items = []
for news in test.data[:100]:
    test_news_items.append(preprocess_text(news))
    
test_news_vecs = []
for news in test_news_items:
    test_news_vecs.append(bag_of_words(news))

preds = nb.test(test_news_vecs)


test_acc = np.sum(preds==test_labels)/float(test_labels.shape[0]) 

print ("Test Set Accuracy: ",test_acc*100,"%")

Test Set Accuracy:  60.0 %
