# DMG2 Assignment : Problem 6

_Naive Bayes Text Classifier_

Number of classes : 20

In each class, there are a number of documents, each one corresponding to a date. The test-train split will be based on the date. 

**Preprocessing in each document :**
* Keep only From, Subject, Host, Organization, Data
* Remove special characters, stop words
* Stem the words
* There are numbers in the data, as addresses, phone numbers, currency, etc. Should they be removed?

In [40]:
import os,re
import pandas as pd
import numpy as np
import nltk,unicodedata
import operator,math

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

## Reading Files

In [2]:
DATA_DIR = '/home/jishnu/Documents/ISB/Term3/dmg2/assignments/hw_assignment1/dmg2/datasets/20_newsgroups'

In [3]:
labels,files_list = [],[]
for root, dirs, files in os.walk(DATA_DIR):
    for file in files:
        labels.append(re.sub(r'/home/jishnu/Documents/ISB/Term3/dmg2/assignments/hw_assignment1/dmg2/datasets/20_newsgroups/','',root))
        files_list.append(os.path.join(root,file))

In [4]:
files_df = pd.DataFrame({'filename':files_list, 'label' : labels})
list(files_df['label'].unique())

['comp.graphics',
 'talk.religion.misc',
 'rec.sport.baseball',
 'comp.sys.ibm.pc.hardware',
 'rec.motorcycles',
 'talk.politics.guns',
 'misc.forsale',
 'alt.atheism',
 'talk.politics.misc',
 'talk.politics.mideast',
 'rec.autos',
 'comp.windows.x',
 'sci.crypt',
 'sci.electronics',
 'soc.religion.christian',
 'rec.sport.hockey',
 'comp.os.ms-windows.misc',
 'sci.space',
 'comp.sys.mac.hardware',
 'sci.med']

## Train - Test Split

For each class, splitting the documents to training and test based on a 70-30 rule.

In [5]:
train = pd.DataFrame(columns=['filename','label'])
test = pd.DataFrame(columns=['filename','label'])

for label in list(files_df['label'].unique()):
    threshold = files_df.loc[files_df['label'] == label].shape[0] * 0.7
    threshold = int(np.floor(threshold))
    train = train.append(files_df.loc[files_df['label'] == label].iloc[:threshold,:],ignore_index=True)
    test = test.append(files_df.loc[files_df['label'] == label].iloc[threshold:,:],ignore_index=True)

print(train.shape[0],test.shape[0])

13997 6000


In [6]:
#list(files_df.loc[files_df['label'] == 'alt.atheism']['filename'])

In [7]:
#with open(files_list[0],'r') as filein:
#    data = filein.read()
#    data = re.sub(r'^Xref:.*\n','\n',data)
#    data = re.sub(r'(^|\n)(?:Path|Newsgroups|Summary|Message-ID|Expires|Followup-To|Distribution|Approved|Supersedes|Lines):.*\n','\n',data)
#    data = re.sub(r'(^|\n)(?:Keywords|Date|Followup-To|Supersedes):.*\n','\n',data)
#    print(data)

In [8]:
#vectorizer = CountVectorizer(input='filename',analyzer='word',stop_words='english',decode_error='ignore')
#vectorizer.fit_transform(list(files_df.loc[files_df['label'] == 'alt.atheism']['filename']))

In [9]:
#vectorizer.vocabulary_

In [10]:
#alt_atheism_dtm = pd.DataFrame(vectorizer.fit_transform(list(files_df.loc[files_df['label'] == 'alt.atheism']['filename'])).todense().T)

In [11]:
#alt_atheism_dtm['count_docs'] = alt_atheism_dtm.sum(axis=1)
#alt_atheism_dtm['word'] = vectorizer.get_feature_names()
#alt_atheism_dtm
#alt_atheism_dtm.sort_values(by='count_docs',ascending=False).iloc[:5000,:]

In [12]:
#from sklearn.feature_extraction.text import CountVectorizer

#corpus = [
#'All my cats cats in a row',
#'Hello, World!']

#vectorizer = CountVectorizer()
#pd.DataFrame(vectorizer.fit_transform(corpus).todense().T,columns = ['doc1','doc2'])
#print( vectorizer.vocabulary_ )
#print(vectorizer.get_feature_names())

## Creating dictionary of 5000 most frequent words in each class

Calculating P(W|C) for each word in each class, by normalizing using Laplace smoothing parameter of 30

In [48]:
# Dictionary to hold vectorizer objects
vect_dict = {}
# Dictionary to hold Document term matrix for each class.
# The document term matrix is converted to a Pandas DataFrame
class_dict = {}
for label in list(train['label'].unique()):
    vect_dict[label] = CountVectorizer(input='filename',analyzer='word',stop_words='english',decode_error='ignore')
    class_dict[label] = pd.DataFrame(vect_dict[label].fit_transform(list(train.loc[train['label'] == label]['filename'])).todense().T)
    class_dict[label]['count_docs'] = class_dict[label].sum(axis=1)
    class_dict[label]['word'] = vect_dict[label].get_feature_names()
    class_dict[label] = class_dict[label].sort_values(by='count_docs',ascending=False).iloc[:5000,:]
    tot_freq = class_dict[label]['count_docs'].sum() + 30
    class_dict[label]['p(w|c)'] =  class_dict[label]['count_docs'] / (tot_freq + (5000 * 30))

In [23]:
for label in list(train['label'].unique()):
    class_dict[label] = pd.Series(class_dict[label]['p(w|c)'].values,index=class_dict[label]['word']).to_dict()

In [45]:
#class_dict['alt.atheism']['xvxvxvx']

In [26]:
#class_dict['comp.graphics']

The words **cmu, edu,com,cs** can be removed for better results

## Calculating Class Priors

In [17]:
class_priors_dict = {}
total_freq = 0
for label in list(files_df['label'].unique()):
    class_priors_dict[label] = files_df.loc[files_df['label'] == label].shape[0]
    total_freq += class_priors_dict[label]
for label in list(files_df['label'].unique()):
    class_priors_dict[label] = np.round(class_priors_dict[label] / total_freq, 4)

In [18]:
class_priors_dict

{'comp.graphics': 0.050000000000000003,
 'talk.religion.misc': 0.050000000000000003,
 'rec.sport.baseball': 0.050000000000000003,
 'comp.sys.ibm.pc.hardware': 0.050000000000000003,
 'rec.motorcycles': 0.050000000000000003,
 'talk.politics.guns': 0.050000000000000003,
 'misc.forsale': 0.050000000000000003,
 'alt.atheism': 0.050000000000000003,
 'talk.politics.misc': 0.050000000000000003,
 'talk.politics.mideast': 0.050000000000000003,
 'rec.autos': 0.050000000000000003,
 'comp.windows.x': 0.050000000000000003,
 'sci.crypt': 0.050000000000000003,
 'sci.electronics': 0.050000000000000003,
 'soc.religion.christian': 0.0499,
 'rec.sport.hockey': 0.050000000000000003,
 'comp.os.ms-windows.misc': 0.050000000000000003,
 'sci.space': 0.050000000000000003,
 'comp.sys.mac.hardware': 0.050000000000000003,
 'sci.med': 0.050000000000000003}

## Calculating Training Accuracy

In [19]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    return words


In [20]:
train_words = []
for train_doc in train['filename']:
    with open(train_doc,'r',encoding='ISO-8859-1') as filein:
        words = nltk.word_tokenize(filein.read())
        words = normalize(words)
        train_words.append(words)
    

In [41]:
train_predicted = pd.DataFrame(columns=['predicted','max_class_posterior_prob'])
for train_words_list in train_words:
    log_posterior_dict = class_priors_dict
    log_posterior_dict = dict([(k,math.log(v)) for (k,v) in log_posterior_dict.items()])
    for word in train_words_list:
        for k,v in log_posterior_dict.items():
            try:
                log_posterior_dict[k] = log_posterior_dict[k] + math.log(class_dict[k][word])
            except:
                pass
    log_posterior_dict = dict([(k,np.exp(v)) for (k,v) in log_posterior_dict.items()])
    train_predicted = train_predicted.append({'predicted':max(log_posterior_dict, key=log_posterior_dict.get),'max_class_posterior_prob':max(log_posterior_dict.values())},ignore_index=True)
    
        

In [42]:
train_predicted['actual'] = train['label']
train_predicted.head()

Unnamed: 0,predicted,max_class_posterior_prob,actual
0,talk.politics.mideast,1.110726e-130,comp.graphics
1,rec.sport.baseball,1.571055e-113,comp.graphics
2,talk.politics.mideast,5.849009e-158,comp.graphics
3,rec.sport.baseball,5.692217e-113,comp.graphics
4,rec.sport.baseball,7.939354999999999e-142,comp.graphics


In [43]:
train_predicted.loc[train_predicted['predicted'] == train_predicted['actual']].shape

(81, 3)

In [44]:
train_predicted.shape

(13997, 3)

In [39]:
81/13997

0.005786954347360149