In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
#Importing libraries
import numpy as np
import pandas as pd
from zipfile import ZipFile
import os
import nltk
import operator
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from random import shuffle
import string
nltk.download('stopwords')
nltk.download('wordnet')

lema = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
#Load the file
file = "/content/drive/MyDrive/20_newsgroups.zip" 
#Defining labels
label = ['comp.graphics', 'rec.sport.hockey', 'sci.med', 'sci.space', 'talk.politics.misc']  
# opening the zip file in read mode
with ZipFile(file, 'r') as zip:
    zip.printdir()
    zip.extractall()

In [5]:
classes = []
file_name="20_newsgroups"
folders = os.listdir(file_name)
file_list = []
count = 0

for l in label:
  for root, _, files in os.walk(str(os.getcwd())+'/'+file_name+'/'+str(l)):
      for file in files:
          file_path = os.path.join(root, file)
          file_list.append(file_path)
          classes.append(l)
        
len(file_list),len(classes)

(5000, 5000)

In [8]:
ln = len(string.punctuation)
#Cleaning the data
def pre_process(content):
  #Convert the text to lower case
  content = content.lower()
  #Remove punctuation marks from tokens
  content = content.translate(str.maketrans(string.punctuation, " "*ln,''))
  #Perform word tokenization
  ctokens = word_tokenize(content)
  #Remove stopwords from tokens and do lemmatization
  #Checking length, if length = 1
  ctokens = [lema.lemmatize(s) for s in ctokens if s not in stopwords.words('english') and s.isalpha and len(s)>1]
  return ctokens

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
#performing preprocessing and saving the data
docs = []
# word_list={}
for path in file_list:
  file = open(path, 'r', encoding='cp1250')
  text = file.read().strip()
  x=pre_process(text)
  file.close()
  docs.append(x)


'docs_pd = pd.DataFrame([docs,classes]).T\nprint(docs_pd)\ndocs_pd[0] = pre_process(docs_pd[0])\ndocs_pd.to_csv("docs_pd.csv")\ndocs_pd.to_pickle("docs_pd")\ndocs_pd'

In [13]:
#Converting the dataframes to csv 
docs_pd = pd.DataFrame([docs,classes]).T
docs_pd.to_pickle("docs_pd")
docs_pd.to_csv("docs_pd.csv")
docs_pd

Unnamed: 0,0,1
0,"[xref, cantaloupe, srv, c, cmu, edu, comp, gra...",comp.graphics
1,"[xref, cantaloupe, srv, c, cmu, edu, comp, gra...",comp.graphics
2,"[newsgroups, comp, graphic, path, cantaloupe, ...",comp.graphics
3,"[newsgroups, comp, graphic, path, cantaloupe, ...",comp.graphics
4,"[xref, cantaloupe, srv, c, cmu, edu, comp, gra...",comp.graphics
...,...,...
4995,"[xref, cantaloupe, srv, c, cmu, edu, talk, pol...",talk.politics.misc
4996,"[xref, cantaloupe, srv, c, cmu, edu, talk, pol...",talk.politics.misc
4997,"[xref, cantaloupe, srv, c, cmu, edu, alt, fan,...",talk.politics.misc
4998,"[xref, cantaloupe, srv, c, cmu, edu, sci, skep...",talk.politics.misc


In [14]:
docs_pd=pd.read_pickle('/content/docs_pd')

# **Naive Bayes Model with TF-ICF**

In [23]:
#Class for implementing Naive Bayes algorithm with TF-ICF
class NaiveBayes_tf_icf:

  #Function to predict
  def predict(self,X_test):
    predc = []
    for i in range(len(X_test)):
        classes_words_probability = []
        for l in label:
            words_probability = 0
            for word in X_test[i]:
                fr, cn = self._word_freq(word, l)
                pp = (fr+1) /(cn+len(self._unique_words))
                words_probability += np.log(pp)
            classes_words_probability.append(words_probability)
        predc.append(label[np.argmax(classes_words_probability)])
    return predc

  #Function to compute confusion matrix
  def confusion_matrix(self, ypred, ytest):
    matrix= np.zeros((len(label), len(label))).astype(int)
    for i in range(len(ypred)):
        matrix[label.index(ypred[i])][label.index(ytest[i])]+= 1
    return matrix
  

  #Function to compute accuracy
  def calculate_accuracy(self, ypred, ytest):
    return len([1 for i in range(len(ypred)) if ypred[i] == ytest[i]])/len(ypred)


  #Function to compute word frequency
  def _word_freq(self, word, label):
    try:
        return self._word_freq_per_class[label, word], self._number_words_perclass[label]
    except:
        return 0, self._number_words_perclass[label]

  #Calculate tf-icf
  def _calculate_tf_icf(self):
    self._tf_icf = {}
    c = Counter(self._word_list)
    for i in set(self._word_list):
      tf = c[i]
      icf = np.log(len(self._m_dict)/self._class_word[i]+1)
      self._tf_icf[i] = tf*icf

  #Function to fit the data
  def fit(self,X_train,y_train, k):
    words = X_train
    self._N = len(words)
    classes = y_train
    self._m_dict = {}
    for i in range(self._N):
      if classes[i] in self._m_dict.keys():
          self._m_dict[classes[i]] = self._m_dict[classes[i]] + words[i]
      else:
          self._m_dict[classes[i]] = words[i]

    #Listing words containing multiple occurence of same word
    self._word_list = []
    for i in self._m_dict:
        self._word_list = self._word_list + self._m_dict[i]

    #Count of word per class
    self._class_word = {}
    for i in self._m_dict:
      l=self._m_dict[i]
      for j in set(l):
        if j not in self._class_word.keys():
          self._class_word[j] = 1
        else:
          self._class_word[j] += 1
    self._calculate_tf_icf()
    sorted_x = sorted(self._tf_icf.items(), key = operator.itemgetter(1), reverse=True)


    #considering top k features 
    self._unique_words = [i[0] for i in sorted_x[:int(len(sorted_x)*k)]]  
    self._word_freq_per_class = {}
    self._number_words_perclass = {}
    for i in label:
        freq_list= Counter(self._m_dict[i])
        for j in self._unique_words:
            self._word_freq_per_class[i,j] = freq_list[j]
            if i in self._number_words_perclass.keys():
                self._number_words_perclass[i] = self._number_words_perclass[i] +freq_list[j]
            else:
                self._number_words_perclass[i] = freq_list[j]
    self._freq_train = {}
    for i in y_train:
      if i not in self._freq_train.keys():
        self._freq_train[i] = 1
      else:
        self._freq_train[i] += 1

In [24]:
#50:50, 70:30, and 80:20 training and testing split ratios
ratio = [0.5,0.7,0.8]
naive_tf_dict = []

for i in range(3):
  train = docs_pd.sample(frac=ratio[i],random_state=42)
  xtrain, ytrain = train[0].tolist(),train[1].tolist()
  test = docs_pd.sample(frac=1,random_state=42).drop(train.index)

  xtest,ytest = test[0].tolist(),test[1].tolist()
  nb = NaiveBayes_tf_icf()
  #Fitting the xtrain and ytrain
  #Taking k as 500
  k = 500
  nb.fit(xtrain, ytrain, k)
  ypred = nb.predict(xtest)

  r = int(ratio[i]*100)
  accuracy = nb.calculate_accuracy(ypred, ytest)*100
  naive_tf_dict.append(accuracy)

  print("At ratio {}:{}".format(r,100-r))
  print("\n")
  print("Confusion Matrix is given by: \n",nb.confusion_matrix(ypred,ytest))
  print("\n")
  print("Accuracy is {:.2f} \n".format(accuracy))

At ratio 50:50


Confusion Matrix is given by: 
 [[491   4  10  11   1]
 [  1 478   2   1   0]
 [  0   0 465   4   0]
 [  2   0   8 503   4]
 [  0   2   7   3 503]]


Accuracy is 97.60 

At ratio 70:30


Confusion Matrix is given by: 
 [[299   1   4   5   0]
 [  0 277   0   1   0]
 [  2   0 291   3   0]
 [  1   0   3 302   4]
 [  0   0   3   0 304]]


Accuracy is 98.20 

At ratio 80:20


Confusion Matrix is given by: 
 [[189   1   4   2   0]
 [  0 190   0   0   0]
 [  0   0 204   2   0]
 [  0   0   2 200   3]
 [  0   0   3   0 200]]


Accuracy is 98.30 

