In [341]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import os
import math
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [342]:
# 5 Classes
mapping = {
    'comp.graphics': 0,
    'sci.med': 1,
    'talk.politics.misc': 2,
    'rec.sport.hockey': 3,
    'sci.space': 4
}

reverseMapping = {
    0: 'comp.graphics',
    1: 'sci.med',
    2: 'talk.politics.misc',
    3: 'rec.sport.hockey',
    4: 'sci.space'
}

In [343]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [344]:
data_file = []
target = []
path = "/content/drive/MyDrive/20_newsgroups/"
for dir in os.listdir(path):
    if dir in mapping:
             
        for file in os.listdir(path+dir+"/"):
            try:
               
                f = open(path+dir+"/"+file)
                data_file.append(f.read())
                target.append(mapping[dir])
            except:
             
                f = open(path+dir+"/"+file,"rb")
                data_file.append(f.read().decode('utf-8', 'backslashreplace'))
                target.append(mapping[dir])


In [345]:
data_file = np.array(data_file)
target = np.array(target)

In [346]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer() 
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Method for data Pre-processing

In [347]:
def remove_numbers(txt_data):
   numbers="0123456789"
   for i in range(len(numbers)):
       txt_data=np.char.replace(txt_data, numbers[i], ' ')
       txt_data = np.char.replace(txt_data, "  ", " ")
   return txt_data

In [348]:
def punctuations_removal_from_data(txt_data):
    sym = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(sym)):
        txt_data = np.char.replace(txt_data, sym[i], ' ')
        txt_data = np.char.replace(txt_data, "  ", " ")
    txt_data = np.char.replace(txt_data, ',', ' ')
    txt_data = np.char.replace(txt_data, "'", "")#additionally removing apostrophe
    return txt_data

In [349]:
def to_lower_case(txt_data):
    return np.char.lower(txt_data)

In [350]:
def stop_words_removal_from_data(txt_data):
    stopWords = stopwords.words('english')
    words = word_tokenize(str(txt_data))
    clean_data = ""
    for w in words:
        if w not in stopWords:
            clean_data = clean_data + " " + w
    return np.char.strip(clean_data)

In [351]:
def short_char_removal(txt_data):
    words = word_tokenize(str(txt_data))
    clean_data = ""
    for w in words:
        if len(w) > 1:
            clean_data = clean_data + " " + w
    return np.char.strip(clean_data)

In [352]:
def lemmatize_text(txt_data):
    lemmatizer = WordNetLemmatizer()   
    tokens = word_tokenize(str(txt_data))
    clean_data = ""
    for w in tokens:
        clean_data = clean_data + " " +lemmatizer.lemmatize(w)
    return np.char.strip(clean_data)

Method for data Pre processing

In [353]:
def process_txt_data(txt_data):
  txt_data=remove_numbers(txt_data)
  txt_data=punctuations_removal_from_data(txt_data)
  txt_data=to_lower_case(txt_data)
  txt_data=stop_words_removal_from_data(txt_data)
  txt_data=short_char_removal(txt_data)
  txt_data=lemmatize_text(txt_data)
  return txt_data

In [354]:
for fileNo,file in enumerate(data_file):
    data_file[fileNo] = process_txt_data(file)

Method to split data

In [355]:
def methodRandomSplitData(splitRatio = 0.8):
    dataSize = data_file.shape[0]
    trainingSize = int(dataSize * splitRatio)   
    randomIndexes = np.random.choice(dataSize, size=trainingSize)
    #test data
    testX = data_file[np.setdiff1d(range(dataSize), randomIndexes)]
    testY = target[np.setdiff1d(range(dataSize), randomIndexes)]
    #train data.
    trainX = data_file[randomIndexes]
    trainY = target[randomIndexes] 
    return (trainX, trainY, testX, testY)

Splitting data into training and test

In [356]:
train, trainLabel, test, testLabel = methodRandomSplitData()

TF-ICF

In [357]:
ClassFrequency = {}
for indexNo,text  in enumerate(train):
    for w in text.split(" "):
        if w not in ClassFrequency:
            ClassFrequency[w] = set()
        ClassFrequency[w].add(target[indexNo])
        

In [358]:
InverseClassFrequency = {}
for w in ClassFrequency.keys():
    InverseClassFrequency[w] = math.log(5/len(ClassFrequency[w]))

Computing Term Frequencies

In [359]:
TermFrequency = {0: {},1: {}, 2: {}, 3: {},4: {}}

for indexNo,txt in enumerate(train):
    for w in txt.split(" "):
        if w  in TermFrequency[trainLabel[indexNo]]:
            TermFrequency[trainLabel[indexNo]][w] += 1
        else:
            TermFrequency[trainLabel[indexNo]][w] = 0
        

In [360]:
dict_TF_ICF = {}
for i in range(5):
    dict_TF_ICF[i] = {}
    for w in TermFrequency[i]:
        dict_TF_ICF[i][w] = TermFrequency[i][w] * InverseClassFrequency[w]

Method to get words with highest value

In [361]:
def featuresK(k):
    feat = set()
    for i in range(5):
        x = list(dict(sorted(dict_TF_ICF[i].items(), key=lambda y: y[1], reverse=True)).keys())
        feat = feat.union(set(x[:k]))
    return feat

In [362]:
k = int(input("Enter no of features : "))

Enter no of features : 500


In [363]:
vocabulary = featuresK(k)

In [364]:
len(vocabulary)

2372

Method to generate new Data

In [365]:
def updatedDataVocab(data, vocabulary):
    updatedData = []
    for t in data:
        txt = []
        for w in t.split(" "):
            if w in vocabulary:
                txt.append(w)
        updatedData.append(" ".join(txt).strip())
        
    return updatedData

In [366]:
def getFeatures(train, test):
    vector = TfidfVectorizer()
    tr = vector.fit_transform(train)
    te = vector.transform(test)
    return tr, te

In [367]:
train = updatedDataVocab(train, vocabulary)
test = updatedDataVocab(test, vocabulary)

In [368]:
(train, test) = getFeatures(train, test)

In [369]:
train.shape

(3609, 2370)

### 4.

In [370]:
from sklearn.naive_bayes import GaussianNB
modelNB = GaussianNB()

In [371]:
modelNB.fit(train.toarray(),trainLabel)

GaussianNB()

In [372]:
predict_train = modelNB.predict(train.toarray())
predict_test = modelNB.predict(test.toarray())

In [373]:
print(sum(predict_train == trainLabel)/len(trainLabel))
print(sum(predict_test == testLabel)/len(testLabel))

0.914380714879468
0.823905558288244


In [374]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [375]:
from sklearn.metrics import confusion_matrix

In [376]:
confusion_matrix(testLabel,predict_test)

array([[338,  19,  19,  36,  38],
       [ 16, 345,  24,  32,  27],
       [ 13,  22, 414,  12,  15],
       [  0,   9,   9, 202,   3],
       [ 17,   9,  21,  17, 376]])

### 6.

##### 50-50 Split

In [377]:
trainX, trainLabel, testX, testLabel = methodRandomSplitData(0.5)

trainX = updatedDataVocab(trainX, vocabulary)
testX = updatedDataVocab(testX, vocabulary)

trainX, testX = getFeatures(trainX, testX)
gnb = GaussianNB()
gnb.fit(trainX.toarray(), trainLabel)
x = gnb.predict(trainX.toarray())
y = gnb.predict(testX.toarray())

print("Train Accuracy: ", sum(x == trainLabel)/len(trainLabel))
print("Test Accuracy: ", sum(y == testLabel)/len(testLabel))


Train Accuracy:  0.925531914893617
Test Accuracy:  0.8096288129364204


In [378]:
confusion_matrix(testLabel,y)

array([[460,  26,  35,  61,  38],
       [ 38, 437,  41,  34,  24],
       [ 14,  31, 506,  24,  26],
       [  9,   5,  13, 273,   4],
       [ 30,  18,  18,  29, 527]])

##### 70-30 split

In [379]:
trainX, trainLabel, testX, testLabel = methodRandomSplitData(0.7)

trainX = updatedDataVocab(trainX, vocabulary)
testX = updatedDataVocab(testX, vocabulary)

trainX, testX = getFeatures(trainX, testX)
gnb = GaussianNB()
gnb.fit(trainX.toarray(), trainLabel)
x = gnb.predict(trainX.toarray())
y = gnb.predict(testX.toarray())

print("Train Accuracy: ", sum(x == trainLabel)/len(trainLabel))
print("Test Accuracy: ", sum(y == testLabel)/len(testLabel))

Train Accuracy:  0.9211526282457252
Test Accuracy:  0.8069151324651999


In [380]:
confusion_matrix(testLabel,y)

array([[350,  27,  21,  63,  36],
       [ 25, 371,  42,  39,  26],
       [ 13,  20, 426,  20,  15],
       [  4,   1,  10, 224,   3],
       [  9,  20,  14,  22, 426]])