In [None]:
import pandas as pd
import numpy as np
import wget

from sklearn.feature_extraction.text import CountVectorizer

#MultinomialNB is the classifier for this choice.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import wordnet

# Fetch the NLTK data packages this notebook requests, one at a time and in
# the same order as before.
for resource_name in ("wordnet", "punkt", "averaged_perceptron_tagger"):
    nltk.download(resource_name)


To download the file (the automated download below is commented out; the file had to be downloaded manually):

In [None]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
#wget.download('https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv')

Reading csv file :

In [None]:
# Location of the CSV on the original author's machine. It is tried first so
# the notebook keeps working unchanged there.
AUTHOR_CSV_PATH = r"C:\Users\Asus\Desktop\All Code\NLP\Vector Models and Text Processing\bbc_text_cls.csv"
# Fallback for every other machine: a copy of the file in the working directory.
LOCAL_CSV_PATH = "bbc_text_cls.csv"

try:
    df = pd.read_csv(AUTHOR_CSV_PATH)
except FileNotFoundError:
    # The absolute path does not exist here; read the local copy instead.
    df = pd.read_csv(LOCAL_CSV_PATH)

Checking file :

In [None]:
# Preview the first five rows (the default for head()) to confirm the file loaded.
df.head(n=5)

Assigning the input data and target data to variables :

In [None]:
# The 'text' column is the model input; the 'labels' column is the target.
inputs, labels = df["text"], df["labels"]

In [None]:
# Distribution of documents over the labels. The title and axis labels let the
# figure be read on its own when the notebook is skimmed.
ax = labels.hist(figsize=(10, 5))
ax.set(
    title="Number of documents per label",
    xlabel="Label",
    ylabel="Number of documents",
);

The above histogram plot shows that the documents are fairly evenly spread out over all the labels.
The data is balanced.

To split training and testing data from given dataset :

In [None]:
# Hold out part of the data for evaluation; the fixed random_state keeps the
# split the same on every run.
(input_train, input_test,
 output_train, output_test) = train_test_split(inputs, labels, random_state=123)

Instantiate the vectorizer, fit it on the training data only, and use the fitted vectorizer to transform both the training and the testing data:

In [None]:
# Bag-of-words counts. The vocabulary is learned from the training documents
# only; the test documents are then encoded with that same vocabulary.
vectorizer = CountVectorizer()

Xtrain = vectorizer.fit_transform(raw_documents=input_train)
Xtest = vectorizer.transform(raw_documents=input_test)

Xtrain and Xtest are the outputs of the vectorizer: sparse matrices in which each row is one document
represented as a vector of word counts.

Number of rows = Number of documents
Number of columns = Vocabulary size (number of distinct tokens)

Number of 0s = Number of words absent

To check how sparse the matrix is, how many values are non zero, we will do :

In [None]:
# Count the entries of the training matrix that hold a non-zero value.
nonzero_mask = Xtrain != 0
nonzero_mask.sum()

To check overall percentage of values which are non zero : 

In [None]:
# Share of non-zero cells, as a percentage of all (rows x columns) cells.
n_nonzero = (Xtrain != 0).sum()
n_cells = np.prod(Xtrain.shape)
n_nonzero / n_cells * 100

Since less than 1% of matrix contains non zero values, it is justified to employ a sparse matrix.

Instantiating a Naive Bayes classifier (MultinomialNB):

In [None]:
# Baseline classifier, created with scikit-learn's default settings.
model = MultinomialNB()

Fitting the model on the training set only (the testing set is held out for scoring):

In [None]:
# Train the baseline classifier on the vectorised training documents.
model.fit(X=Xtrain, y=output_train)

Checking score :

In [None]:
# Score the baseline model on the data it was fitted on and on the held-out data.
train_accuracy = model.score(Xtrain, output_train)
test_accuracy = model.score(Xtest, output_test)

print("Training score :", train_accuracy)
print("Testing score :", test_accuracy)
      

Now, by implementing the identification and removal of stopwords:

In [None]:
# Same pipeline as the baseline, but with scikit-learn's built-in English
# stop-word list removed from the vocabulary.
vectorizer1 = CountVectorizer(stop_words='english')

# Fit and transform with the stop-word vectorizer itself. Going through the
# default `vectorizer` would rebuild the baseline features and leave the
# stop-word setting unused.
Xtrain = vectorizer1.fit_transform(input_train)
Xtest = vectorizer1.transform(input_test)

model1 = MultinomialNB()
model1.fit(Xtrain, output_train)

In [None]:
# Accuracy of model1 on its training matrix and on the held-out matrix.
train_accuracy = model1.score(Xtrain, output_train)
test_accuracy = model1.score(Xtest, output_test)

print("Train score for model", train_accuracy)
print("Test score for model", test_accuracy)

The results are similar to previous model.

Now we will implement Lemmatization with the model

Function to map parts of speech tags

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    The tag is matched on its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
        
        

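The class LemmaTokeniser tokenises a document, tags each token with its part of speech, and returns the WordNet lemma of every token: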

In [None]:
class LemmaTokeniser:
    """Callable tokenizer that returns the WordNet lemma of every token in a document."""

    def __init__(self):
        # One lemmatizer instance is kept and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenise ``doc``, POS-tag the tokens, and return the list of lemmas."""
        tagged_tokens = nltk.pos_tag(word_tokenize(doc))
        lemmas = []
        for token, treebank_tag in tagged_tokens:
            # The Treebank tag is converted to the WordNet POS the lemmatizer expects.
            wordnet_pos = get_wordnet_pos(treebank_tag)
            lemmas.append(self.wnl.lemmatize(token, pos=wordnet_pos))
        return lemmas
        
    

implementing lemmatisation

In [None]:
# Bag-of-words features built from lemmatised tokens instead of the default tokeniser.
vectorizer2 = CountVectorizer(tokenizer=LemmaTokeniser())
Xtrain = vectorizer2.fit_transform(raw_documents=input_train)
Xtest = vectorizer2.transform(raw_documents=input_test)

# Classifier trained on the lemmatised features.
model3 = MultinomialNB()
model3.fit(X=Xtrain, y=output_train)


In [None]:
# Score model3, the classifier trained on the lemmatised features. The
# baseline `model` was fitted on a different vocabulary, so it must not be
# scored against these matrices.
print("Train score with lemmatisation:", model3.score(Xtrain, output_train))
print("Test score with lemmatisation:", model3.score(Xtest, output_test))

Next, we will implement stemming for tokenising

In [None]:
class StemTokenizer:
    """Callable tokenizer that returns the Porter stem of every token in a document."""

    def __init__(self):
        # One stemmer instance is kept and reused for every document.
        self.porter = PorterStemmer()

    def __call__(self, doc):
        """Tokenise ``doc`` and return the list of stemmed tokens."""
        stems = []
        for token in word_tokenize(doc):
            stems.append(self.porter.stem(token))
        return stems

In [None]:
# Bag-of-words features built from Porter-stemmed tokens.
vectorizer1 = CountVectorizer(tokenizer=StemTokenizer())

# Fit and transform with the stemming vectorizer itself. Going through the
# default `vectorizer` would rebuild the baseline features and leave the
# stemmer unused.
Xtrain = vectorizer1.fit_transform(input_train)
Xtest = vectorizer1.transform(input_test)

model3 = MultinomialNB()
model3.fit(Xtrain, output_train)

In [None]:
# Accuracy of model3 on its training matrix and on the held-out matrix.
train_accuracy = model3.score(Xtrain, output_train)
test_accuracy = model3.score(Xtest, output_test)

print("Train score for model", train_accuracy)
print("Test score for model", test_accuracy)

Now, implementing simple string split for tokenisation

In [None]:
def simple_tokenizer(s):
    """Split ``s`` on runs of whitespace and return the pieces as a list."""
    whitespace_separated = s.split(sep=None)
    return whitespace_separated

In [None]:
# Bag-of-words features built from plain whitespace-split tokens.
vectorizer1 = CountVectorizer(tokenizer=simple_tokenizer)

# Fit and transform with the whitespace-split vectorizer itself. Going through
# the default `vectorizer` would rebuild the baseline features and leave
# simple_tokenizer unused.
Xtrain = vectorizer1.fit_transform(input_train)
Xtest = vectorizer1.transform(input_test)

model4 = MultinomialNB()
model4.fit(Xtrain, output_train)

In [None]:
# Accuracy of model4 on its training matrix and on the held-out matrix.
train_accuracy = model4.score(Xtrain, output_train)
test_accuracy = model4.score(Xtest, output_test)

print("Train score for model", train_accuracy)
print("Test score for model", test_accuracy)