In [1]:
import re
from pathlib import Path
import string
from functools import reduce
from math import log
import itertools

In [2]:
!pip install nltk




In [3]:
# Enter smoothing or no smoothing.
smoothing = 1
filename = "/content/textfile.txt"

In [4]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

TASK 1 (A): Pre-Processing of RAW Text File: Split File to Sentences

In [5]:
# Loads file
# input - filename.txt
# returns a list of sentences seperated by newline in the textfile.
import nltk

def load_file(filename):
    with open(filename, 'r') as file:
        data = file.read().replace('\n', ' ')
    sentences = nltk.tokenize.sent_tokenize(data)
    return sentences

words=load_file(filename)
load_file(filename)

['[ moby dick by herman melville 1851 ] etymology .',
 '( supplied by a late consumptive usher to a grammar school ) the pale usher -- threadbare in coat , heart , body , and brain ; i see him now .',
 'he was ever dusting his old lexicons and grammars , with a queer handkerchief , mockingly embellished with all the gay flags of all the known nations of the world .',
 'he loved to dust his old grammars ; it somehow mildly reminded him of his mortality . "',
 'while you take in hand to school others , and to teach them by what name a whale - fish is to be called in our tongue leaving out , through ignorance , the letter h , which almost alone maketh the signification of the word , you deliver that which is not true ."',
 '-- hackluyt " whale .',
 '... sw .',
 'and dan .',
 'hval .',
 'this animal is named from roundness or rolling ; for in dan .',
 'hvalt is arched or vaulted ."',
 '-- webster \' s dictionary " whale .',
 '... it is more immediately from the dut .',
 'and ger .',
 'wall

TASK 1 (B): Perform Tokenization Technique on Raw Text File

In [6]:
# Tokenizes the sentences meaning split the sentences into words seperated by the "white sapce".
# input - List of sentences
# returns a list of lists of each sentence being tokenized.
def tokenize_sentence(lines):
    tokenized_sentences = [sentence.split() for sentence in lines]
    return tokenized_sentences


tokens=tokenize_sentence(words)
tokenize_sentence(words)

[['[',
  'moby',
  'dick',
  'by',
  'herman',
  'melville',
  '1851',
  ']',
  'etymology',
  '.'],
 ['(',
  'supplied',
  'by',
  'a',
  'late',
  'consumptive',
  'usher',
  'to',
  'a',
  'grammar',
  'school',
  ')',
  'the',
  'pale',
  'usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'i',
  'see',
  'him',
  'now',
  '.'],
 ['he',
  'was',
  'ever',
  'dusting',
  'his',
  'old',
  'lexicons',
  'and',
  'grammars',
  ',',
  'with',
  'a',
  'queer',
  'handkerchief',
  ',',
  'mockingly',
  'embellished',
  'with',
  'all',
  'the',
  'gay',
  'flags',
  'of',
  'all',
  'the',
  'known',
  'nations',
  'of',
  'the',
  'world',
  '.'],
 ['he',
  'loved',
  'to',
  'dust',
  'his',
  'old',
  'grammars',
  ';',
  'it',
  'somehow',
  'mildly',
  'reminded',
  'him',
  'of',
  'his',
  'mortality',
  '.',
  '"'],
 ['while',
  'you',
  'take',
  'in',
  'hand',
  'to',
  'school',
  'others',
  ',',
  'and',

TASK 1 (B): Perform Removing of StopWords and Empty Strings, Stemming, Lemmatization and Appending Technique on Raw Text File

In [7]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def prep_data(lines):
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    processed_lines = []
    for sentence in lines:
        # Remove punctuation and convert to lower case
        sentence = [''.join(c for c in w if c not in string.punctuation) for w in sentence]
        sentence = [word.lower() for word in sentence]
        # Remove stopwords and empty strings
        sentence = [word for word in sentence if word not in stop_words and word != '']
        # Perform stemming
        sentence = [ps.stem(word) for word in sentence]
        # Perform lemmatization
        sentence = [lemmatizer.lemmatize(word) for word in sentence]
        # Append <s> and </s> at the beginning and end of the sentence
        sentence = ['<s>'] + sentence + ['</s>']
        processed_lines.append(sentence)

    print("No of sentences in Corpus: "+str(len(processed_lines)))
    return processed_lines

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [8]:
prep_data(tokens)

No of sentences in Corpus: 9999


[['<s>', 'mobi', 'dick', 'herman', 'melvil', '1851', 'etymolog', '</s>'],
 ['<s>',
  'suppli',
  'late',
  'consumpt',
  'usher',
  'grammar',
  'school',
  'pale',
  'usher',
  'threadbar',
  'coat',
  'heart',
  'bodi',
  'brain',
  'see',
  '</s>'],
 ['<s>',
  'ever',
  'dust',
  'old',
  'lexicon',
  'grammar',
  'queer',
  'handkerchief',
  'mockingli',
  'embellish',
  'gay',
  'flag',
  'known',
  'nation',
  'world',
  '</s>'],
 ['<s>',
  'love',
  'dust',
  'old',
  'grammar',
  'somehow',
  'mildli',
  'remind',
  'mortal',
  '</s>'],
 ['<s>',
  'take',
  'hand',
  'school',
  'other',
  'teach',
  'name',
  'whale',
  'fish',
  'call',
  'tongu',
  'leav',
  'ignor',
  'letter',
  'h',
  'almost',
  'alon',
  'maketh',
  'signif',
  'word',
  'deliv',
  'true',
  '</s>'],
 ['<s>', 'hackluyt', 'whale', '</s>'],
 ['<s>', 'sw', '</s>'],
 ['<s>', 'dan', '</s>'],
 ['<s>', 'hval', '</s>'],
 ['<s>', 'anim', 'name', 'round', 'roll', 'dan', '</s>'],
 ['<s>', 'hvalt', 'arch', 'vault',

In [9]:
dataset = load_file(filename)
dataset = tokenize_sentence(dataset)
dataset = prep_data(dataset)

No of sentences in Corpus: 9999


In [10]:
# Counts the no. of times a word repeats (frequency of each word) in the corpus.
# input - list of lists of words obtained from "prep_data"
# returns - a dictionary defined as {word:frequency} for words of the corpus including <s> and </s>.
def freq_of_unique_words(lines):
    count = {}
    for line in lines:
        for word in line:
            if word in count:
                count[word] += 1
            else:
                count[word] = 1
    unique_word_count = len(count)
    print("No of unique words in corpus : "+ str(unique_word_count))
    return count

freq_of_unique_words(tokens)

No of unique words in corpus : 17225


{'[': 3,
 'moby': 84,
 'dick': 84,
 'by': 1204,
 'herman': 1,
 'melville': 1,
 '1851': 3,
 ']': 1,
 'etymology': 1,
 '.': 6880,
 '(': 210,
 'supplied': 12,
 'a': 4736,
 'late': 30,
 'consumptive': 1,
 'usher': 2,
 'to': 4625,
 'grammar': 2,
 'school': 10,
 ')': 78,
 'the': 14431,
 'pale': 19,
 '--': 1090,
 'threadbare': 1,
 'in': 4172,
 'coat': 28,
 ',': 18713,
 'heart': 91,
 'body': 110,
 'and': 6430,
 'brain': 37,
 ';': 4072,
 'i': 2127,
 'see': 272,
 'him': 1067,
 'now': 785,
 'he': 1896,
 'was': 1644,
 'ever': 206,
 'dusting': 2,
 'his': 2530,
 'old': 450,
 'lexicons': 1,
 'grammars': 2,
 'with': 1722,
 'queer': 44,
 'handkerchief': 5,
 'mockingly': 1,
 'embellished': 3,
 'all': 1526,
 'gay': 21,
 'flags': 1,
 'of': 6609,
 'known': 80,
 'nations': 12,
 'world': 176,
 'loved': 3,
 'dust': 10,
 'it': 2522,
 'somehow': 44,
 'mildly': 10,
 'reminded': 4,
 'mortality': 1,
 '"': 1478,
 'while': 246,
 'you': 894,
 'take': 137,
 'hand': 214,
 'others': 39,
 'teach': 5,
 'them': 474,
 'what

In [11]:
unique_word_frequency = freq_of_unique_words(dataset)
#len(unique_word_frequency)

No of unique words in corpus : 10682


TASK2: Implementation of Naive Baye's Classifier from Scratch

In [44]:
'''
    Bayes Theorem form
    P(y|X) = P(X|y) * P(y) / P(X)
    '''
import pandas as pd
import numpy as np
data=pd.read_csv("/content/adult.csv")
data.shape





(32561, 15)

In [45]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [50]:
categorical = [var for var in data.columns if data[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 9 categorical variables

The categorical variables are :

 ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']


In [27]:
data[categorical].head()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
0,?,HS-grad,Widowed,?,Not-in-family,White,Female,United-States,<=50K
1,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,United-States,<=50K
2,?,Some-college,Widowed,?,Unmarried,Black,Female,United-States,<=50K
3,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,United-States,<=50K
4,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,United-States,<=50K


In [28]:
data[categorical].isnull().sum()

workclass         0
education         0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
native.country    0
income            0
dtype: int64

TASK: Train the Model on Given Classification Dataset by splitting it into ratio of 80:20 and calculate the accuracy of the trained classifier.

In [30]:
  ### Write Your Code Here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def calc_prior(features, target):
    prior_prob = {}
    for label in np.unique(target):
        prior_prob[label] = np.sum(target == label) / len(target)
    return prior_prob

def calc_statistics(features, target):
    mean = {}
    variance = {}
    for label in np.unique(target):
        labelleddata = features[target == label]
        mean[label] = np.mean(labelleddata, axis=0)
        variance[label] = np.var(labelleddata, axis=0)
    return mean, variance

def gaussian_density(x, mean, variance):
    prob = 1
    for i in range(len(x)):
        prob *= (1 / np.sqrt(2 * np.pi * variance[i])) * np.exp((-1 / 2) * ((x[i] - mean[i]) ** 2) / (2 * variance[i]))
    return prob

def calc_posterior(x, features, target):
    posteriors = {}
    # calculate posterior probability for each class
    for label in np.unique(target):
        class_features = features[target == label]
        class_mean, class_variance = calc_statistics(class_features, target)
        prior = calc_prior(features, target)[label]
        likelihood = gaussian_density(x, class_mean, class_variance)
        posterior = prior * likelihood
        posteriors[label] = posterior

    # return class with highest posterior probability
    max_label = None
    max_prob = -1
    for label, prob in posteriors.items():
        if prob > max_prob:
            max_prob = prob
            max_label = label
    return max_label

class NaiveBayes:
    def predict(self, features):
        preds = []
        for i in range(len(features)):
            pred = calc_posterior(features.iloc[i], self.X_train.values, self.y_train.values)
            preds.append(pred)
        return preds
    def accuracy(self, y_test, y_pred):
         accuracy = accuracy_score(y_test, y_pred)
         return accuracy


In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('income', axis=1), data['income'], test_size=0.2, random_state=42)

# Create NaiveBayes object and fit the model on training data
nb = NaiveBayes()
nb.X_train = X_train
nb.y_train = y_train

# Make predictions on test set and calculate accuracy of model
y_pred = nb.predict(X_test)
accuracy = nb.accuracy(y_test.values, y_pred)
print('Accuracy:', accuracy)

TASK 3: Implement Naive Bayes Classifier Using Built-In function
Calculate the Accuracy Score and Metrics for Given Dataset and Compare the Results with your designed algorithm.
Compare using classification matrix and Plots for accuracy, precision and Recall

In [58]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Load dataset
data = pd.read_csv('/content/adult.csv')


# Split dataset into features and target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)

# Create a Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the model using the training sets
gnb.fit(X_train, y_train)

# Make predictions on test data
y_pred = gnb.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision score
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate recall score
recall = recall_score(y_test, y_pred, average='weighted')

# Generate classification matrix
matrix = confusion_matrix(y_test, y_pred)

# Plot accuracy, precision, and recall curves
plot_precision_recall_curve(gnb, X_test, y_test)
plot_roc_curve(gnb, X_test, y_test)
plot_confusion_matrix(gnb, X_test, y_test)


ValueError: ignored