In [1]:

import numpy as np
import pandas as pd
import sklearn

# Read the labelled training documents from disk and display the frame.
train_csv_path = 'example_train.csv'
train_docs = pd.read_csv(train_csv_path)
train_docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [2]:
# Encode the text labels as integers for the classifiers (cinema -> 0, education -> 1).
# A plain .map(dict) turns every value that is not a dictionary key into NaN, so
# re-running this cell after the labels were already encoded would wipe the column.
# Falling back to the existing value keeps the cell idempotent, and an unexpected
# label stays visible instead of silently becoming NaN.
class_to_id = {'cinema':0, 'education':1}
train_docs.Class = train_docs.Class.map(lambda label: class_to_id.get(label, label))
train_docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [3]:
# Split the frame into the raw documents (X) and the integer class labels (y).
# Columns are selected by name rather than by position in `.values`, so the split
# does not depend on column order or on an extra index column being present in the CSV.
X_train = train_docs['Document'].values
y_train = train_docs['Class'].values.astype('int')

print('X_train')
print(X_train)
print('y_train')
print(y_train)

X_trian
['Upgrad is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
y_train
[1 1 1 0 0]


# Creating the bag of words representation

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# help(CountVectorizer())

In [6]:
# Bag-of-words vectorizer created with default settings (no stop-word list is passed here).
vec = CountVectorizer()

In [10]:
# Learn the vocabulary from the training documents.
vec.fit(X_train)
# Re-insert the learned term -> column-index pairs in ascending index order so the
# dictionary reads in column order when displayed; the mapping itself is unchanged.
vec.vocabulary_ = dict(sorted(vec.vocabulary_.items(), key=lambda item: item[1]))

In [11]:
# Show the learned term -> column-index mapping.
vec.vocabulary_

{'and': 0,
 'cinema': 1,
 'depends': 2,
 'educational': 3,
 'ethics': 4,
 'good': 5,
 'great': 6,
 'greatness': 7,
 'institution': 8,
 'is': 9,
 'movie': 10,
 'of': 11,
 'on': 12,
 'sholey': 13,
 'story': 14,
 'upgrad': 15}

In [13]:
# Rebuild the vectorizer with scikit-learn's built-in English stop-word list and fit
# it on the training documents (fit() returns the vectorizer itself).
vec = CountVectorizer(stop_words='english').fit(X_train)
# Re-create the term -> column-index mapping ordered by index, purely for readable display.
terms_by_index = sorted(vec.vocabulary_.items(), key=lambda pair: pair[1])
vec.vocabulary_ = {term: index for term, index in terms_by_index}
vec.vocabulary_

{'cinema': 0,
 'depends': 1,
 'educational': 2,
 'ethics': 3,
 'good': 4,
 'great': 5,
 'greatness': 6,
 'institution': 7,
 'movie': 8,
 'sholey': 9,
 'story': 10,
 'upgrad': 11}

In [18]:
# List the vocabulary terms in column order, then report how many there are.
feature_names = vec.get_feature_names_out()
print(feature_names)
print(len(feature_names))

['cinema' 'depends' 'educational' 'ethics' 'good' 'great' 'greatness'
 'institution' 'movie' 'sholey' 'story' 'upgrad']
12


In [20]:
# Encode the training documents as term counts using the fitted vectorizer.
X_transformed  = vec.transform(X_train)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [21]:
# Print the stored (row, column) entries of the sparse matrix together with their counts.
print(X_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


In [22]:
# Expand the sparse matrix into a dense array so every cell, zeros included, is visible.
X_transformed.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [23]:
# Label the dense count matrix with the vocabulary terms for easier reading.
dense_counts = X_transformed.toarray()
pd.DataFrame(dense_counts, columns=vec.get_feature_names_out())

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,upgrad
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


In [24]:
# Read the held-out test document(s) from disk and display the frame.
test_csv_path = 'example_test.csv'
test_doc = pd.read_csv(test_csv_path)
test_doc

Unnamed: 0,Document,Class
0,very good educational institution,education


In [25]:
# Encode the test labels with the same scheme as the training data
# (cinema -> 0, education -> 1).
# A plain .map(dict) turns every value that is not a dictionary key into NaN, so
# re-running this cell after the labels were already encoded would wipe the column.
# Falling back to the existing value keeps the cell idempotent, and an unexpected
# label stays visible instead of silently becoming NaN.
class_to_id = {'cinema':0,'education': 1}
test_doc.Class = test_doc.Class.map(lambda label: class_to_id.get(label, label))
test_doc

Unnamed: 0,Document,Class
0,very good educational institution,1


In [27]:
# Split the test frame into the raw documents (X) and the integer class labels (y).
# Columns are selected by name rather than by position in `.values`, so the split
# does not depend on column order or on an extra index column being present in the CSV.
X_test = test_doc['Document'].values
y_test = test_doc['Class'].values.astype('int')

print(X_test)
print(y_test)

['very good educational institution']
[1]


In [28]:
# Encode the test document with the vectorizer that was fitted on the training data
# (transform only, no refit), so the columns line up with the training matrix.
X_test_transformed = vec.transform(X_test)
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [30]:
# Dense copy of the test count matrix.
# NOTE: `x_test` (lowercase) holds the numeric counts, while `X_test` (uppercase) holds
# the raw text; the two names differ only by case.
x_test = X_test_transformed.toarray()
x_test

array([[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

# Finally Building multinomial naive bayes model

In [36]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial naive Bayes on the training term counts; fit() returns the
# estimator itself, so construction and training are chained.
mnb = MultinomialNB().fit(X_transformed, y_train)

# Class-membership probabilities for the test document, one column per class.
pred = mnb.predict_proba(x_test)
pred

array([[0.32808399, 0.67191601]])

In [38]:
# Probability of each class for the test data. Column 0 is read as cinema and
# column 1 as education, matching the 0/1 label encoding applied to the Class column.
print("probability of test document belonging to class CINEMA" , pred[:,0])
print("probability of test document belonging to class EDUCATION" , pred[:,1])

probability of test document belonging to class CINEMA [0.32808399]
predbility of test document belonging to class EDUCATION [0.67191601]


# Now Building using Bernaulli Naive Bayes

In [39]:
from sklearn.naive_bayes import BernoulliNB

# Train Bernoulli naive Bayes on the same term-count matrix and labels; fit()
# returns the estimator itself, so construction and training are chained.
bnb = BernoulliNB().fit(X_transformed, y_train)

# Class-membership probabilities for the test document under the Bernoulli model.
pred_bnb = bnb.predict_proba(x_test)


In [40]:
# Show the Bernoulli model's class probabilities for the test document.
pred_bnb

array([[0.2326374, 0.7673626]])