In [2]:
import numpy as np
import pandas as pd
import sklearn

# training data
train_docs = pd.read_csv(r"C:\Users\Harishree.R\Downloads\example_train.csv") 
train_docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [3]:
# convert label to a numerical variable
train_docs['Class'] = train_docs.Class.map({'cinema':0, 'education':1})
train_docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [4]:
# convert the df to a numpy array 
train_array = train_docs.values

# split X and y
X_train = train_array[:,0]
y_train = train_array[:,1]
y_train = y_train.astype('int') # sklearn needs y as integers

print("X_train")
print(X_train)
print("y_train")
print(y_train)

X_train
['Upgrad is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
y_train
[1 1 1 0 0]


In [5]:
# create an object of CountVectorizer() class 
from sklearn.feature_extraction.text import CountVectorizer 
# help(CountVectorizer)

In [6]:
vec = CountVectorizer()

In [7]:
# fit the vectorizer on training data 
vec.fit(X_train)
vec.vocabulary_

{'upgrad': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [8]:
# fitting the vectorizer on training data again
# removing the stop words this time
vec = CountVectorizer(stop_words='english')
vec.fit(X_train)
vec.vocabulary_

{'upgrad': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [9]:
# printing feature names
print(vec.get_feature_names_out())
print(len(vec.get_feature_names_out()))

['cinema' 'depends' 'educational' 'ethics' 'good' 'great' 'greatness'
 'institution' 'movie' 'sholey' 'story' 'upgrad']
12


In [10]:
# another way of representing the features
X_transformed = vec.transform(X_train)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [11]:
# converting transformed matrix back to an array
# note the high number of zeros
X_transformed.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [12]:
# converting matrix to dataframe
pd.DataFrame(X_transformed.toarray(), 
             columns=vec.get_feature_names())

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [13]:
# building a multinomial NB model
from sklearn.naive_bayes import MultinomialNB

# instantiate NB class
mnb=MultinomialNB()

# fitting the model on training data
mnb.fit(X_transformed, y_train)

# note that we are using the sparse matrix X_transformed, 
# though you can also use the non-sparse version
# mnb.fit(X_transformed.toarray(), y_train) 



In [14]:
mnb

In [15]:
a = ["Jailer is a best movie"]

In [16]:
a = vec.transform(a)
a

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [17]:
a=a.toarray()
a

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [18]:
proba = mnb.predict_proba(a)

In [19]:
proba

array([[0.625, 0.375]])

In [20]:
# probability of each class (test data)
print("probability of test document belonging to class CINEMA" , proba[:,0])
print("probability of test document belonging to class EDUCATION" , proba[:,1])

probability of test document belonging to class CINEMA [0.625]
probability of test document belonging to class EDUCATION [0.375]


In [21]:
a = ["fita is  a great educational institution"]

In [22]:
a = vec.transform(a)
a=a.toarray()
proba = mnb.predict_proba(a)
proba

array([[0.09788567, 0.90211433]])

In [23]:
# probability of each class (test data)
print("probability of test document belonging to class CINEMA" , proba[:,0])
print("probability of test document belonging to class EDUCATION" , proba[:,1])

probability of test document belonging to class CINEMA [0.09788567]
probability of test document belonging to class EDUCATION [0.90211433]
