In [65]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [66]:
df1 = pd.read_csv("text.csv") 

In [67]:
df1

Unnamed: 0,Document,Class
0,Teclov is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [68]:
df1['Class'] = df1.Class.map({'cinemaa':0, 'education':1})
df1

Unnamed: 0,Document,Class
0,Teclov is a great educational institution.,1.0
1,Educational greatness depends on ethics,1.0
2,A story of great ethics and educational greatness,1.0
3,Sholey is a great cinema,
4,good movie depends on good story,


In [69]:
X= df1['Document'].to_numpy()
Y = df1['Class'].to_numpy()
y = Y.astype('int')


In [70]:
print("X")
print(X)
print("y")
print(y)

X
['Teclov is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
y
[                   1                    1                    1
 -9223372036854775808 -9223372036854775808]


In [71]:
vec = CountVectorizer()

In [72]:
vec.fit(X)
vec.vocabulary_

{'teclov': 15,
 'is': 9,
 'great': 6,
 'educational': 3,
 'institution': 8,
 'greatness': 7,
 'depends': 2,
 'on': 12,
 'ethics': 4,
 'story': 14,
 'of': 11,
 'and': 0,
 'sholey': 13,
 'cinema': 1,
 'good': 5,
 'movie': 10}

In [73]:
vec = CountVectorizer(stop_words='english' )
vec.fit(X)
vec.vocabulary_

{'teclov': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [74]:
print(vec.get_feature_names_out())
print(len(vec.get_feature_names_out()))

['cinema' 'depends' 'educational' 'ethics' 'good' 'great' 'greatness'
 'institution' 'movie' 'sholey' 'story' 'teclov']
12


In [75]:
X_transformed=vec.transform(X)
X_transformed

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [76]:
X=X_transformed.toarray()
X

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0]])

In [77]:
pd.DataFrame(X, columns=vec.get_feature_names_out())

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,teclov
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


In [78]:
df2 = pd.read_csv('test_text.csv') 
df2

Unnamed: 0,Document,Class
0,very good educational institution,1


In [79]:
test_numpy_array = df2.to_numpy()
X_test = test_numpy_array[:,0]
y_test = test_numpy_array[:,1]
y_test = y_test.astype('int')
print("X_test")
print(X_test)
print("Y_test")
print(y_test)

X_test
['very good educational institution']
Y_test
[1]


In [80]:
X_test_transformed=vec.transform(X_test)
X_test_transformed

<1x12 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [81]:
X_test=X_test_transformed.toarray()
X_test

array([[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]])

In [82]:
mnb=MultinomialNB()

mnb.fit(X,y)

mnb.predict_proba(X_test)

array([[0.32808399, 0.67191601]])

In [83]:
y_prediction = mnb.predict(X_test)


acc = accuracy_score(y_test, y_prediction)


print("Accuracy:", acc)


Accuracy: 1.0
