In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [2]:
X = ["This was an awesome movie",
     "Great movie! I liked it a lot",
     "Happy Ending! awesome acting by the hero",
     "loved it! truly great",
     "bad not upto the mark",
     "could have better",
     "Surely a Disappointing movie"]

y = [1,1,1,1,0,0,0]

In [3]:
X_test = ["I was happy. It was great and I loved the acting",
          "The movie I saw was bad"]
y_test=[1,0]

In [4]:
tokenizer=RegexpTokenizer('[a-zA-Z]+')
en_stopwords=set(stopwords.words('english'))
ps=PorterStemmer()

In [5]:
def stemmed(review):
    review=review.lower()
    tokens=tokenizer.tokenize(review)
    filtered_tokens=[token for token in tokens if token not in en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in filtered_tokens]
    cleaned_review=' '.join(stemmed_tokens)
    return cleaned_review

In [6]:
X_train=[stemmed(i) for i in X]
x_test=[stemmed(i) for i in X]
print(X_train)

['awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disappoint movi']


In [7]:
cv=CountVectorizer(ngram_range=(1,2))
X_train_vec=cv.fit_transform(X_train).toarray()
print(X_train_vec.shape)
print(X_train_vec)

(7, 33)
[[0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0]]


In [8]:
print(cv.get_feature_names())

['act', 'act hero', 'awesom', 'awesom act', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disappoint', 'disappoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'sure', 'sure disappoint', 'truli', 'truli great', 'upto', 'upto mark']


In [9]:
X_test_vec=cv.transform(X_test).toarray()
print(X_test_vec.shape)
print(X_test_vec)
print(cv.get_feature_names())

(2, 33)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
['act', 'act hero', 'awesom', 'awesom act', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disappoint', 'disappoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'sure', 'sure disappoint', 'truli', 'truli great', 'upto', 'upto mark']


## Multinomial Naive Bayes

In [10]:
mnb=MultinomialNB()

In [11]:
mnb=mnb.fit(X_train_vec,y)

In [12]:
mnb.predict(X_test_vec)

array([1, 0])

In [13]:
mnb.predict_proba(X_test_vec)

array([[0.23651452, 0.76348548],
       [0.65019011, 0.34980989]])

## Multivariate Bernoulli  Naive Bayes

In [14]:
bnb=BernoulliNB(binarize=0.0)

In [15]:
bnb=bnb.fit(X_train_vec,y)

In [16]:
bnb.predict(X_test_vec)

array([1, 0])

In [17]:
bnb.predict_proba(X_test_vec)

array([[0.22936356, 0.77063644],
       [0.79872728, 0.20127272]])

In [18]:
bnb.score(X_train_vec,y)

1.0