In [1]:
# Toy sentiment corpus. Each entry of y_train labels the review at the same
# position in x_train: 1 = positive class, 0 = negative class.
x_train = [
    "This was awesome an awesome movie",
    "Great Movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have been better",
    "Surely a Disapponting movie",
]
y_train = [1, 1, 1, 1, 0, 0, 0]

# Unlabelled reviews that are vectorized and classified further down.
x_test = [
    "I was happy and happy I loved the acting in the movie",
    "The movie I saw was bad",
]

## Data Cleaning

In [2]:
from nltk.tokenize import RegexpTokenizer
import nltk
# Fetch the NLTK 'stopwords' corpus so stopwords.words(...) can be used later;
# the log printed under this cell reports the package as already up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Shared text-cleaning helpers used by getCleanText.
# Tokens are runs of word characters (\w+), so punctuation such as "!" is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# English stopword list held as a set for the membership test during filtering.
en_stopwords = set(stopwords.words('english'))
# Porter stemmer applied to every token that survives the stopword filter.
ps = PorterStemmer()

In [4]:
def getCleanText(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    The text is lower-cased, split with the notebook-level ``tokenizer``,
    filtered against ``en_stopwords``, and every surviving token is passed
    through ``ps.stem`` before the pieces are joined with single spaces.

    Parameters
    ----------
    text : str
        A single document (``.lower()`` is called on it, so a list of
        documents is not accepted).

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by one space each.
    """
    stems = []
    for word in tokenizer.tokenize(text.lower()):
        # Stopwords are dropped before stemming, so the check runs on the raw token.
        if word in en_stopwords:
            continue
        stems.append(ps.stem(word))
    return " ".join(stems)

In [5]:
# getCleanText works on one string at a time (it calls .lower() on its
# argument), so it is applied to each document rather than to the whole list.
x_clean = list(map(getCleanText, x_train))
xt_clean = list(map(getCleanText, x_test))

In [6]:
# Inspect the cleaned training documents (lower-cased, stopwords removed, stemmed).
x_clean

['awesom awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad upto mark',
 'could better',
 'sure disappont movi']

## Vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(1,2): count both single tokens and pairs of adjacent tokens.
cv = CountVectorizer(ngram_range=(1,2))

In [9]:
# Learn the n-gram vocabulary from the cleaned training text and convert the
# resulting count matrix to a dense array (one row per training document).
x_vec = cv.fit_transform(x_clean).toarray()
x_vec

array([[0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]], dtype=int64)

In [10]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, where calling it raises AttributeError. Use the replacement
# get_feature_names_out() when it exists and fall back to the old method only
# on releases that predate it. list() keeps the plain-list display, since the
# new method returns a NumPy array.
list(
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

['act',
 'act hero',
 'awesom',
 'awesom act',
 'awesom awesom',
 'awesom movi',
 'bad',
 'bad upto',
 'better',
 'could',
 'could better',
 'disappont',
 'disappont movi',
 'end',
 'end awesom',
 'great',
 'great movi',
 'happi',
 'happi end',
 'hero',
 'like',
 'like lot',
 'lot',
 'love',
 'love truli',
 'mark',
 'movi',
 'movi like',
 'sure',
 'sure disappont',
 'truli',
 'truli great',
 'upto',
 'upto mark']

In [11]:
# Vectorize the cleaned test text with the vocabulary already fitted on the
# training data (transform only, no refit), then densify for display.
xt_vec = cv.transform(xt_clean).toarray()
xt_vec

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

## Multinomial Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier constructed with its default settings.
mn = MultinomialNB()

In [13]:
# Fit the classifier on the training count matrix and its 0/1 labels.
mn.fit(x_vec,y_train)

MultinomialNB()

In [14]:
# Predict a class label for each vectorized test review.
y_pred = mn.predict(xt_vec)

In [15]:
# Show the predicted labels, using the y_train encoding (1 = positive, 0 = negative).
y_pred

array([1, 0])