In [145]:
# Toy sentiment corpus. Each training review is paired by position with its
# label in y_train (1 = positive review, 0 = negative review).
x_train = [
    "This was awesome an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have been better",
    "Surely a Disappointing movie",
]
y_train = [1, 1, 1, 1, 0, 0, 0]

# Held-out reviews used further down to try the classifiers on unseen text.
x_test = [
    "I was happy & happy and I loved the acting in the movie",
    "The movie I saw was bad",
]

# Data Cleaning

In [146]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk import download as nltk_download

In [147]:
# Fetch the NLTK 'stopwords' corpus so stopwords.words('english') can be loaded.
nltk_download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [148]:
# Shared preprocessing helpers used by clean_text.
ps = PorterStemmer()  # Porter stemmer, e.g. "awesome" -> "awesom"
en_stopwords = set(stopwords.words("english"))  # a set, for fast membership tests
tokenizer = RegexpTokenizer(r"\w+")  # keeps runs of word characters; punctuation is dropped

In [149]:
def clean_text(text):
  """Normalise a collection of sentences for bag-of-words vectorization.

  Each sentence is lower-cased, split into word tokens with the module-level
  ``tokenizer`` (the ``\\w+`` pattern keeps only word characters, so
  punctuation is discarded), stripped of English stop words, and stemmed
  with the module-level Porter stemmer ``ps``.

  Args:
    text: iterable of sentence strings.

  Returns:
    A list with one cleaned string per input sentence; the surviving stems
    are joined by single spaces.
  """
  cleaned_sentences = []
  for sentence in text:
    words = tokenizer.tokenize(sentence.lower())
    # Stop-word filtering happens on the raw token, before stemming.
    stems = [ps.stem(word) for word in words if word not in en_stopwords]
    cleaned_sentences.append(" ".join(stems))
  return cleaned_sentences

In [150]:
# Run the same cleaning pipeline over the training and the test sentences.
x_train_cleaned, x_test_cleaned = map(clean_text, (x_train, x_test))

# Vectorization

In [152]:
# Raw training sentences, shown for comparison with the cleaned version.
x_train

['This was awesome an awesome movie',
 'Great movie! I liked it a lot',
 'Happy Ending! awesome acting by the hero',
 'loved it! truly great',
 'bad not upto the mark',
 'could have been better',
 'Surely a Disappointing movie']

In [151]:
# Training sentences after lower-casing, stop-word removal and stemming.
x_train_cleaned

['awesom awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad upto mark',
 'could better',
 'sure disappoint movi']

In [171]:
from sklearn.feature_extraction.text import CountVectorizer
# Count both unigrams and bigrams (ngram_range=(1,2)) of the cleaned text.
count_vect = CountVectorizer(ngram_range=(1,2))
# The vocabulary is fitted on the training sentences only ...
x_train_cleaned_vec = count_vect.fit_transform(x_train_cleaned).toarray()
# ... and reused for the test sentences, so both arrays share the same columns.
x_test_cleaned_vec = count_vect.transform(x_test_cleaned).toarray()

In [173]:
from sklearn.naive_bayes import MultinomialNB
# Reference model: scikit-learn's multinomial Naive Bayes with default settings,
# trained on the n-gram count matrix.
mn = MultinomialNB()
mn.fit(x_train_cleaned_vec, y_train)

MultinomialNB()

In [174]:
# Labels predicted by the scikit-learn model for the two test sentences.
mn.predict(x_test_cleaned_vec)

array([1, 0])

In [175]:
import numpy as np

In [176]:
# prior probability
def prior_propability(y, c):
  """Return the empirical prior P(y == c): the fraction of labels equal to c.

  Args:
    y: 1-D sequence or array of class labels. Plain Python lists are
      accepted; they are converted to an array first, because ``list == int``
      evaluates to a single ``False`` instead of comparing elementwise.
    c: the class label whose relative frequency is wanted.

  Returns:
    The fraction of entries of ``y`` equal to ``c``; 0.0 when ``y`` is empty.
  """
  y = np.asarray(y)
  if y.size == 0:
    # No observations at all: avoid the nan that the mean of an empty array gives.
    return 0.0
  return np.mean(y == c)

In [177]:
def conditional_propability(x, y, feature_idx, feature_value, label):
  """Estimate P(x[feature_idx] == feature_value | y == label) from the data.

  The estimate is the fraction of the rows labelled ``label`` whose feature
  ``feature_idx`` equals ``feature_value``. No smoothing is applied, so a
  value never observed together with ``label`` yields exactly 0.

  Args:
    x: 2-D array-like of shape (n_samples, n_features).
    y: 1-D array-like of n_samples class labels. Plain lists are accepted;
      they are converted to arrays so the comparison is elementwise.
    feature_idx: column of ``x`` to inspect.
    feature_value: value the feature is compared against.
    label: class to condition on.

  Returns:
    The conditional relative frequency, or 0.0 when no sample carries
    ``label`` (instead of dividing by zero).
  """
  x = np.asarray(x)
  y = np.asarray(y)
  in_class = (y == label)
  if not in_class.any():
    # The class never occurs, so the conditional probability is undefined;
    # report 0 rather than producing nan from 0/0.
    return 0.0
  # P(A and B) / P(B) reduces to the share of matches inside the class.
  return np.mean(x[in_class, feature_idx] == feature_value)

In [178]:
def predict(x_train, y_train, x_test):
  """Classify one sample with an unsmoothed Naive Bayes built from counts.

  For every class the score is
  log P(class) + sum_i log P(feature_i == x_test[i] | class),
  and the class with the highest score is returned. The class prior is part
  of the score, and the sum of logs replaces a raw product so that many small
  factors cannot underflow to 0.

  NOTE: no smoothing is applied. A feature value never seen together with a
  class drives that class's score to -inf; if that happens for every class
  the scores tie and the first class in sorted order is returned.

  Args:
    x_train: 2-D array-like of shape (n_samples, n_features).
    y_train: 1-D array-like of n_samples class labels.
    x_test: 1-D sequence holding one value per feature of the sample.

  Returns:
    The predicted class label, taken from ``np.unique(y_train)``.
  """
  x_train = np.asarray(x_train)
  y_train = np.asarray(y_train)
  classes = np.unique(y_train)
  n_features = x_train.shape[1]

  log_posteriors = []
  for c in classes:
    # Prior first, then one conditional probability per feature.
    factors = [prior_propability(y_train, c)]
    factors.extend(
        conditional_propability(x_train, y_train, feature_idx, x_test[feature_idx], c)
        for feature_idx in range(n_features))
    # log(0) = -inf is the intended score of an impossible class, so the
    # divide-by-zero warning numpy would emit for it is silenced.
    with np.errstate(divide="ignore"):
      log_posteriors.append(np.sum(np.log(factors)))

  return classes[np.argmax(log_posteriors)]


In [194]:
def accuracy(x_train, y_train, x_test, y_test):
  """Return the share of test samples that ``predict`` labels correctly.

  Each row of ``x_test`` is classified with ``predict(x_train, y_train, row)``.
  The list of predictions is printed, then compared elementwise with
  ``y_test``.

  Args:
    x_train: 2-D array-like of training features.
    y_train: 1-D array-like of training labels.
    x_test: 2-D array-like with one row per test sample.
    y_test: array-like with the true label of every test sample. Lists are
      accepted; both sides are converted to 1-D arrays because comparing two
      Python lists with ``==`` yields a single bool, not a per-sample result.

  Returns:
    The fraction of predictions equal to the true labels.

  Raises:
    ValueError: if ``y_test`` does not hold exactly one label per test row.
  """
  x_test = np.asarray(x_test)
  y_test = np.asarray(y_test).ravel()
  y_hat = [predict(x_train, y_train, x_test[idx_row, :])
           for idx_row in range(x_test.shape[0])]
  print(y_hat)  # show the individual predictions alongside the score
  if len(y_hat) != y_test.shape[0]:
    raise ValueError(
        "y_test has %d labels but x_test has %d rows" % (y_test.shape[0], len(y_hat)))
  acc = np.mean(np.asarray(y_hat) == y_test)
  return acc

In [195]:
# Classify the first test sentence with the hand-written Naive Bayes.
predict(x_train_cleaned_vec, y_train, x_test_cleaned_vec[0])

0

In [203]:
# Scratch check of how `==` behaves between a Python list and a NumPy array:
# the comparison is done elementwise and returns a boolean array.
l1 = [1,2,3]
l2 = [5,2,3]
a1 = np.array(l1)
a2 = np.array(l2)
l1 == a2

array([False,  True,  True])

In [204]:
# True test labels are passed as an integer array (same 1/0 encoding as y_train).
# NOTE(review): "with label encoder" presumably refers to this integer encoding - confirm.
accuracy(x_train_cleaned_vec, y_train, x_test_cleaned_vec, np.array([1,0])) # with label encoder

[0, 0]


0.5