In [1]:
%config IPCompleter.use_jedi = False

### Import Libraries

In [2]:
import pandas as pd
import numpy as np

### Import dataset

In [3]:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

In [4]:
train.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [5]:
test.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


### x_train, y_train, x_test

In [6]:
x_train = train.values[:,0]
y_train = train.values[:,-1]

In [7]:
print(x_train.shape)
print(y_train.shape)

(40000,)
(40000,)


In [8]:
x_test = test.values[:,0]

In [9]:
print(x_test.shape)

(10000,)


In [10]:
x_train[0]

"mature intelligent and highly charged melodrama unbelivebly filmed in China in 1948. wei wei's stunning performance as the catylast in a love triangle is simply stunning if you have the oppurunity to see this magnificent film take it"

## Bag of words pipeline

### Tokenisation

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [12]:
def tokenise(x_train):
    words_in_documents = []
    for document in x_train:
        sentences = (sent_tokenize(document))
        words = []
        for sentence in sentences:
            temp = word_tokenize(sentence)
            final_words = [x for x in temp if x.isalnum()]
            words.extend(final_words)
        words_in_documents.append(words)
    return words_in_documents

In [13]:
words_in_documents = tokenise(x_train)

In [14]:
len(words_in_documents)

40000

In [15]:
words_in_test_documents = tokenise(x_test)

### Stopword Removal

In [16]:
from nltk.corpus import stopwords

In [17]:
sw = set(stopwords.words('english'))

In [18]:
'not' in sw

True

In [19]:
sw.discard('not')

In [20]:
'not' in sw

False

In [21]:
sw.add("br")

In [22]:
sw.add("http")

In [23]:
def remove_stopwards(words_in_documents, sws):
    final_words_in_documents = []
    for text in words_in_documents:
        useful_words = [w.lower() for w in text if w.lower() not in sws]
        final_words_in_documents.append(useful_words)
    return final_words_in_documents

In [24]:
words_in_documents = remove_stopwards(words_in_documents, sw)

In [25]:
len(words_in_documents)

40000

In [26]:
words_in_test_documents = remove_stopwards(words_in_test_documents, sw)

### Stemming

In [27]:
from nltk.stem.snowball import PorterStemmer

In [28]:
ps = PorterStemmer()

In [29]:
def stemming(stemmer, documents):
    final_documents = []
    for document in documents:
        useful_words = [stemmer(w) for w in document if not w.isdigit()]
        final_documents.append(useful_words)
    return final_documents

In [30]:
words_in_documents = stemming(ps.stem,words_in_documents)

In [31]:
len(words_in_documents)

40000

In [32]:
words_in_test_documents = stemming(ps.stem,words_in_test_documents)

### Multinomial Naive Bayes Classification

In [33]:
N = x_train.shape[0]
N

40000

In [34]:
Npos = np.count_nonzero(y_train=='pos')
Nneg = np.count_nonzero(y_train=='neg')
print(Npos+Nneg)

40000


In [35]:
P_pos = Npos/N
P_neg = Nneg/N

In [36]:
def vocab_size(documents):
    vocab = set()
    count=0
    for document in documents:
        for w in document:
            if w not in vocab:
                vocab.add(w)
                count+=1
    return count
vocab = vocab_size(words_in_documents)

In [37]:
d_pos = {}
d_neg = {}
for i in range(N):
    if y_train[i]=="pos":
        for w in words_in_documents[i]:
            d_pos[w] = d_pos.get(w,0)+1
    else:
        for w in words_in_documents[i]:
            d_neg[w] = d_neg.get(w,0)+1

In [38]:
count_words_pos = 0
count_words_neg = 0
temp_pos = set()
temp_neg = set()
for i in range(N):
    if y_train[i]=="pos":
        for word in words_in_documents[i]:
            if word not in temp_pos:
                temp_pos.add(word)
                count_words_pos+=1
    else:
        for word in words_in_documents[i]:
            if word not in temp_neg:
                temp_neg.add(word)
                count_words_neg+=1

In [39]:
def posterior(X,c):
    temp = set()
    countc=0
    L = 1
    if c=="pos":
        countc = count_words_pos
        for w in X:
            countwc = d_pos.get(w,0)
            num = countwc+1
            denom = countc+vocab
            L*=100*num/denom
        Pc = P_pos
    else:
        countc = count_words_neg
        for w in X:
            countwc = d_neg.get(w,0)
            num = countwc+1
            denom = countc+vocab
            L*=100*num/denom
        Pc = P_neg
    return L*Pc

In [40]:
posterior(words_in_documents[1233],"pos")

8.307628363622563e-21

In [41]:
posterior(words_in_documents[1233],"neg")

0.0005877058373650559

In [42]:
y_train[1233]

'neg'

In [43]:
predicted = []
for document in words_in_test_documents:
    if posterior(document,"pos")>=posterior(document,"neg"):
        predicted.append("pos")
    else:
        predicted.append("neg")

In [44]:
Id = [int(i) for i in range(len(words_in_test_documents))]
d = {"Id":Id, "label":predicted}

In [45]:
result = pd.DataFrame.from_dict(d)

In [46]:
result.to_csv("Result.csv", index=False)

### Accuracy on Training Data

In [47]:
predicted = []
for document in words_in_documents:
    if posterior(document,"pos")>=posterior(document,"neg"):
        predicted.append("pos")
    else:
        predicted.append("neg")

In [48]:
predicted = np.array(predicted)

In [49]:
accuracy = np.count_nonzero(predicted==y_train)/y_train.shape[0]

In [50]:
print(accuracy)

0.890525
