## Load dataset

In [25]:
import numpy as np 
import pandas as pd

train = pd.read_csv('../data/labeledTrainData.tsv.zip', delimiter="\t")
test = pd.read_csv("../data/testData.tsv.zip",delimiter= "\t")

train = train.drop(['id'], axis=1)

In [26]:
train['word_n'] = train['review'].apply(lambda x : len(x.split(' ')))
test["word_n"] = test["review"].apply(lambda x : len(x.split(" ")))

## Data Preprocessing

In [27]:
import re
import json

Using regrex library, we can remove the **html** tags easily from the sentiments

In [28]:
TAG_RE = re.compile(r'<[^>]+>')

In [29]:
train['review']=train['review'].apply(lambda x:TAG_RE.sub('', x))
test['review']=test['review'].apply(lambda x: TAG_RE.sub('', x))

In [30]:
train['review']=train['review'].apply(lambda x: re.sub("[^a-zA-Z]"," ",x))
test['review']=test['review'].apply(lambda x: re.sub("[^a-zA-Z]"," ",x))

In [31]:
train['word_n_2'] = train['review'].apply(lambda x : len(x.split(' ')))
test['word_n_2'] = test['review'].apply(lambda x : len(x.split(' ')))

In [32]:
# from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
# lemmatizer = WordNetLemmatizer()

In [33]:
train["review"]=train['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [34]:
test["review"]=test['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [35]:
train["review"].str.find("?").value_counts()

-1    25000
Name: review, dtype: int64

In [36]:
test1=test.drop(["word_n","word_n_2","id"],axis=1)

In [37]:
X=train.drop(["word_n","word_n_2","sentiment"],axis=1)

In [38]:
Y=train.drop(["word_n","word_n_2","review"],axis=1)

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(train["review"])

In [40]:
text_counts

<25000x73903 sparse matrix of type '<class 'numpy.int64'>'
	with 2163574 stored elements in Compressed Sparse Row format>

## Vectorization

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(train['review'])

In [42]:
text_tf

<25000x74188 sparse matrix of type '<class 'numpy.float64'>'
	with 2582646 stored elements in Compressed Sparse Row format>

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_tf,Y, test_size=0.2, random_state=42)

In [44]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20000, 74188), (5000, 74188), (20000, 1), (5000, 1))

## Model

In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train.values.ravel())
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted)*100)


MultinomialNB Accuracy: 86.26
