In [1]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gensim
from wordcloud import WordCloud
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report as cr

warnings.simplefilter(action = 'ignore', category = FutureWarning)

In [2]:
# Load the preprocessed IMDB review table from the working directory.
dataset = pd.read_csv("IMDB_dataset_preprocessed.csv")
# Optional subsample for quick runs - reduces the 50000 review dataset to 5070 reviews:
# dataset.drop(dataset.tail(44930).index, inplace = True)

# Review texts as a plain Python list, then one token list per review.
reviews = list(dataset["review"])
tokenized_words = list(map(gensim.utils.simple_preprocess, reviews))
# tokenized_words

In [3]:
# Disabled word-cloud visualisation of the review text (kept for reference).
# NOTE(review): this passes the whole "review" column to WordCloud.generate; presumably
# it needs a single string (e.g. " ".join(dataset["review"])) - confirm before re-enabling.
#wordcloud = WordCloud(width = 500, height = 500).generate(dataset["review"])
#plt.figure(figsize = (12, 8), facecolor = 'k')
#plt.imshow(wordcloud)
#plt.show()

In [4]:
# TF-IDF featurizer over word n-grams with English stop words removed.
word_vectorizer = TfidfVectorizer(
    analyzer="word",
    token_pattern=r"\w{1,}",   # a token is any run of one or more word characters
    strip_accents="unicode",
    stop_words="english",
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    max_features=10000,        # cap the vocabulary at 10000 terms
    sublinear_tf=True,         # scale term frequency as 1 + log(tf)
)

In [5]:
# Fit the vectorizer on every review and encode the reviews as a TF-IDF matrix;
# the bare `x` below displays a summary of the result.
# NOTE(review): the vectorizer is fitted on the full "review" column, which includes the
# rows later held out as the test set, so test text influences the learned features.
x = word_vectorizer.fit_transform(dataset['review'])
x

<50000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 4199142 stored elements in Compressed Sparse Row format>

In [6]:
# Binary target: one-hot encode "sentiment" and keep the second indicator column.
# NOTE(review): which label that is depends on get_dummies' column ordering
# (presumably the positive class - confirm against the label names in the CSV).
# dtype is pinned to uint8 because pandas >= 2.0 makes get_dummies return bool columns
# by default, which would change the label dtype and the class names shown in reports.
y = pd.get_dummies(dataset["sentiment"], dtype="uint8").iloc[:, 1].to_numpy()
y

array([1, 1, 1, ..., 0, 0, 0], dtype=uint8)

In [7]:
# Split the sparse TF-IDF matrix first and densify each part afterwards. The classifier
# below is fed dense arrays, but calling x.toarray() before the split keeps the full
# dense matrix alive alongside the train/test copies. The rows in each split are
# unchanged: the shuffle depends only on the number of samples and random_state.
X_train_sparse, X_test_sparse, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
X_train = X_train_sparse.toarray()
X_test = X_test_sparse.toarray()

In [8]:
# Train a Gaussian Naive Bayes classifier on the training features and labels.
Tfidf_NB = GaussianNB().fit(
    X=X_train,
    y=y_train,
)

In [9]:
# Predict a label for every row of the held-out test features.
Tfidf_NB_prediction = Tfidf_NB.predict(X_test)

In [10]:
# Overall accuracy on the test split, followed by the per-class
# precision / recall / F1 table.
score = accuracy_score(y_true=y_test, y_pred=Tfidf_NB_prediction)
print(score, cr(y_test, Tfidf_NB_prediction), sep="\n")

0.8276
              precision    recall  f1-score   support

           0       0.83      0.82      0.83      5035
           1       0.82      0.83      0.83      4965

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

