In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrics, accuracy_score, classification_report

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sadakanekazuma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sadakanekazuma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
file_dir = os.path.join("./", "dataset/", "IMDB Dataset.csv")

In [3]:
df = pd.read_csv(file_dir)

In [4]:
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [5]:
df.shape

(50000, 2)

In [6]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df["sentiment"].value_counts() / df["sentiment"].value_counts().sum()

positive    0.5
negative    0.5
Name: sentiment, dtype: float64

In [8]:
df_review = df.copy()

In [10]:
# remove special words
df_review.review = df_review.review.apply(lambda x: re.sub(r"<.*>", "", x))
df_review.review = df_review.review.apply(lambda x: re.sub(r"[^0-9a-zA-Z\s]", "", x))

In [11]:
def remove_stopwords(text):
    stopword_set = set(stopwords.words('english'))
    not_stopwords = [token for token in word_tokenize(text) if token.lower() not in stopword_set]
    return " ".join(not_stopwords)

df_review.review = df_review.review.apply(remove_stopwords)

In [12]:
# stemming
stemmer = PorterStemmer()
df_review.review = df_review.review.apply(lambda x: " ".join([stemmer.stem(token) for token in x.split()]))

In [13]:
X = df_review.review
y = pd.get_dummies(df_review.sentiment)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y.iloc[:,0], test_size=0.2, shuffle=True, random_state = 0)

In [21]:
# BOW
vectorizer = CountVectorizer()
X1_train = vectorizer.fit_transform(X_train).toarray()
X1_test = vectorizer.transform(X_test).toarray()

In [22]:
#TF*IDF
vectorizer = TfidfVectorizer()
X2_train = vectorizer.fit_transform(X_train).toarray()
X2_test = vectorizer.fit_transform(X_test).toarray()

In [23]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X1_train, y_train)
y_pred = model.predict(X1_test)

In [26]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)

In [27]:
confusion_m

array([[4100,  865],
       [ 765, 4270]])

In [28]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [29]:
accuracy

0.837

In [None]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X2_train, y_train)
y_pred = model.predict(X2_test)

In [None]:
confusion_m = confusion_matrix(y_test, y_pred)

In [None]:
confusion_m

In [None]:
accuracy = accuracy_score(y_test, y_pred)

In [None]:
accuracy