In [None]:
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Read the IMDB dataset CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='datasets/IMDB-Dataset.csv')
df.head(n=5)

In [None]:
def preprocessor(text):
    """Normalize a raw review string before vectorization.

    Steps, in order:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` before
         punctuation is stripped.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, space-separated and with the ``-``
         "nose" removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by any emoticons that were found.
    """
    # Raw strings keep the regex escapes (\), \(, \W) from being read as
    # invalid Python string escapes; the patterns themselves are unchanged.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # If the text ends in a word character there is no trailing space, so add
    # one; otherwise the first emoticon would be glued onto the last word.
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

# Clean every review with preprocessor, overwrite the column, then preview.
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews
df.head(n=5)

In [None]:
# Prepare a reproducible 70/30 train/test split.

SEED = 42  # fixed seed so the shuffle is identical on every run
TRAIN_FRACTION = 0.7

num_samples = len(df['review'])
num_train = int(num_samples * TRAIN_FRACTION)
# Use the remainder instead of int(num_samples * 0.3): truncating both sizes
# independently can leave a sample assigned to neither split.
num_test = num_samples - num_train
# Shuffled row positions 0 .. num_samples - 1.
random_indices = np.random.default_rng(SEED).permutation(num_samples)

In [None]:
# training and testing datasets

# Slice the shuffled positions once. The test slice is computed from the
# front of the array rather than with [-num_test:], because a [-0:] slice
# would select every row when num_test == 0.
train_idx = random_indices[:num_train]
test_idx = random_indices[len(random_indices) - num_test:]

# random_indices holds row positions, so select with .iloc; .loc would look
# the same numbers up as index labels.
train_rows = df.iloc[train_idx]
test_rows = df.iloc[test_idx]

X_train = train_rows['review'].values
y_train = train_rows['sentiment'].values
X_test = test_rows['review'].values
y_test = test_rows['sentiment'].values

In [None]:
# Report the shape of each split; the parts are printed space-separated.
shape_report = (
    "X_train : ", X_train.shape,
    " X_test : ", X_test.shape,
    "\ny_train : ", y_train.shape,
    " y_test : ", y_test.shape,
)
print(*shape_report)

In [None]:
# Chain TF-IDF vectorization with a cross-validated logistic regression and
# fit the whole pipeline on the training reviews.
# NOTE(review): max_iter=10 is a small iteration budget; confirm it is intentional.

tfidf = TfidfVectorizer()
classifier = LogisticRegressionCV(max_iter=10)
pipeline_steps = [('tfidf', tfidf), ('clf', classifier)]
clf = Pipeline(steps=pipeline_steps)
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test reviews with the fitted pipeline.

y_pred = clf.predict(X=X_test)

In [None]:
# Display the confusion matrix of true vs. predicted test labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Print the text classification report for the test predictions.

report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
# Overall accuracy of the test predictions on the IMDB dataset.

accuracy_score(y_true=y_test, y_pred=y_pred)