# Sentiment analysis for entertainment media using Natural Language Processing models
This notebook contains the script used to generate the proposed model for sentiment analysis of IMDb reviews.

In [1]:
# The first step is to be sure that pandas and scikit-learn are installed.
%pip install pandas scikit-learn  

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import make_pipeline

In [3]:
dataset = pd.read_csv("./dataset/IMDB Dataset.csv")
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
positiveReviews = dataset[(dataset["sentiment"] == "positive")]
negativeReviews = dataset[(dataset["sentiment"] == "negative")]

positiveInput = positiveReviews["review"].values
positiveOutput = positiveReviews["sentiment"].values

negativeInput = negativeReviews["review"].values
negativeOutput = negativeReviews["sentiment"].values

pInput_train, pInput_test, pOutput_train, pOutput_test = train_test_split(positiveInput, positiveOutput, train_size = 0.50, random_state = 54321)
nInput_train, nInput_test, nOutput_train, nOutput_test = train_test_split(negativeInput, negativeOutput, train_size = 0.50, random_state = 54321)

In [5]:
input_train = np.concatenate((nInput_train, pInput_train), axis = 0)
output_train = np.concatenate((nOutput_train, pOutput_train), axis = 0)

# Queda pendiente unificar las listas de input y output para pruebas (se realiza hasta que tengamos el producto terminado)
input_test = np.concatenate((nInput_test, pInput_test), axis = 0)
output_test = np.concatenate((nOutput_test, pOutput_test), axis = 0)

print(len(input_train))
print(len(input_test))

25000
25000


In [6]:
vectorizer = TfidfVectorizer()
tfidf_input_train = vectorizer.fit_transform(input_train)
tfidf_input_test = vectorizer.fit_transform(input_test)
print(tfidf_input_test.shape)
print(tfidf_input_train.shape)


(25000, 76609)
(25000, 76597)


In [7]:
model = SVC(kernel="rbf", gamma="auto", max_iter=1000)
# model = LinearSVC().fit(tfidf_input_train, tfidf_output_train)
model.fit(tfidf_input_train, output_train,)



In [8]:
predict = model.predict(tfidf_input_test)
print(predict)

# confusion_matrix(output_test, predict)

ValueError: X has 76609 features, but SVC is expecting 76597 features as input.

In [21]:
clf = make_pipeline(StandardScaler(), SVC(kernel="linear", gamma='auto', max_iter= 2000))
clf.fit(input_train, output_train)



In [23]:
predictionSVC = clf.predict(input_test)
print(predictionSVC)

['negative' 'negative' 'negative' ... 'positive' 'negative' 'positive']


In [48]:
linearTfidf = make_pipeline(TfidfVectorizer(), LinearSVC(max_iter= 4000))
linearTfidf.fit(input_train, output_train)

In [49]:
predictionLinear = linearTfidf.predict(input_test)
print(predictionLinear)

['negative' 'negative' 'negative' ... 'positive' 'negative' 'positive']


In [50]:
confusionMatrix = confusion_matrix(output_test, predictionLinear)
print(confusionMatrix)

[[11123  1377]
 [ 1268 11232]]


In [52]:
VP = confusionMatrix[0][0]
VN =  confusionMatrix[1][1]
FP = confusionMatrix[0][1]
FN = confusionMatrix[1][0]

In [53]:
accuracy =  (VP + VN) / (VP + VN + FP + FN)
precision = VP / (VP + FP)
recall= VP / (VP + FN)
F1 = (2 * (precision) * (recall)) / (precision + recall)

print("Accuracy:", accuracy)
print("precision:", precision)
print("Recall:", recall)
print("F1:", F1)

Accuracy: 0.8942
precision: 0.88984
Recall: 0.8976676620127512
F1: 0.8937366919770198
