In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string

In [2]:
try:
  import nltk
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from gensim.models import Word2Vec
except:
  !pip install nltk
  !pip install gensim

In [None]:
# Load the data
# https://www.kaggle.com/datasets/stutisehgal/movie-review-csv?resource=download
data = pd.read_csv('movie_reviews.csv')
data.head()

Unnamed: 0,review,sentiment
0,This is one of those unfortunate films that su...,1
1,Okay maybe it was because I happen to be in Ya...,1
2,"Although I love this movie, I can barely watch...",1
3,"A man arrives in a strange, beautiful, sterile...",1
4,I'm sitting around going through movie listing...,1


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((79,), (20,), (79,), (20,))

In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [9]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
# Preprocess the text data
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

X_train = X_train.apply(preprocess)
X_test  = X_test.apply(preprocess)

In [13]:
# Train the Word2Vec model
sentences = [sentence.split() for sentence in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

In [21]:
sentences[0]

['nazarin',
 'directed',
 'luis',
 'bunuel',
 'presents',
 'extraordinary',
 'view',
 'religion',
 'mexico',
 'written',
 'director',
 'julio',
 'alejandro',
 'notable',
 'collaborator',
 'film',
 'put',
 'mexican',
 'cinema',
 'international',
 'map',
 'receiving',
 'grand',
 'prix',
 'cannes',
 'year',
 'disturbing',
 'film',
 'mr',
 'bunuel',
 'delves',
 'deep',
 'whats',
 'wrong',
 'churchbr',
 'br',
 'nazarin',
 'reckoning',
 'saint',
 'young',
 'priest',
 'seen',
 'living',
 'life',
 'poverty',
 'seedy',
 'pension',
 'city',
 'doesnt',
 'enough',
 'doesnt',
 'mind',
 'parting',
 'coin',
 'beggar',
 'appears',
 'window',
 'asking',
 'help',
 'time',
 'takes',
 'small',
 'room',
 'prostitute',
 'hurt',
 'fight',
 'another',
 'woman',
 'andara',
 'woman',
 'repays',
 'kindness',
 'burning',
 'room',
 'whole',
 'building',
 'nazarin',
 'seen',
 'taking',
 'countryside',
 'begging',
 'food',
 'andara',
 'beatriz',
 'two',
 'prostitutes',
 'old',
 'town',
 'follow',
 'nazarins',
 'life

In [None]:
# Vectorize the text data
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test  = np.array([vectorize(sentence) for sentence in X_test])

In [23]:
# Train a classification model
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [24]:
# Evaluate the model
y_pred = clf.predict(X_test)

In [29]:
print('Accuracy:' , accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label=1))
print('Recall:'   , recall_score(y_test, y_pred, pos_label=1))
print('F1-score:' , f1_score(y_test, y_pred, pos_label=1))

Accuracy: 0.6
Precision: 0.6
Recall: 1.0
F1-score: 0.75
