In [10]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [11]:
data = pd.read_csv("movie.csv")
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


# Data preprocessing

Cleaning text data (removing punctuation, turning to lowercase, removing stop words)

In [12]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nehue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nehue\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    words = word_tokenize(text)
    text = ' '.join(word for word in words if word not in stop_words)
    return text


In [14]:
data['text'] = data['text'].apply(preprocess_text)

In [15]:
data.head()

Unnamed: 0,text,label
0,grew b 1965 watching loving thunderbirds mates...,0
1,put movie dvd player sat coke chips expectatio...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movies bor...,0
4,im die hard dads army fan nothing ever change ...,1


# Feature engineering
Converting each sample's text into numerical features that can be used as input for an ML model

## Feature Extraction Technique: **TF-IDF**
**Term frequency - inverse document frecuency**, which highlights the importance of words in a sample in contrast to the collection of all samples, giving higher weights to terms that are frequent within a text sample but rare across the rest of samples

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF vectorizer to convert text data to numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])
y = data['label']

In [19]:
X

<40000x159207 sparse matrix of type '<class 'numpy.float64'>'
	with 4024949 stored elements in Compressed Sparse Row format>

In [20]:
y.head()

0    0
1    0
2    0
3    0
4    1
Name: label, dtype: int64

## Model training

Trying with Logistic Regression, since it's made for binary classification tasks, predicting the probability of an outcome (0 = Negative, 1 = Positive)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)


LogisticRegression()

## Evaluating the model

In [22]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print(classification_report(y_test, y_pred))


Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3966
           1       0.89      0.91      0.90      4034

    accuracy                           0.90      8000
   macro avg       0.90      0.90      0.90      8000
weighted avg       0.90      0.90      0.90      8000



## Predicting with new reviews

In [29]:
new_reviews = [
    "The plot was kind of good I must admit, and loved the action scenes",
    "The movies was great... For a 2 year old, who doesn't differentiate a car from a plane",
    "cars was too good because appear the Rayo Macuin and Franchesco Virgolini, FIAUUUUUUUUU",
    "The good was movie, yeah"
]

new_reviews = [preprocess_text(review) for review in new_reviews]

X_new = vectorizer.transform(new_reviews)

predictions = model.predict(X_new)
print(predictions)


[1 1 1 1]
