In [1]:
import kagglehub
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")



In [3]:
csv_path = os.path.join(path, "IMDB Dataset.csv")

df = pd.read_csv(csv_path)
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
#Limpando o texto
df['review'] = (
    df['review']
    .str.lower()
    .str.replace(r"<br\s*/?>", " ", regex=True)
    .str.replace(r"[^a-zA-Z']", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,petter mattei's love in the time of money is a...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [5]:
vocab_size = 10000
max_len = 200

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_len,
    standardize=None
    
)

In [6]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [7]:
df['label'] = df['sentiment'].map({'negative':0, 'positive':1})

In [8]:
x_treino, x_teste, y_treino, y_teste = train_test_split(
    df['review'], df['label'], test_size=0.2, random_state=13
)

In [9]:
vectorizer.adapt(x_treino)
x_treino[:5]

18542    fear of a black hat is a superbly crafted film...
41418    you know a movie is bad when the highlight of ...
14092    i watched this movie when it was released and ...
44522    awful waste of time there is no camp or trash ...
18071    in the real world of art elizabeth wurtzel is ...
Name: review, dtype: object

In [10]:
embedding_dim = 128 #tamanho do vetor que representa as palavras

modelo = tf.keras.Sequential([
    vectorizer,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True), #Serve para ignorar os zeros do veotor
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')

])

In [11]:
modelo.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

historico = modelo.fit(x_treino, y_treino, 
                       validation_data=(x_teste, y_teste),
                       epochs=5)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 93ms/step - accuracy: 0.7447 - loss: 0.5014 - val_accuracy: 0.8025 - val_loss: 0.4398
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 91ms/step - accuracy: 0.8820 - loss: 0.2973 - val_accuracy: 0.8764 - val_loss: 0.2963
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 94ms/step - accuracy: 0.9254 - loss: 0.1935 - val_accuracy: 0.8774 - val_loss: 0.2969
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 94ms/step - accuracy: 0.9559 - loss: 0.1245 - val_accuracy: 0.8696 - val_loss: 0.3744
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 94ms/step - accuracy: 0.9754 - loss: 0.0768 - val_accuracy: 0.8542 - val_loss: 0.4297


In [12]:
loss, acc = modelo.evaluate(x_teste, y_teste)
print(f'Acurácia: {acc}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.8526 - loss: 0.4342
Acurácia: 0.854200005531311


O modelo obteve uma acuracia de 85% em dados que ele nunca viu antes, o que é um valor expressivo