In [101]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import os
import textract
from docx import Document

import pandas as pd

In [74]:
def extract_text(file_path):
    file_name, file_ext = os.path.splitext(file_path)
    if file_ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    elif file_ext == '.docx':
        doc = Document(file_path)
        text = []
        for para in doc.paragraphs:
            text.append(para.text)
        text = '\n'.join(text)
    else:
        raise ValueError(f'Unsupported file extension: {file_ext}. Only .txt and .docx are supported.')

    return text

In [75]:
df= pd.read_csv('../Data/Datasets/IMDB Dataset SPANISH.csv')

df= df.drop(['Unnamed: 0', 'review_es', 'sentimiento'], axis= 1)

df

Unnamed: 0,review_en,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [76]:
tokenizer= Tokenizer(num_words= 10000)

tokenizer.fit_on_texts(df['review_en'])

sequences= tokenizer.texts_to_sequences(df['review_en'])

max_length= 100

padded_sequences= pad_sequences(sequences, maxlen= max_length)


In [77]:
padded_sequences

array([[ 122,  209, 3237, ...,  124, 4102,  485],
       [   1, 2291,  174, ..., 1975,   68,  220],
       [ 219,  233, 2916, ...,   62,   15,  349],
       ...,
       [  48,   29,   85, ...,    2,    2, 6043],
       [8457, 5545,    2, ...,   66,  738,   40],
       [   4,    1,  115, ...,  792,   10,   16]])

In [78]:
df['review_en']= [padded_sequences[x] for x in padded_sequences]

df

Unnamed: 0,review_en,sentiment
0,"[[2234, 163, 13, 839, 3656, 4914, 1976, 649, 5...",positive
1,"[[1, 2291, 174, 3334, 95, 21, 66, 370, 62, 1, ...",positive
2,"[[4, 37, 146, 89, 67, 957, 11, 1575, 7071, 12,...",positive
3,"[[390, 35, 171, 26, 2842, 406, 13, 68, 10, 456...",negative
4,"[[627, 835, 1, 128, 49, 27, 1, 2767, 8, 1818, ...",positive
...,...,...
49995,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",positive
49996,"[[1, 16, 6, 319, 72, 90, 62, 10, 185, 419, 145...",negative
49997,"[[7365, 2818, 856, 5, 1190, 312, 1, 18, 6, 20,...",negative
49998,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",negative


In [79]:
embedding_layer = Embedding(input_dim=10000, output_dim=128, input_length=max_length)
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')
pooling_layer = GlobalMaxPooling1D()

model = Sequential([
    embedding_layer,
    conv_layer,
    pooling_layer,
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compilar el modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# early_stopping = EarlyStopping(monitor= 'val_loss', patience= 3)

# Resumen del modelo
model.summary()



In [80]:
sent_dict= {'positive': 1,
            'negative': 0}

df['sentiment']= df['sentiment'].map(sent_dict)

In [81]:
x_train= padded_sequences
y_train= df['sentiment']

model.fit(x_train, y_train, epochs=5, batch_size=2)

Epoch 1/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 3ms/step - accuracy: 0.7806 - loss: 0.4450
Epoch 2/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 3ms/step - accuracy: 0.9015 - loss: 0.2559
Epoch 3/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 4ms/step - accuracy: 0.9418 - loss: 0.1624
Epoch 4/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 6ms/step - accuracy: 0.9730 - loss: 0.0808
Epoch 5/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 5ms/step - accuracy: 0.9851 - loss: 0.0443


<keras.src.callbacks.history.History at 0x2b0809ec890>

In [91]:
text= ['my house is red and i like it', 'she is not really happy']

In [92]:
sequences= tokenizer.texts_to_sequences(text)

max_length= 30

padded_sequences= pad_sequences(sequences, maxlen= max_length)

In [93]:
model.predict(padded_sequences)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


array([[0.70909524],
       [0.00213531]], dtype=float32)

In [98]:
loss, accuracy = model.evaluate(x_train, y_train)

print(f'test loss: {loss:.2f}. test accuracy: {accuracy:.2f}')

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9963 - loss: 0.0113
test loss: 0.01. test accuracy: 1.00


In [102]:
# pickle.dump(tokenizer, open('../tokenizador/Tokenizer.pkl', 'wb'))
# pickle.dump(model, open('../saved_models/Modelo de prediccion de sentimiento en texto.sav', 'wb'))