In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Input, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from nltk.corpus import stopwords
import spacy

In [12]:
# Load the pre-cleaned reviews: `review_text` holds a token list per review and
# `review_rate` the sentiment label (the preview below shows 'negative' / 'positive').
text_cleaned = pd.read_parquet('data_cleaned.parquet')

In [13]:
# Preview the loaded frame (review_text is still a list of tokens at this point).
text_cleaned

Unnamed: 0,review_text,review_rate
0,"[mayor, virtud, película, existencia.el, hecho...",negative
1,"[experto, cinéfilo, ,, poco, vez, tanto, juego...",negative
2,"[si, incondicional, humor, estilo, tele, 5.si,...",negative
3,"[saber, pasar, ,, si, gente, dejar, llevar, mo...",negative
4,"[`, `, amanecer, ,, quedo, solo, ,, sentir, fo...",negative
...,...,...
56362,"[pensar, película, hacer, buen, trabajo, derec...",positive
56363,"[malo, parcela, ,, mal, diálogo, ,, malo, actu...",negative
56364,"[católico, enseñado, escuela, primario, parroq...",negative
56365,"[ir, tener, desacuerdo, comentario, anterior, ...",negative


In [14]:
# Text -> integer-id layer. Only output_mode is set, so tokenisation, standardisation
# and the vocabulary cap are all left at the Keras defaults.
# NOTE(review): with no max_tokens the learned vocabulary reaches 288,998 entries
# (see size_voc further down); consider capping it.
text_vectorizer = TextVectorization(output_mode='int')

In [15]:
# Collapse each review's token list into one space-separated string, which is the
# form TextVectorization consumes.
# The isinstance guard makes this cell idempotent: without it, re-running the cell
# would apply ' '.join to an already-joined string and split it into single
# characters ("mayor" -> "m a y o r").
text_cleaned['review_text'] = text_cleaned['review_text'].map(
    lambda corpus: corpus if isinstance(corpus, str) else ' '.join(corpus)
)

In [16]:
# Learn the vocabulary from the joined review strings.
# NOTE(review): the Series is wrapped in a list; this presumably reaches Keras as a
# single-input structure rather than a (1, n_reviews) batch — confirm against the
# adapt() documentation.
text_vectorizer.adapt([text_cleaned['review_text']])

In [17]:
# Preview again: review_text now holds one space-joined string per review.
text_cleaned

Unnamed: 0,review_text,review_rate
0,mayor virtud película existencia.el hecho pode...,negative
1,"experto cinéfilo , poco vez tanto juego sala c...",negative
2,si incondicional humor estilo tele 5.si termin...,negative
3,"saber pasar , si gente dejar llevar moda , si ...",negative
4,"` ` amanecer , quedo solo , sentir fondoun mar...",negative
...,...,...
56362,pensar película hacer buen trabajo derecha.no ...,positive
56363,"malo parcela , mal diálogo , malo actuación , ...",negative
56364,católico enseñado escuela primario parroquial ...,negative
56365,ir tener desacuerdo comentario anterior lado m...,negative


In [18]:
# Wrap the adapted vectorizer in a tiny model so whole collections of strings can be
# converted to token ids with a single predict() call.
model = Sequential(
    [
        Input(shape=(1,), dtype=tf.string),
        text_vectorizer,
    ],
    name='Text_Vectorizing',
)

# Token ids for every review; rows differ in length until they are padded later on.
text_vectorized = model.predict([text_cleaned['review_text']])


   1/1762 [..............................] - ETA: 1:51



In [19]:
# Sanity check: vectorise a single word to see which integer id it maps to.
model.predict(['loca'])



array([[4533]], dtype=int64)

In [20]:
# Number of entries in the learned vocabulary; used further down to size the
# Embedding layer of the classifier.
size_voc = len(text_vectorizer.get_vocabulary())

In [21]:
# Display the vocabulary size.
size_voc

288998

In [22]:
# Alias for the joined review strings.
# NOTE(review): not referenced by any later cell visible in this notebook.
data_train_texts = text_cleaned['review_text']

In [23]:
# One-hot encode the sentiment labels into a dense (n_reviews, n_classes) array.
# OneHotEncoder needs a 2-D input, hence the reshape to a single column.
rate_labels = np.array(text_cleaned['review_rate']).reshape(-1, 1)
rate_encoder = OneHotEncoder()
y_prediction_rate = rate_encoder.fit_transform(rate_labels).toarray()

In [24]:
# Densify the vectorised reviews into a NumPy matrix, then hold out 20 % of the rows.
# NOTE(review): .to_tensor() assumes predict() returned a RaggedTensor — confirm for
# the installed TensorFlow version.
token_id_matrix = text_vectorized.to_tensor().numpy()
x_train, x_test, y_train, y_test = train_test_split(
    token_id_matrix,
    y_prediction_rate,
    train_size=.8,
    random_state=43,
)

In [25]:
# Fixed number of token ids fed to the classifier per review.
MX_LEN = 100

# Both splits get identical treatment: keep the first MX_LEN ids and zero-fill
# shorter rows at the end.
padding_options = {'maxlen': MX_LEN, 'padding': 'post', 'truncating': 'post'}

x_train_padd = sequence.pad_sequences(x_train, **padding_options)
x_test_padd = sequence.pad_sequences(x_test, **padding_options)


In [26]:
# Reverse lookup table: token id -> vocabulary entry.
words = {
    token_id: token
    for token_id, token in enumerate(text_vectorizer.get_vocabulary())
}

In [27]:
# Sentiment classifier: token ids -> embeddings -> SimpleRNN -> LSTM -> 2-way softmax.
model_text = Sequential()

# input_length is tied to MX_LEN (the padding length used for x_train_padd and
# x_test_padd) instead of a second hard-coded 100, so the two cannot drift apart.
# input_dim keeps the original size_voc + 1, which is why the summary reports
# 288,999 * 200 embedding parameters.
# NOTE(review): get_vocabulary() presumably already counts the padding and OOV
# entries, which would make the +1 a spare row — confirm.
# NOTE(review): mask_zero is not set, so the zero padding is processed like any
# other timestep by the recurrent layers — confirm this is intended.
model_text.add(Embedding(input_dim=size_voc+1, output_dim=200, input_length=MX_LEN))
# return_sequences=True hands the full sequence (not just the last state) to the LSTM.
model_text.add(SimpleRNN(200, return_sequences=True))
model_text.add(LSTM(200))
# Two output units, matching the two one-hot label columns in y_train / y_test.
model_text.add(Dense(2, activation='softmax'))

optimiz = tf.optimizers.RMSprop(learning_rate=0.0001)

model_text.compile(loss='categorical_crossentropy', optimizer=optimiz, metrics=['accuracy'])

In [28]:
# Print the layer-by-layer output shapes and parameter counts of the classifier.
model_text.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 200)          57799800  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100, 200)          80200     
                                                                 
 lstm (LSTM)                 (None, 200)               320800    
                                                                 
 dense (Dense)               (None, 2)                 402       
                                                                 
Total params: 58,201,202
Trainable params: 58,201,202
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Train the classifier. The 20 % hold-out split is passed as validation data, so it
# is evaluated after every epoch (no separate untouched test set is visible here).
# NOTE(review): the recorded output below reports 5 epochs while this call requests
# 2 — the output looks stale; re-run the cell or restore the intended epoch count.
history = model_text.fit(x_train_padd, y_train, batch_size=100, epochs=2, validation_data=(x_test_padd, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
# Small Spanish spaCy pipeline, used here only for its lemmatizer.
lemmer = spacy.load('es_core_news_sm')


def lematizer_text(rows):
    """Return the lemma of every token spaCy finds in the string ``rows``.

    Parameters
    ----------
    rows : str
        Raw text to run through the ``lemmer`` pipeline.

    Returns
    -------
    list of str
        One lemma per token, in document order.
    """
    return [token.lemma_ for token in lemmer(rows)]

In [33]:
# Spanish stop-word list from the NLTK corpus (the 'stopwords' corpus must be
# available locally); clean_stopWords filters tokens against it.
stopwords_spanish = stopwords.words('spanish')
def clean_stopWords(rows, stop_words=None):
    """Return the tokens of ``rows`` that are not stop words.

    Parameters
    ----------
    rows : iterable of str
        Tokens to filter, e.g. the list produced by ``lematizer_text``.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords_spanish``
        list is used, which matches the original call signature.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and casing.
    """
    if stop_words is None:
        stop_words = stopwords_spanish

    # Compare case-insensitively: a sentence-initial "De" or "La" would otherwise
    # slip past a lower-case stop-word list. A set also makes each lookup O(1).
    blocked = {word.lower() for word in stop_words}
    return [word for word in rows if word.lower() not in blocked]

In [34]:

# Score one hand-written review: lemmatise -> drop stop words -> join ->
# vectorise -> pad/truncate to MX_LEN.
# NOTE(review): this presumably mirrors how data_cleaned.parquet was produced — confirm.
probe_review = 'Tan buena que quedé maravillado por su falta de realismo...la ausencia de buen reparto fue muy notorio'
probe_tokens = clean_stopWords(lematizer_text(probe_review))
probe_ids = model.predict([' '.join(probe_tokens)])

# Same padding settings as x_train_padd / x_test_padd, driven by MX_LEN rather than
# a repeated literal 100.
probe_padd = sequence.pad_sequences(probe_ids, maxlen=MX_LEN, padding='post', truncating='post')

# probe_padd is already (1, MX_LEN), the shape the Embedding layer was built for, so
# it is passed as-is. The former reshape to (1, 100, 1) did not match the model's
# input and relied on Keras squeezing the trailing axis.
rest = model_text.predict(probe_padd)

# The branch below treats column 0 as the negative class and column 1 as positive.
if rest[0][0] > rest[0][1]:
    print(rest[0][0], 'Negativo')
else:
    print(rest[0][1], 'Positivo')



0.5389313 Positivo


In [37]:
# Persist the trained classifier to an HDF5 file.
# NOTE(review): only model_text is saved; the TextVectorization layer (held by
# `model`) and its vocabulary are not written by any visible cell, so the saved file
# alone cannot turn raw text into token ids.
model_text.save('RNN_model_prueba.h5')