# EJERCICIO DE ANALISIS DE SENTIMIENTOS USANDO EMBEDDINGS PREENTRENADOS Y REDES NEURONALES CON TENSORFLOW

In [1]:
!pip install numpy==1.26.4 scipy==1.13.1 gensim

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy==1.13.1
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

# creamos dataset de prueba

In [2]:
# 🔹 Simulación de carga de un dataset (puedes reemplazar por un CSV real)
tweets = [
    'Este producto es una maravilla',
    'No recomiendo comprar esto',
    'Excelente atención y calidad',
    'Muy mala experiencia, pésimo servicio',
    'Totalmente satisfecho con mi compra',
    'No funciona como esperaba',
    'Una compra perfecta y rápida',
    'Decepcionante, esperaba algo mejor'
]

# 1 = positivo, 0 = negativo
labels = [1, 0, 1, 0, 1, 0, 1, 0]

# TOKENIZACION

In [3]:
# 🔹 Tokenización
tokenizer = Tokenizer()
# Fit tokenizer on lowercased tweets to align with Word2Vec vocabulary
tweets_tokenized = [tweet.lower().split() for tweet in tweets]
tokenizer.fit_on_texts([' '.join(tweet) for tweet in tweets_tokenized])
sequences = tokenizer.texts_to_sequences([' '.join(tweet) for tweet in tweets_tokenized])
word_index = tokenizer.word_index
word_index

{'una': 1,
 'no': 2,
 'y': 3,
 'compra': 4,
 'esperaba': 5,
 'este': 6,
 'producto': 7,
 'es': 8,
 'maravilla': 9,
 'recomiendo': 10,
 'comprar': 11,
 'esto': 12,
 'excelente': 13,
 'atención': 14,
 'calidad': 15,
 'muy': 16,
 'mala': 17,
 'experiencia': 18,
 'pésimo': 19,
 'servicio': 20,
 'totalmente': 21,
 'satisfecho': 22,
 'con': 23,
 'mi': 24,
 'funciona': 25,
 'como': 26,
 'perfecta': 27,
 'rápida': 28,
 'decepcionante': 29,
 'algo': 30,
 'mejor': 31}

In [4]:
padded_sequences = pad_sequences(sequences, padding='post')

# ENTRENAR LOS TWEETS TOKENIZADOS CON WORD2VEC

In [5]:
w2v_model = Word2Vec(tweets_tokenized, vector_size=100, window=3, min_count=1, sg=1)

# CREO UNA MATRIZ DE EMBEDDING

In [6]:
vocab_size = len(word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    # Check if the word is in the Word2Vec model's vocabulary before accessing
    if word in w2v_model.wv:
        embedding_vector = w2v_model.wv[word]
        embedding_matrix[i] = embedding_vector

In [7]:
embedding_matrix

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-5.36209613e-04,  2.33797240e-04,  5.10245934e-03, ...,
        -7.04496354e-03,  9.00856685e-04,  6.39314251e-03],
       [ 9.45639622e-05,  3.07731982e-03, -6.81264512e-03, ...,
         5.12590399e-04,  8.21308419e-03, -7.01904064e-03],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 9.73111484e-03, -9.79223382e-03, -6.50790054e-03, ...,
        -2.72329105e-03,  3.82445473e-03,  3.32233176e-04],
       [-8.71688314e-03,  2.10690335e-03, -8.84466222e-04, ...,
        -8.71822610e-03,  2.95548537e-03, -6.67331927e-03]])

# CREAR EL MODELO DE RED NEURONAL CON TENSORFLOW

In [8]:
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=padded_sequences.shape[1],
              trainable=False),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])



# COMPILAR EL MODELO

In [9]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# ENTRENAR EL MODELO

In [10]:
model.fit(padded_sequences, np.array(labels), epochs=30)

Epoch 1/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.5000 - loss: 0.6931
Epoch 2/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.8750 - loss: 0.6922
Epoch 3/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.8750 - loss: 0.6915
Epoch 4/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.8750 - loss: 0.6911
Epoch 5/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8750 - loss: 0.6907
Epoch 6/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 0.6903
Epoch 7/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 0.6900
Epoch 8/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 1.0000 - loss: 0.6898
Epoch 9/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7d2fc7597790>

# PROBAR EL MODELO

In [11]:
# 🔹 Predicción sobre un tweet nuevo
tweet_nuevo = ['pésimo producto, muy decepcionado']
seq_nuevo = tokenizer.texts_to_sequences(tweet_nuevo)
seq_nuevo_padded = pad_sequences(seq_nuevo, maxlen=padded_sequences.shape[1], padding='post')
prediccion = model.predict(seq_nuevo_padded)

print('Probabilidad de ser positivo:', float(prediccion[0]))
print('Sentimiento:', 'Positivo' if prediccion[0] > 0.5 else 'Negativo')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Probabilidad de ser positivo: 0.49868544936180115
Sentimiento: Negativo


  print('Probabilidad de ser positivo:', float(prediccion[0]))
