In [1]:
!pip install numpy==1.26.4 scipy==1.13.1 gensim

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy==1.13.1
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [1]:
from gensim.models import Word2Vec

# Datos de ejemplo (lista de oraciones tokenizadas)

In [2]:

# Toy corpus: each entry is one sentence that has already been split into tokens.
# Word2Vec expects an iterable of token lists, so every tuple is turned into a list.
sentences = [
    list(tokens)
    for tokens in (
        ('gato', 'juega', 'con', 'bola'),
        ('perro', 'ladra', 'al', 'cartero'),
        ('pájaro', 'vuela', 'en', 'el', 'cielo'),
        ('gato', 'duerme', 'en', 'el', 'sofá'),
    )
]

# Entrenando modelo Word2Vec

In [3]:

# Train a skip-gram Word2Vec model on the toy corpus.
# The seed is passed explicitly (1 is also gensim's default) and training is limited
# to a single worker thread: gensim documents that multi-threaded training introduces
# ordering jitter, so both settings are needed for reproducible vectors.
# NOTE: gensim's docs also mention PYTHONHASHSEED for reproducibility across
# interpreter launches.
model_w2v = Word2Vec(
    sentences,
    vector_size=50,  # dimensionality of each word vector
    window=3,        # max distance between the current word and a context word
    min_count=1,     # keep every word, even those that appear only once
    sg=1,            # 1 = skip-gram, 0 = CBOW
    seed=1,
    workers=1,
)

## Parámetros importantes:

- **vector_size=50**:  
  👉 Esto indica que **cada palabra** será representada como un **vector de 50 dimensiones**.  

- **window=3**:  
  👉 La **"ventana de contexto"** es 3; es decir, al aprender relaciones entre palabras, se consideran hasta **3 palabras antes y después**.

- **min_count=1**:  
  👉 Una palabra debe aparecer al menos **1 vez** para ser incluida en el modelo (en este caso, todas entran).

- **sg=1**:  
  👉 Indica que se usará el modelo **Skip-Gram**.  
  Si fuera `sg=0`, usaría **CBOW (Continuous Bag of Words)**.


# Obtener embedding de una palabra

In [4]:
# Fetch the trained embedding for the token 'gato' from the model's keyed vectors
# and print it as plain text.
vector_gato = model_w2v.wv.get_vector('gato')
print(vector_gato)

[-0.01723938  0.00733148  0.01037977  0.01148388  0.01493384 -0.01233535
  0.00221123  0.01209456 -0.0056801  -0.01234705 -0.00082045 -0.0167379
 -0.01120002  0.01420908  0.00670508  0.01445134  0.01360049  0.01506148
 -0.00757831 -0.00112361  0.00469675 -0.00903806  0.01677746 -0.01971633
  0.01352928  0.00582883 -0.00986566  0.00879638 -0.00347915  0.01342277
  0.0199297  -0.00872489 -0.00119868 -0.01139127  0.00770164  0.00557325
  0.01378215  0.01220219  0.01907699  0.01854683  0.01579614 -0.01397901
 -0.01831173 -0.00071151 -0.00619968  0.01578863  0.01187715 -0.00309133
  0.00302193  0.00358008]


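# Buscar palabras similares

> **Nota:** con un corpus de solo 4 oraciones el modelo apenas aprende; las similitudes que se muestran a continuación son prácticamente ruido y solo ilustran el uso de la API, no deben interpretarse semánticamente.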

In [8]:
# Rank the vocabulary by cosine similarity to 'cielo' and show the top 10 entries.
model_w2v.wv.most_similar(
    positive=['cielo'],
    topn=10,
)

[('con', 0.3065578043460846),
 ('al', 0.22442302107810974),
 ('perro', 0.1977294683456421),
 ('vuela', 0.18339458107948303),
 ('duerme', 0.10232102125883102),
 ('bola', 0.10159841924905777),
 ('juega', 0.08190789073705673),
 ('pájaro', 0.029594380408525467),
 ('en', 0.01830098405480385),
 ('el', 0.01243123784661293)]

## 📌 1. GloVe con Gensim

In [9]:
import gensim.downloader as api

In [10]:
# Load the pretrained GloVe embeddings through gensim's downloader API.
# The model name suggests Wikipedia + Gigaword training data and 50-dimensional vectors.
glove_vectors = api.load(name='glove-wiki-gigaword-50')



In [12]:
# Inspect the GloVe vector for 'cat' (the English word for 'gato').
vector_cat = glove_vectors.get_vector('cat')
vector_cat

array([ 0.45281 , -0.50108 , -0.53714 , -0.015697,  0.22191 ,  0.54602 ,
       -0.67301 , -0.6891  ,  0.63493 , -0.19726 ,  0.33685 ,  0.7735  ,
        0.90094 ,  0.38488 ,  0.38367 ,  0.2657  , -0.08057 ,  0.61089 ,
       -1.2894  , -0.22313 , -0.61578 ,  0.21697 ,  0.35614 ,  0.44499 ,
        0.60885 , -1.1633  , -1.1579  ,  0.36118 ,  0.10466 , -0.78325 ,
        1.4352  ,  0.18629 , -0.26112 ,  0.83275 , -0.23123 ,  0.32481 ,
        0.14485 , -0.44552 ,  0.33497 , -0.95946 , -0.097479,  0.48138 ,
       -0.43352 ,  0.69455 ,  0.91043 , -0.28173 ,  0.41637 , -1.2609  ,
        0.71278 ,  0.23782 ], dtype=float32)

In [14]:
# Show the 10 words whose GloVe vectors are closest (cosine similarity) to 'cat'.
glove_vectors.most_similar(
    positive=['cat'],
    topn=10,
)

[('dog', 0.9218006134033203),
 ('rabbit', 0.8487821221351624),
 ('monkey', 0.8041081428527832),
 ('rat', 0.7891963124275208),
 ('cats', 0.7865270972251892),
 ('snake', 0.7798910737037659),
 ('dogs', 0.7795814871788025),
 ('pet', 0.7792249917984009),
 ('mouse', 0.773166835308075),
 ('bite', 0.7728800177574158)]