# Word2Vec con Gensim

## Instalación de librerías y carga de dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')
from gensim.models import Word2Vec
import pandas as pd
import re
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_short, stem_text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

-------------------------

In [2]:
from datasets import load_dataset

dataset_corpus = load_dataset("large_spanish_corpus", "ParaCrawl")

In [3]:
dataset_corpus

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15510649
    })
})

In [4]:
subset = dataset_corpus['train'].select(range(1000000))

In [5]:
subset[0:2]

{'text': ['lavado de cerebro a través de los medios de comunicación, y amenaza de fuerza a través de los militares.',
  'Sin un constante aluvión de doble cañón, requiriendo la complicidad de los seres humanos para reprimir y engañar a sus semejantes, su tan cacareada magia rápidamente se desvanecería y se disiparía.']}

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## Pre-procesamiento de texto

In [7]:
def clean_text(sentence_batch):
    # extrae el texto de la entrada
    text_list = sentence_batch['text']

    cleaned_text_list = []
    for text in text_list:
        # Convierte el texto a minúsculas
        text = text.lower()

        # Elimina URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Elimina las menciones @ y '#' de las redes sociales
        text = re.sub(r'\@\w+|\#\w+', '', text)

        # Elimina los caracteres de puntuación
        text = strip_punctuation(text)

        # Elimina los números
        text = strip_numeric(text)

        # Elimina las palabras cortas
        text = strip_short(text,minsize=2)

        # Elimina las palabras comunes (stop words)
        stop_words = set(stopwords.words('spanish'))
        word_tokens = word_tokenize(text)
        filtered_text = [word for word in word_tokens if word not in stop_words]

        cleaned_text_list.append(filtered_text)

    # Devuelve el texto limpio
    return {'text': cleaned_text_list}

In [8]:
sentences_corpus = subset.map(clean_text, batched=True)

Map: 100%|██████████| 1000000/1000000 [05:11<00:00, 3212.85 examples/s]


In [10]:
sentences_corpus['text'][:2]

[['lavado',
  'cerebro',
  'través',
  'medios',
  'comunicación',
  'amenaza',
  'fuerza',
  'través',
  'militares'],
 ['constante',
  'aluvión',
  'doble',
  'cañón',
  'requiriendo',
  'complicidad',
  'seres',
  'humanos',
  'reprimir',
  'engañar',
  'semejantes',
  'tan',
  'cacareada',
  'magia',
  'rápidamente',
  'desvanecería',
  'disiparía']]

## Carga y uso de modelo de embeddings Word2Vec

In [11]:
model = Word2Vec(sentences_corpus['text'], vector_size=100, window=5, min_count= 2, workers=6, sg=1)

# Podemos guardar el modelo para uso futuro
model.save("word2vec.model")

In [12]:
model.wv['rey']

array([ 0.4405885 ,  0.25182328,  0.38889658, -0.1550255 ,  0.6978569 ,
        0.02721099, -0.11090719,  0.14892736, -0.81173   , -0.08095737,
       -0.5561422 , -0.52682006,  0.14487427,  0.5237141 , -0.6440865 ,
        0.289733  , -0.09736057, -0.87007046, -0.40572667, -0.16130371,
        0.34690636,  0.44136423,  0.75687504, -0.3299259 ,  0.39223567,
        0.97858685, -0.46213353, -0.52474207, -0.58636725, -0.4360571 ,
       -0.10583293,  0.05380669,  0.15072323, -0.23158605, -0.1715245 ,
        0.5548093 , -0.8663069 , -0.3198255 ,  0.07742387, -0.99232715,
       -0.08115203, -0.9155559 ,  0.13184567,  0.05092776, -0.13803643,
       -0.08724967, -0.42716852, -0.25924748,  0.13926835, -0.2918576 ,
       -0.43774974, -0.8254771 ,  0.5409307 ,  0.1063757 , -0.52943933,
       -0.42738485, -0.2035856 , -0.27531886, -0.12316383,  0.05332975,
        0.38656646, -0.29523233,  0.26849824, -0.10316229,  0.42634213,
        0.61328214, -0.05494862, -0.00392009, -0.2797938 , -0.37

In [13]:
##comida, ser, reina, television
model.wv.most_similar(['television'],topn=3)

[('gsm', 0.878962516784668),
 ('entertainment', 0.8598889112472534),
 ('supren', 0.8541260957717896)]

In [14]:
word_vectors = model.wv
vectors = word_vectors.vectors
words = word_vectors.index_to_key

## Almacenamiento de embeddings

In [15]:
df_vectors = pd.DataFrame(vectors)
df_vectors.to_csv('C:/Users/nicol/Documents/Proyectos/Embeddings/embeddings.tsv',sep='\t', index=False)

In [16]:
df_words = pd.DataFrame(words)
df_words.to_csv('C:/Users/nicol/Documents/Proyectos/Embeddings/labels.tsv',sep='\t', index=False)