In [1]:
import re
from collections import Counter

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

Предобработка данных

In [2]:
def prepare_data(text, window_size=2):
	"""Tokenize ``text`` and build skip-gram (center, context) training pairs.

	The text is lower-cased first and only then stripped of every character
	other than ``a-z``, ``@``, ``#`` and space, so capitalised words keep
	their first letter ("The" -> "the" rather than "he").

	Args:
		text: Raw input text.
		window_size: Number of neighbours taken on each side of the center word.

	Returns:
		A tuple ``(data, word_to_ix, vocab_size)``: ``data`` is a list of
		``(center_word, context_word)`` string pairs, ``word_to_ix`` maps each
		unique word to its index, ``vocab_size`` is the number of unique words.
	"""
	# Lower-case BEFORE filtering: the pattern keeps only a-z, so filtering
	# first would silently delete every upper-case letter.
	text = re.sub(r'[^a-z@# ]', '', text.lower())
	# Split into words
	tokens = text.split()
	# Sorted so that index assignment is reproducible between interpreter
	# runs (plain set iteration order depends on string hashing).
	vocab = sorted(set(tokens))
	# Map every word to its position in the vocabulary
	word_to_ix = {word: i for i, word in enumerate(vocab)}
	# Pair each word with every neighbour inside the window
	data = []
	for i, center in enumerate(tokens):
		first = max(0, i - window_size)
		last = min(len(tokens), i + window_size + 1)
		for j in range(first, last):
			if i != j:
				data.append((center, tokens[j]))
	return data, word_to_ix, len(vocab)

# Skip-Gram

Класс данных

In [3]:
class SkipGramModelDataset(Dataset):
	"""Dataset of (center, context) vocabulary-index pairs for skip-gram training."""

	def __init__(self, data, word_to_ix):
		# Translate every (center, context) word pair into indices once, up front.
		index_pairs = []
		for center, context in data:
			index_pairs.append((word_to_ix[center], word_to_ix[context]))
		self.data = index_pairs

	def __len__(self):
		return len(self.data)

	def __getitem__(self, idx):
		# Each element is returned as a pair of int64 tensors.
		entry = self.data[idx]
		center_tensor = torch.tensor(entry[0], dtype=torch.long)
		context_tensor = torch.tensor(entry[1], dtype=torch.long)
		return center_tensor, context_tensor

Структура модели

In [4]:
class Word2VecSkipGramModel(nn.Module):
	"""Skip-gram network: embedding lookup -> linear layer -> log-softmax over the vocabulary."""

	def __init__(self, vocab_size, embedding_dim):
		super().__init__()
		# Input-side word vectors; these are what gets exported after training.
		self.embeddings = nn.Embedding(vocab_size, embedding_dim)
		# Projects an embedding onto one score per vocabulary word.
		self.out_layer = nn.Linear(embedding_dim, vocab_size)
		# Log-probabilities along the last (vocabulary) dimension.
		self.activation_function = nn.LogSoftmax(dim=-1)

	def forward(self, center_word_idx):
		"""Return log-probabilities over the vocabulary for the given center-word indices."""
		embedded = self.embeddings(center_word_idx)
		scores = self.out_layer(embedded)
		return self.activation_function(scores)

Обучение модели

In [5]:
def train_model(data, word_to_ix, vocab_size, embedding_dim=50, epochs=10, batch_size=1, lr=0.05):
	"""Train a skip-gram model on (center, context) word pairs.

	Args:
		data: Sequence of ``(center_word, context_word)`` string pairs.
		word_to_ix: Mapping from word to vocabulary index.
		vocab_size: Number of distinct words (size of the output layer).
		embedding_dim: Length of each word vector.
		epochs: Number of passes over the data.
		batch_size: Number of pairs per optimisation step.
		lr: SGD learning rate; defaults to the previously hard-coded 0.05.

	Returns:
		The trained ``Word2VecSkipGramModel``.
	"""
	# Build the dataset and a shuffling loader over it
	dataset = SkipGramModelDataset(data, word_to_ix)
	dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
	# Model
	model = Word2VecSkipGramModel(vocab_size, embedding_dim)
	# The model emits log-probabilities, so negative log-likelihood is the matching loss
	loss_function = nn.NLLLoss()
	# Optimizer
	optimizer = torch.optim.SGD(model.parameters(), lr=lr)

	for epoch in range(epochs):
		# Sum of the per-batch (mean) losses over the epoch
		total_loss = 0
		for center_word, context_word in dataloader:
			optimizer.zero_grad()
			log_probs = model(center_word)
			loss = loss_function(log_probs, context_word)
			loss.backward()
			optimizer.step()
			total_loss += loss.item()
		print(f'Epoch {epoch + 1}, Loss: {total_loss}')
	return model

In [10]:
# Основная функция для вызова
def train(data: str, window_size: int = 2, embedding_dim: int = 10, epochs: int = 5, batch_size: int = 2):
	"""Train skip-gram embeddings on ``data`` and return them as a dictionary.

	NOTE: a later cell in this notebook (CBOW section) defines another
	function that is also called ``train``; once that cell has run, this
	definition is shadowed.

	Args:
		data: Raw training text.
		window_size: Context window size passed to ``prepare_data``.
		embedding_dim: Length of each word vector.
		epochs: Number of training epochs.
		batch_size: Batch size used by the data loader.

	Returns:
		Dict mapping each vocabulary word to its embedding (NumPy array row).
	"""
	# Preprocessing: (center, context) pairs plus the vocabulary mapping
	ngramm_data, word_to_ix, vocab_size = prepare_data(data, window_size)
	# Build and train the model
	model = train_model(ngramm_data, word_to_ix, vocab_size, embedding_dim, epochs, batch_size)

	# Extract the word vectors; detach() is the supported replacement for .data
	embeddings = model.embeddings.weight.detach().cpu().numpy()
	# Build the word -> vector dictionary, ordered by vocabulary index
	ix_to_word = {i: word for word, i in word_to_ix.items()}
	w2v_dict = {ix_to_word[ix]: embeddings[ix] for ix in range(vocab_size)}
	return w2v_dict

In [11]:
# Test data: a short English passage describing the skip-gram model
test_text = 'Captures Semantic Relationships: The skip-gram model effectively captures semantic relationships between words. It learns word embeddings that encode similar meanings and associations, allowing for tasks like word analogies and similarity calculations. Handles Rare Words: The skip-gram model performs well even with rare words or words with limited occurrences in the training data. It can generate meaningful representations for such words by leveraging the context in which they appear. Contextual Flexibility: The skip-gram model allows for flexible context definitions by using a window around each target word. This flexibility captures local and global word associations, resulting in richer semantic representations. Scalability: The skip-gram model can be trained efficiently on large-scale datasets due to its simplicity and parallelization potential. It can process vast amounts of text data to generate high-quality word embeddings.'

# Train on the sample text; the result maps each word to its embedding vector
w2v_dict = train(test_text)

Epoch 1, Loss: 3617.666100502014
Epoch 2, Loss: 3207.677946448326
Epoch 3, Loss: 3012.711859345436
Epoch 4, Loss: 2860.186472296715
Epoch 5, Loss: 2734.4632539749146
Epoch 6, Loss: 2638.899858355522
Epoch 7, Loss: 2538.966754436493
Epoch 8, Loss: 2465.6429120898247
Epoch 9, Loss: 2404.75001847744
Epoch 10, Loss: 2352.578776985407
Epoch 11, Loss: 2316.4191286563873
Epoch 12, Loss: 2283.9881920814514
Epoch 13, Loss: 2258.787660598755
Epoch 14, Loss: 2242.6005405187607
Epoch 15, Loss: 2214.313904672861
Epoch 16, Loss: 2193.749487325549
Epoch 17, Loss: 2188.7517686486244
Epoch 18, Loss: 2179.3011078238487
Epoch 19, Loss: 2165.4264918863773
Epoch 20, Loss: 2155.266880095005
Epoch 21, Loss: 2145.6149440407753
Epoch 22, Loss: 2142.541223704815
Epoch 23, Loss: 2139.577820658684
Epoch 24, Loss: 2122.148221999407
Epoch 25, Loss: 2119.5181475281715
Epoch 26, Loss: 2118.0613654851913
Epoch 27, Loss: 2117.4751784205437
Epoch 28, Loss: 2114.4236257076263
Epoch 29, Loss: 2103.0918547809124
Epoch 30, 

# CBOW

In [27]:
def prepare_data_cbow(text: str, window_size=2):
	"""Tokenize ``text`` and build CBOW (context, target) training examples.

	The text is lower-cased, stripped of every character other than ``a-z``,
	``@``, ``#`` and space, and split on whitespace. Only positions that have
	a full window on both sides become targets, so the first and last
	``window_size`` tokens appear in contexts only.

	Args:
		text: Raw input text.
		window_size: Number of context words taken on each side of the target.

	Returns:
		A tuple ``(data, word_to_ix, vocab_size)``: ``data`` is a list of
		``(context_words, target_word)`` pairs where ``context_words`` holds
		``2 * window_size`` words (left neighbours nearest-first, then right
		neighbours nearest-first), ``word_to_ix`` maps each unique word to its
		index, ``vocab_size`` is the number of unique words.
	"""
	text = re.sub(r'[^a-z@# ]', '', text.lower())
	tokens = text.split()

	# Sorted so that index assignment is reproducible between interpreter
	# runs (plain set iteration order depends on string hashing).
	vocab = sorted(set(tokens))
	word_to_ix = {word: i for i, word in enumerate(vocab)}

	data = []
	for i in range(window_size, len(tokens) - window_size):
		left = [tokens[i - j - 1] for j in range(window_size)]
		right = [tokens[i + j + 1] for j in range(window_size)]
		data.append((left + right, tokens[i]))
	return data, word_to_ix, len(vocab)

class CBOWDataset(Dataset):
	"""Dataset of (context indices, target index) examples for CBOW training."""

	def __init__(self, data, word_to_ix):
		# Parallel lists: contexts[k] is the list of context indices for targets[k].
		self.contexts = []
		self.targets = []
		for context_words, target_word in data:
			self.contexts.append([word_to_ix[w] for w in context_words])
			self.targets.append(word_to_ix[target_word])

	def __len__(self):
		return len(self.targets)

	def __getitem__(self, idx):
		# Context and center word are returned as a pair of int64 tensors.
		context_tensor = torch.tensor(self.contexts[idx], dtype=torch.long)
		target_tensor = torch.tensor(self.targets[idx], dtype=torch.long)
		return context_tensor, target_tensor

Изменение архитектуры модели

In [28]:
class Word2VecCBOWModel(nn.Module):
	"""CBOW network: averaged context embeddings -> linear layer -> log-softmax."""

	def __init__(self, vocab_size, embedding_dim):
		super().__init__()
		# Word vectors; these are what gets exported after training.
		self.embeddings = nn.Embedding(vocab_size, embedding_dim)
		# Projects the averaged embedding onto one score per vocabulary word.
		self.out_layer = nn.Linear(embedding_dim, vocab_size)
		# Log-probabilities along dimension 1.
		self.activation_function = nn.LogSoftmax(dim=1)

	def forward(self, center_word_idx):
		"""Return log-probabilities computed from the mean (over dim 1) of the looked-up embeddings."""
		embedded = self.embeddings(center_word_idx)
		averaged = embedded.mean(dim=1)
		scores = self.out_layer(averaged)
		return self.activation_function(scores)

Обновление функции обучения

In [29]:
def train_model_cbow(data, word_to_ix, vocab_size, embedding_dim=50, epochs=10, batch_size=1, lr=0.001):
	"""Train a CBOW model on (context, target) examples.

	Args:
		data: Sequence of ``(context_words, target_word)`` pairs.
		word_to_ix: Mapping from word to vocabulary index.
		vocab_size: Number of distinct words (size of the output layer).
		embedding_dim: Length of each word vector.
		epochs: Number of passes over the data.
		batch_size: Number of examples per optimisation step.
		lr: SGD learning rate; defaults to the previously hard-coded 0.001.
			Pass a larger value if the printed loss barely decreases.

	Returns:
		The trained ``Word2VecCBOWModel``.
	"""
	dataset = CBOWDataset(data, word_to_ix)
	dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

	model = Word2VecCBOWModel(vocab_size, embedding_dim)
	# The model emits log-probabilities, so negative log-likelihood is the matching loss
	loss_function = nn.NLLLoss()
	optimizer = torch.optim.SGD(model.parameters(), lr=lr)

	for epoch in range(epochs):
		# Sum of the per-batch (mean) losses over the epoch
		total_loss = 0
		for context_words, target_word in dataloader:
			optimizer.zero_grad()
			log_probs = model(context_words)
			loss = loss_function(log_probs, target_word)
			loss.backward()
			optimizer.step()
			total_loss += loss.item()
		print(f'Epoch {epoch + 1}, Loss: {total_loss}')
	return model

In [30]:
def train(data: str, window_size: int = 2, embedding_dim: int = 10, epochs: int = 5, batch_size: int = 2):
	"""Train CBOW embeddings on ``data`` and return them as a dictionary.

	NOTE: this redefines the ``train`` function from the skip-gram section
	earlier in the notebook; after this cell runs, ``train`` refers to the
	CBOW pipeline.

	Args:
		data: Raw training text.
		window_size: Context window size passed to ``prepare_data_cbow``.
		embedding_dim: Length of each word vector.
		epochs: Number of training epochs.
		batch_size: Batch size used by the data loader.

	Returns:
		Dict mapping each vocabulary word to its embedding (NumPy array row).
	"""
	ngramm_data, word_to_ix, vocab_size = prepare_data_cbow(data, window_size)
	model = train_model_cbow(ngramm_data, word_to_ix, vocab_size, embedding_dim, epochs, batch_size)

	# Extract the word vectors; detach() is the supported replacement for .data
	embeddings = model.embeddings.weight.detach().cpu().numpy()
	# Build the word -> vector dictionary, ordered by vocabulary index
	ix_to_word = {i: word for word, i in word_to_ix.items()}
	w2v_dict = {ix_to_word[ix]: embeddings[ix] for ix in range(vocab_size)}
	return w2v_dict

In [31]:
# Sample text for the CBOW run: a short English passage describing the skip-gram model
test_text = 'Captures Semantic Relationships: The skip-gram model effectively captures semantic relationships between words. It learns word embeddings that encode similar meanings and associations, allowing for tasks like word analogies and similarity calculations. Handles Rare Words: The skip-gram model performs well even with rare words or words with limited occurrences in the training data. It can generate meaningful representations for such words by leveraging the context in which they appear. Contextual Flexibility: The skip-gram model allows for flexible context definitions by using a window around each target word. This flexibility captures local and global word associations, resulting in richer semantic representations. Scalability: The skip-gram model can be trained efficiently on large-scale datasets due to its simplicity and parallelization potential. It can process vast amounts of text data to generate high-quality word embeddings.'

# Train on the sample text; the result maps each word to its embedding vector
w2v_dict = train(test_text)

Epoch 1, Loss: 282.7689628601074
Epoch 2, Loss: 282.6620945930481
Epoch 3, Loss: 282.55592036247253
Epoch 4, Loss: 282.4492983818054
Epoch 5, Loss: 282.34387159347534
