In [56]:
# !pip install gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [57]:
corpus = [
    "I love machine learning",
    "Word2Vec is a great algorithm",
    "Implementing word2vec is fun and educational"
]

In [58]:
simple_preprocess('Word2Vec is a great algorithm')

['word', 'vec', 'is', 'great', 'algorithm']

In [59]:
# Preprocessing the text
processed_corpus = [simple_preprocess(doc) for doc in corpus]

In [60]:
processed_corpus

[['love', 'machine', 'learning'],
 ['word', 'vec', 'is', 'great', 'algorithm'],
 ['implementing', 'word', 'vec', 'is', 'fun', 'and', 'educational']]

In [61]:
model = Word2Vec(sentences=processed_corpus, vector_size=100, window=2, min_count=1, workers=4)

In [62]:
model.wv['fun']

array([-8.7274825e-03,  2.1301615e-03, -8.7354420e-04, -9.3190884e-03,
       -9.4281426e-03, -1.4107180e-03,  4.4324086e-03,  3.7040710e-03,
       -6.4986930e-03, -6.8730675e-03, -4.9994122e-03, -2.2868442e-03,
       -7.2502876e-03, -9.6033178e-03, -2.7436293e-03, -8.3628409e-03,
       -6.0388758e-03, -5.6709289e-03, -2.3441375e-03, -1.7069972e-03,
       -8.9569986e-03, -7.3519943e-04,  8.1525063e-03,  7.6904297e-03,
       -7.2061159e-03, -3.6668312e-03,  3.1185520e-03, -9.5707225e-03,
        1.4764392e-03,  6.5244664e-03,  5.7464195e-03, -8.7630618e-03,
       -4.5171441e-03, -8.1401607e-03,  4.5956374e-05,  9.2636338e-03,
        5.9733056e-03,  5.0673080e-03,  5.0610625e-03, -3.2429171e-03,
        9.5521836e-03, -7.3564244e-03, -7.2703874e-03, -2.2653891e-03,
       -7.7856064e-04, -3.2161034e-03, -5.9258583e-04,  7.4888230e-03,
       -6.9751858e-04, -1.6249407e-03,  2.7443992e-03, -8.3591007e-03,
        7.8558037e-03,  8.5361041e-03, -9.5840869e-03,  2.4462664e-03,
      

In [17]:
# Get the vector for a word
vector = model.wv['fun']
print(len(vector))

100


In [38]:
model.wv.index_to_key

['is',
 'vec',
 'word',
 'educational',
 'and',
 'fun',
 'implementing',
 'algorithm',
 'great',
 'learning',
 'machine',
 'love']

In [18]:
# Example: Find similar words
similar_words = model.wv.most_similar('fun')
print(similar_words)

[('love', 0.16694684326648712), ('and', 0.13887687027454376), ('educational', 0.13149002194404602), ('word', 0.06408979743719101), ('great', 0.06059185042977333), ('machine', 0.020000355318188667), ('implementing', 0.019154027104377747), ('vec', 0.009391160681843758), ('algorithm', -0.05774581432342529), ('is', -0.059874895960092545)]


# Doc2Vec

In [22]:
tagged_corpus = [TaggedDocument(words=simple_preprocess(doc), tags=[str(i)]) for i, doc in enumerate(corpus)]

In [73]:
tagged_corpus

[TaggedDocument(words=['love', 'machine', 'learning'], tags=['0']),
 TaggedDocument(words=['word', 'vec', 'is', 'great', 'algorithm'], tags=['1']),
 TaggedDocument(words=['implementing', 'word', 'vec', 'is', 'fun', 'and', 'educational'], tags=['2'])]

In [74]:
# for k in enumerate(['a','b','c']):
#   print(k)

In [75]:
# {n:m for m,n in enumerate(['a','b','c'])}

In [23]:
tagged_corpus

[TaggedDocument(words=['love', 'machine', 'learning'], tags=['0']),
 TaggedDocument(words=['word', 'vec', 'is', 'great', 'algorithm'], tags=['1']),
 TaggedDocument(words=['implementing', 'word', 'vec', 'is', 'fun', 'and', 'educational'], tags=['2'])]

In [24]:
# Training the Doc2Vec model
mod = Doc2Vec(tagged_corpus, vector_size=100, window=2, min_count=1, workers=4, epochs=20)

In [25]:
vector = mod.dv['0']
print(vector)

[-0.00527819 -0.00601931 -0.00989545  0.00857172  0.00359562  0.00025288
 -0.00988777 -0.00518546 -0.00973477  0.00202757  0.00281696  0.0046634
 -0.00433183 -0.00317033 -0.00306658 -0.00873361  0.00216362  0.0092466
 -0.00952865 -0.00346199 -0.0037968   0.00259918 -0.00569514  0.00265455
  0.00579866 -0.00812233 -0.00836511 -0.0099757   0.00493463 -0.00914691
  0.00585652  0.00680242 -0.00650747 -0.00455145 -0.00126637  0.00166305
 -0.00150028 -0.00857375 -0.00361711  0.00172257 -0.00202982 -0.00723293
  0.00421299 -0.00860138  0.00270574 -0.00462646  0.00064875 -0.00203858
  0.00541401 -0.00805321 -0.00214881 -0.00010938 -0.00666505 -0.00655978
 -0.00195023  0.00885407 -0.00125152  0.00356975 -0.0057673   0.00884441
  0.00294264  0.00933568  0.00438536 -0.00421386  0.00224783 -0.00441896
  0.0058108   0.00185461 -0.00227713 -0.0058888  -0.00807288 -0.00085071
 -0.00896562 -0.00923251 -0.00794214  0.00217329 -0.00653946 -0.00782066
  0.00212299  0.00204655  0.00836188  0.00470029 -0.0

In [29]:
# Find similar documents
mod.dv.most_similar('1')

[('0', 0.164995938539505), ('2', -0.043281614780426025)]

In [39]:
mod.dv.index_to_key

['0', '1', '2']

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim

In [41]:
from torch.utils.data import DataLoader, TensorDataset

In [42]:
# Creating sequences of word embeddings
sequences = []
for sentence in processed_corpus:
    embeddings = [model.wv[word] for word in sentence if word in model.wv]
    sequences.append(torch.tensor(embeddings, dtype=torch.float))

  sequences.append(torch.tensor(embeddings, dtype=torch.float))


In [44]:
len(sequences)

3

In [45]:
# Padding sequences to the same length
padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)

In [47]:
padded_sequences.shape

torch.Size([3, 7, 100])

In [48]:
sequence_lengths = torch.tensor([len(seq) for seq in sequences])

In [49]:
sequence_lengths

tensor([3, 5, 7])

In [50]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        packed_input = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.rnn(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        return self.fc(hidden.squeeze(0))

In [51]:
input_size = 100  # Same as the vector_size of Word2Vec
hidden_size = 50
output_size = 2  # Example output size (e.g., binary classification)

In [52]:
model = RNNModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
dataset = TensorDataset(padded_sequences, sequence_lengths)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [None]:

for epoch in range(10):
    for batch in data_loader:
        padded_sequences, sequence_lengths = batch
        labels = torch.randint(0, output_size, (padded_sequences.size(0),))  # Dummy labels for example purposes

        optimizer.zero_grad()
        outputs = model(padded_sequences, sequence_lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from torch.utils.data import DataLoader, TensorDataset

# Sample text data
corpus = [
    "I love machine learning",
    "Word2Vec is a great algorithm",
    "Implementing word2vec is fun and educational"
]

# Preprocessing the text
processed_corpus = [simple_preprocess(doc) for doc in corpus]

# Training the Word2Vec model using CBOW
w2v_model = Word2Vec(sentences=processed_corpus, vector_size=100, window=4, min_count=1, workers=4, sg=0)

# Creating sequences of word embeddings for next word prediction
sequences = []
targets = []
for sentence in processed_corpus:
    embeddings = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv]
    for i in range(len(embeddings) - 1):
        sequences.append(torch.tensor(embeddings[:i+1], dtype=torch.float))
        targets.append(torch.tensor(embeddings[i+1], dtype=torch.float))

# Padding sequences to the same length
padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
targets = torch.stack(targets)

# Create DataLoader
dataset = TensorDataset(padded_sequences, targets)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define the RNN Model for next word prediction
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        packed_input = nn.utils.rnn.pack_padded_sequence(x, x.size(1), batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.rnn(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        return self.fc(output[:, -1, :])

# Initialize the RNN Model
input_size = 100  # Same as the vector_size of Word2Vec
hidden_size = 50

model = RNNModel(input_size, hidden_size)
criterion = nn.MSELoss()  # Mean Squared Error Loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the RNN Model
num_epochs = 10
for epoch in range(num_epochs):
    for batch in data_loader:
        padded_sequences, targets = batch

        optimizer.zero_grad()
        outputs = model(padded_sequences)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")



# Example: Predict the next word embedding for a given sequence
with torch.no_grad():
    input_sequence = torch.tensor([w2v_model.wv[word] for word in simple_preprocess("I love machine")], dtype=torch.float).unsqueeze(0)
    padded_input_sequence = nn.utils.rnn.pad_sequence([input_sequence], batch_first=True)
    next_word_embedding = model(padded_input_sequence)
    print(next_word_embedding)
