# Dataset

In [15]:
import random
import torch
import torch.nn as nn

In [None]:
def generate_synthetic_sentences(
  vocab: list[str],
  num_sentences: int,
  max_sentence_length: int,
  min_sentence_length: int = 3,
  rng: "random.Random | None" = None,
) -> list[str]:
    """Generate random space-separated sentences drawn from ``vocab``.

    Each sentence length is drawn uniformly from
    ``[min_sentence_length, max_sentence_length]`` (both inclusive) and each
    word is sampled from ``vocab`` with replacement.

    Args:
        vocab: Words to sample from.
        num_sentences: How many sentences to produce; values <= 0 yield ``[]``.
        max_sentence_length: Inclusive upper bound on words per sentence.
        min_sentence_length: Inclusive lower bound on words per sentence.
            Defaults to 3, the value that used to be hard-coded.
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` functions (global state) are used,
            which matches the previous behaviour.

    Returns:
        A list of ``num_sentences`` sentences.

    Raises:
        ValueError: If at least one sentence is requested and
            ``min_sentence_length`` exceeds ``max_sentence_length``.
    """
    if num_sentences > 0 and min_sentence_length > max_sentence_length:
        raise ValueError(
            f"min_sentence_length ({min_sentence_length}) must not exceed "
            f"max_sentence_length ({max_sentence_length})"
        )

    # The `random` module exposes the same randint/choices API as a
    # random.Random instance, so either can serve as the source.
    source = random if rng is None else rng

    sentences = []
    for _ in range(num_sentences):
        sentence_length = source.randint(min_sentence_length, max_sentence_length)
        sentences.append(" ".join(source.choices(vocab, k=sentence_length)))
    return sentences

In [4]:
 # Define a vocabulary of words
vocab = [
    'the', 'cat', 'sat', 'on', 'mat', 'dog',
    'lay', 'rug', 'runs', 'quickly', 'jumps', 'over',
]

# Corpus size and the upper bound on words per sentence.
num_sentences = 10000
max_sentence_length = 10

# Build the synthetic corpus by sampling words from the vocabulary.
sentences = generate_synthetic_sentences(vocab, num_sentences, max_sentence_length)

# Preview the first five generated sentences.
for sentence in sentences[:5]:
    print(sentence)

mat rug on on over jumps sat runs mat
the sat dog cat dog on the sat dog
jumps the rug cat quickly runs
lay lay rug over jumps quickly dog quickly the rug
runs sat sat the runs dog


In [5]:
# Forward lookup: word -> integer id (the word's position in `vocab`).
word_to_index = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> word.
index_to_word = dict(enumerate(vocab))

In [6]:
def sentence_to_indices(sentence, mapping=None):
    """Convert a whitespace-separated sentence into a list of word ids.

    Args:
        sentence: Text to encode; it is split on whitespace.
        mapping: Optional word -> id dictionary. When omitted, the
            notebook-level ``word_to_index`` table is used, which matches the
            previous behaviour.

    Returns:
        One integer id per word, in sentence order. An empty or
        whitespace-only sentence yields ``[]``.

    Raises:
        KeyError: If a word is missing from the lookup table.
    """
    lookup = word_to_index if mapping is None else mapping
    return [lookup[word] for word in sentence.split()]

# Encode every generated sentence as a list of word ids.
sequences = list(map(sentence_to_indices, sentences))

In [7]:
# Next-word prediction pairs: every proper, non-empty prefix of a sentence
# becomes an input, and the word right after that prefix is its target.
input_sequences, target_words = [], []

for token_ids in sequences:
    for split_at in range(1, len(token_ids)):
        input_sequences.append(token_ids[:split_at])
        target_words.append(token_ids[split_at])

In [9]:
# Length of the longest prefix; every input is right-padded to this width.
max_len = len(max(input_sequences, key=len))

# NOTE(review): the pad value 0 is also the id that the enumerate-based
# word_to_index table gives to the first vocabulary word, so padded slots
# cannot be told apart from that word downstream -- consider a dedicated
# padding id.
input_sequences = [
    prefix + [0] * (max_len - len(prefix)) for prefix in input_sequences
]

In [11]:
# Padded inputs as a (num_examples, max_len) integer tensor.
input_sequences_tensor = torch.tensor(
    input_sequences,
    dtype=torch.long,
)
# One target word id per example.
target_words_tensor = torch.tensor(
    target_words,
    dtype=torch.long,
)

In [13]:
# Model hyperparameters: width of each embedding vector, and the number of
# distinct word ids (also the number of output classes).
embedding_dim = 10
vocab_size = len(vocab)

# Neural network

In [16]:
class SimpleLM(nn.Module):
    """Bag-of-embeddings next-word model with learned position embeddings.

    Word and position embeddings are summed per token, averaged over the
    sequence dimension, and projected to vocabulary-sized logits.

    Args:
        vocab_size: Number of distinct word ids (also the number of logits).
        embedding_dim: Width of the word and position embedding vectors.
        max_len: Longest sequence length the position table can index.
        padding_idx: Optional id marking padded slots. When given, those
            slots are excluded from the average so the padding amount does
            not change the prediction. ``None`` (default) keeps the plain
            mean over every position.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        max_len,
        padding_idx: "int | None" = None,
    ) -> None:
        super().__init__()
        self.max_len = max_len
        self.padding_idx = padding_idx
        self.word_embeddings = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )
        self.position_embeddings = nn.Embedding(max_len, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return next-word logits for a batch of word-id sequences.

        Args:
            x: Integer tensor of shape ``[batch_size, seq_length]``.

        Returns:
            Float tensor of shape ``[batch_size, vocab_size]``.

        Raises:
            IndexError: If ``seq_length`` exceeds ``max_len``.
        """
        seq_length = x.size(1)
        if seq_length > self.max_len:
            # Fail with a readable message instead of an opaque embedding
            # lookup error.
            raise IndexError(
                f"sequence length {seq_length} exceeds max_len {self.max_len}"
            )

        word_emb = self.word_embeddings(x)
        # word_emb [batch_size, seq_length, emb_dim]
        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0)
        pos_emb = self.position_embeddings(positions)
        # pos_emb [1, seq_length, emb_dim]
        combined_emb = word_emb + pos_emb
        # combined_emb [batch_size, seq_length, emb_dim]

        if self.padding_idx is None:
            pooled = combined_emb.mean(dim=1)
        else:
            # Average only over real tokens; clamp avoids division by zero
            # for rows that consist entirely of padding.
            keep = (x != self.padding_idx).unsqueeze(-1).to(combined_emb.dtype)
            pooled = (combined_emb * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
        # pooled [batch_size, emb_dim]

        output = self.linear(pooled)
        # output [batch_size, vocab_size]
        return output

In [17]:
# Build the model together with its loss function and optimizer.
model = SimpleLM(vocab_size, embedding_dim, max_len)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

num_epochs: int = 10

# Full-batch training: each epoch is a single step over the entire dataset.
for epoch in range(num_epochs):
    # Forward pass and loss.
    outputs = model(input_sequences_tensor)
    loss = criterion(outputs, target_words_tensor)

    # Clear stale gradients, backpropagate, update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss after every epoch.
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}')

Epoch 1/10, Loss: 2.5265
Epoch 2/10, Loss: 2.5253
Epoch 3/10, Loss: 2.5242
Epoch 4/10, Loss: 2.5230
Epoch 5/10, Loss: 2.5220
Epoch 6/10, Loss: 2.5209
Epoch 7/10, Loss: 2.5199
Epoch 8/10, Loss: 2.5189
Epoch 9/10, Loss: 2.5179
Epoch 10/10, Loss: 2.5169


In [18]:
# Export the position-embedding table as a NumPy array.
# detach() drops autograd tracking; cpu() is a no-op for CPU tensors but is
# required before numpy() if the parameters live on another device.
# NOTE(review): for a CPU tensor the resulting array shares memory with the
# parameter, so further training would change it in place -- add .copy() if a
# snapshot is needed.
position_embeddings = model.position_embeddings.weight.detach().cpu().numpy()

In [19]:
# Show the position-embedding matrix: one row per sequence position
# (max_len rows) and one column per embedding dimension.
print(position_embeddings)

[[ 1.1340306   0.11033292  0.22841862  0.11904594  0.03617826  1.5396925
  -0.23890209  0.511333    0.03775401  0.75233614]
 [ 1.2807093  -0.9325126  -1.2030383   0.49914822  0.2410827  -0.98017025
   0.26483223  0.43462896  0.10837775  1.0033818 ]
 [ 0.81870496  1.9024105   0.48907223 -1.08184     0.10827861 -0.34640405
   1.263768    1.6890718   0.5562738  -0.03814454]
 [-0.16376473  0.48563674 -0.13159533  0.29760948 -0.8620853  -0.7046244
   1.950941   -0.24978565  0.5007888  -0.7222857 ]
 [-1.5125744   0.91639096  0.47649866  0.65601915 -0.25429332 -1.3177302
  -0.6660218   0.9739118   1.0674806   0.7716453 ]
 [-1.0245352   2.9435475   1.1478709   1.5670217  -0.7481856   1.0309759
   0.4723779  -0.0290156  -0.5433204  -0.48901293]
 [ 0.21585585 -1.8701496  -0.7799847  -0.08169478 -0.74236745 -0.35431692
  -1.2548198  -1.4870505   0.61008084 -0.96681476]
 [-1.269984   -0.03000763  0.32594803  1.4416723   1.9523418  -1.0127788
  -1.2961525  -2.7342653   0.11808898  1.4207361 ]
 [ 0.