# NNLM WITH LSTM

## preprocessing

In [1]:
from google.colab import files
import shutil
import re
import random

import torch
import torchtext
from nltk.corpus import brown
from collections import Counter


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# # preprocessing done on vs code, then uploaded cleaned corpus to colab

# # Define a function to clean and tokenize the text
# def clean_and_tokenize_text(input_file, output_file):
#     with open(input_file, 'r', encoding='utf-8') as file:
#         text = file.read()

#     # Tokenize the text into sentences
#     sentences = nltk.sent_tokenize(text)

#     # Filter and clean sentences
#     cleaned_sentences = []
#     for sentence in sentences:
#         # Remove extra spaces and tabs, and convert to lowercase
#         cleaned_sentence = ' '.join(sentence.split()).strip().lower()

#         # Check if the sentence contains alphanumeric characters
#         if any(c.isalnum() for c in cleaned_sentence):
#             # Check if the sentence contains the word "Chapter"
#             if "chapter" not in cleaned_sentence:
#                 # Add the cleaned sentence to the list
#                 cleaned_sentences.append(cleaned_sentence)

#     # Write cleaned sentences to the output file
#     with open(output_file, 'w', encoding='utf-8') as file:
#         for sentence in cleaned_sentences:
#             file.write(sentence + '\n')


# # Input and output file paths
# input_file = 'Auguste_Maquet.txt'
# output_file = 'cleaned_auguste_maquet.txt'

# # Clean and tokenize the text
# clean_and_tokenize_text(input_file, output_file)

# print(f'Cleaned sentences have been saved to {output_file}')


In [None]:
# Upload the .txt file to your Google Drive
uploaded = files.upload()


Saving cleaned_auguste_maquet.txt to cleaned_auguste_maquet.txt


In [None]:
# Path to the uploaded file in Colab (use the exact filename you see in the output)
colab_file_path = "/content/cleaned_auguste_maquet.txt"

# Destination directory in Google Drive
drive_directory = "/content/drive/My Drive/"

# Move the file to the specified directory
shutil.move(colab_file_path, drive_directory)


'/content/drive/My Drive/cleaned_auguste_maquet.txt'

In [3]:
# Specify the path to the file in your Google Drive
file_path = "/content/drive/My Drive/cleaned_auguste_maquet.txt"

# Initialize a list to store the sentences
content = []

# Read the file line by line
with open(file_path, "r") as file:
    for line in file:
        # Remove leading and trailing whitespace and append the line as a sentence
        sentence = line.strip()
        content.append(sentence)

# Display the content
# print(content)

In [None]:
print(len(content))

34860


In [None]:
# Remove sentences with less than 5 words
filtered_sentences = [sentence for sentence in content if len(sentence.split()) >= 5]

# Display the filtered content
print(len(filtered_sentences))


32525


In [None]:
def generate_vocabulary(sentences):
    # Initialize an empty Counter to store the word frequencies
    vocabulary = Counter()

    # Tokenize and count words in each sentence
    for sentence in sentences:
        # Tokenize the sentence into words using regex (you can use more sophisticated tokenization methods)
        words = re.findall(r'\b\w+\b', sentence.lower())

        # Update the vocabulary with word counts from this sentence
        vocabulary.update(words)

    # Convert the Counter to a dictionary
    vocabulary_dict = dict(vocabulary)

    return vocabulary_dict


In [None]:
vocab = generate_vocabulary(content)
vocab['UNK'] = len(vocab)
print(len(vocab))

21650


In [4]:
# Split sizes
validation_size = 10000
test_size = 20000

# Randomly shuffle the sentences
# random.shuffle(filtered_content)

# Split the data
validation_set = content[:validation_size]
test_set = content[validation_size:validation_size + test_size]
train_set = content
print(len(validation_set))
print(len(test_set))
print(len(train_set))

10000
20000
34860


## DATA

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import spacy
from torchtext.vocab import GloVe
import torch.nn as nn
import torch.optim as optim
import math


In [6]:
# Tokenize the sentences using spaCy (you can use other tokenizers as well)
nlp = spacy.load("en_core_web_sm")
tokenized_sentences = [token.text for sentence in content for token in nlp(sentence)]

# Flatten the list of tokens
corpus = tokenized_sentences

# Build the vocabulary (mapping from tokens to indices)
vocab = {token: idx for idx, token in enumerate(set(corpus))}


In [7]:
class LMData(Dataset):
    def __init__(self, corpus_tokens, vocab, seq_len):
        self.seq_len = seq_len
        self.vocab = vocab
        self.inputs, self.targets = self.process_corpus(corpus_tokens)

    def process_corpus(self, corpus_tokens):
        # Convert tokens to indices using the vocab
        indices = [self.vocab[token] for token in corpus_tokens]

        # Prepare input and target sequences
        inputs = []
        targets = []
        for i in range(0, len(indices) - self.seq_len, self.seq_len):
            inputs.append(indices[i:i + self.seq_len])
            targets.append(indices[i + self.seq_len])

        return inputs, targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_seq = torch.LongTensor(self.inputs[idx])
        target_token = torch.LongTensor([self.targets[idx]])
        return input_seq, target_token


In [8]:
# Set the sequence length
seq_len = 5

# Create the custom dataset
total_dataset = LMData(corpus, vocab, seq_len)


In [9]:
# Split the dataset into training, validation, and test sets
def create_train_val_test(dataset, split_val, split_test):
    total_size = len(dataset)
    val_size = int(total_size * split_val)
    test_size = int(total_size * split_test)
    train_size = total_size - val_size - test_size
    train_set, val_set, test_set = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size]
    )
    return train_set, val_set, test_set

In [25]:
split_val = 0.1  # Validation set size
split_test = 0.1  # Test set size
train_set, val_set, test_set = create_train_val_test(total_dataset, split_val, split_test)


In [26]:
# Create DataLoaders for training, validation, and test sets
batch_size = 16  # Adjust as needed
trn_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)


In [27]:
# Printing shapes of the first batch from each DataLoader
for name, loader in [("Train", trn_loader), ("Validation", val_loader), ("Test", test_loader)]:
    inputs, targets = next(iter(loader))
    print(f"{name} Input Shape: {inputs.shape}, Target Shape: {targets.shape}")


Train Input Shape: torch.Size([16, 5]), Target Shape: torch.Size([16, 1])
Validation Input Shape: torch.Size([16, 5]), Target Shape: torch.Size([16, 1])
Test Input Shape: torch.Size([16, 5]), Target Shape: torch.Size([16, 1])


In [28]:
# Accessing a sample from the dataset
sample_idx = 0  # Change this index to access different samples
sample_input, sample_target = total_dataset[sample_idx]

# Print the sample
print("Sample Input:", sample_input)
print("Sample Target:", sample_target)


Sample Input: tensor([13190,  7330, 15504, 20406, 10986])
Sample Target: tensor([3129])


## MODEL

In [10]:
global_vectors = GloVe(name='6B', dim=100)


.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:19<00:00, 20102.98it/s]


In [11]:
# Define a function to create the embedding matrix
def create_embedding_matrix(vocab, global_vectors, embedding_dim):
    num_tokens = len(vocab)
    embedding_matrix = torch.randn(num_tokens, embedding_dim)

    for token, idx in vocab.items():
        if token in global_vectors.stoi:
            embedding_matrix[idx] = global_vectors[token]

    # Check if '<unk>' is in vocab; if not, add it
    unk_idx = vocab.get('<unk>')
    if unk_idx is None:
        unk_idx = len(vocab)  # Assign a new index
        vocab['<unk>'] = unk_idx
        embedding_matrix = torch.cat([embedding_matrix, torch.randn(1, embedding_dim)], dim=0)  # Add a random vector for '<unk>'

    return embedding_matrix


In [12]:
# Define hyperparameters
embedding_dim = 100  # Adjust this based on your GLoVe embedding dimension
hidden_dim = 128
output_dim = len(vocab)  # Output dimension is the size of your vocabulary

# Create the embedding matrix
embedding_matrix = create_embedding_matrix(vocab, global_vectors, embedding_dim)


In [20]:

class LanguageModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, sequence_length):
        super(LanguageModel, self).__init__()

        # Define constants for dimensions
        self.embedding_dim = embedding_matrix.shape[1]  # Assuming the embedding matrix has shape (vocab_size, embedding_dim)

        # Define layers
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.hidden_dim = hidden_dim

        self.sequence_length = 5

        # Linear layers
        self.hidden1 = nn.Linear(sequence_length * embedding_dim, hidden_dim)
        self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

        # Activation functions
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, text):
        # Ensure the input tensor's shape matches the expected input shape
        assert text.shape[1] == self.sequence_length

        embedded = self.embedding(text)

        # Infer the input dimension for hidden1 based on embedding output and sequence length
        hidden1_input_dim = self.sequence_length * self.embedding_dim
        embedded = embedded.view(-1, hidden1_input_dim)

        hidden1_out = self.relu(self.hidden1(embedded))
        hidden2_out = self.tanh(self.hidden2(hidden1_out))
        output = self.output_layer(hidden2_out)



        return output


In [31]:
# Initialize and move the model to the appropriate device (CPU or GPU)
model = LanguageModel(embedding_matrix, hidden_dim, output_dim, 5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


LanguageModel(
  (embedding): Embedding(22407, 100)
  (hidden1): Linear(in_features=500, out_features=128, bias=True)
  (hidden2): Linear(in_features=128, out_features=128, bias=True)
  (output_layer): Linear(in_features=128, out_features=22406, bias=True)
  (relu): ReLU()
  (tanh): Tanh()
)

In [22]:
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10  # Adjust as needed
m1_losses = {'train': [], 'val': []}


In [32]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for inputs, targets in trn_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.squeeze())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Calculate and store the average training loss for this epoch
    average_train_loss = train_loss / len(trn_loader)
    m1_losses['train'].append(average_train_loss)

    # Validation loop
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets.squeeze())
            val_loss += loss.item()

    # Calculate and store the average validation loss for this epoch
    average_val_loss = val_loss / len(val_loader)
    m1_losses['val'].append(average_val_loss)

    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {average_train_loss:.4f}, Val Loss: {average_val_loss:.4f}')


Epoch 1/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 2/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 3/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 4/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 5/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 6/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 7/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 8/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 9/10, Train Loss: 10.0197, Val Loss: 10.0198
Epoch 10/10, Train Loss: 10.0197, Val Loss: 10.0198


In [None]:
# Define the path to save the model in your Google Drive
model_path = "/content/drive/My Drive/model_1a.pt"
torch.save(model.state_dict(), model_path)

print(f"Model saved to Google Drive at: {model_path}")

Model saved to Google Drive at: /content/drive/My Drive/model_1a.pt


In [13]:
import os

# Specify the path to your model directory on Google Drive
model_dir = '/content/drive/My Drive'

# Change the current working directory to the model directory
os.chdir(model_dir)


In [23]:
model_path = "/content/drive/My Drive/model_1a.pt"

model = torch.load('model_1a.pt')


## test and perplexity

In [None]:
pip install nltk



In [15]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [16]:
# Tokenize and clean the test sentences
test_tokens = [word_tokenize(sentence.lower()) for sentence in train_set]


In [17]:
def create_ngrams(tokens, n):
    ngrams = []
    for sentence_tokens in tokens:
        for i in range(len(sentence_tokens) - n + 1):
            ngram = sentence_tokens[i:i + n]
            ngrams.append(ngram)
    return ngrams

n = 5  # 5-grams
test_ngrams = create_ngrams(test_tokens, n)


In [18]:
import torch.nn.functional as F

def calculate_perplexity(model, ngrams, vocab):
    perplexity_scores = []
    total_ngrams = len(ngrams)

    for i, ngram in enumerate(ngrams):
        ngram_indices = [vocab.get(token, vocab['<unk>']) for token in ngram]
        input_seq = torch.LongTensor(ngram_indices).unsqueeze(0).to(device)
        with torch.no_grad():
            outputs = model(input_seq)
        probabilities = F.softmax(outputs, dim=1).squeeze()
        log_probabilities = torch.log(probabilities)
        perplexity = torch.exp(-torch.mean(log_probabilities)).item()
        perplexity_scores.append(perplexity)

        # Calculate and print the completion percentage
        completion_percentage = (i + 1) / total_ngrams * 100
        print(f'Completed: {completion_percentage:.2f}%')

    return perplexity_scores


In [33]:
# Calculate perplexity scores for the test set
test_perplexity_scores = calculate_perplexity(model, test_ngrams, vocab)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%
Completed: 99.52%

In [34]:
output_file = '/content/drive/My Drive/perp_1a_train.txt'

with open(output_file, 'w') as f:
    for sentence, perplexity in zip(test_set, test_perplexity_scores):
        f.write(f'Sentence: {sentence}\n')
        f.write(f'Perplexity: {perplexity:.4f}\n\n')


In [35]:
# Calculate and write the average perplexity score
average_perplexity = sum(test_perplexity_scores) / len(test_perplexity_scores)
with open(output_file, 'a') as f:
    f.write(f'Average Perplexity: {average_perplexity:.4f}')
print(average_perplexity)

22508.507849051075
