<a href="https://colab.research.google.com/github/ImHungry48/Next-Word-Prediction/blob/main/Next_Word_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
from random import choices
from enum import Enum

class SentenceMutator():
  """Injects keyboard-style typos into words and sentences.

  Each character of a word is independently mutated with probability
  ``typo_chance``. A mutation is one of: swapping with an adjacent
  character, substituting the character, deleting it, inserting an extra
  character before it, or flipping its case.

  The helper methods (``swap``, ``substitute``, ``delete``, ``insert``,
  ``case``) all operate on ``self.word_as_list``, the working copy of the
  word that ``mutate`` is currently processing.
  """

  class TYPO_TYPE(Enum):
      SWAP = 1
      SUBSTITUTE = 2
      DELETE = 3
      INSERT = 4
      CASE = 5

  # Relative likelihood of each typo type, in TYPO_TYPE declaration order
  TYPO_WEIGHTS = (3, 4, 3, 4, 1)

  # Pool used when a typo character is not drawn from neighboring keys
  RANDOM_CHARS = 'abcdefghijklmnopqrstuvwxyz!-=;,.\'\\?'

  # Chance that a substituted/inserted character is a neighboring key
  NEARBY_CHANCE = 0.8

  # Marker stored in word_as_list for a deleted character (soft deletion)
  DELETED = -1

  def __init__(self, texts, typo_chance=0.005):
    """
    Args:
        texts: Unused; accepted only so existing callers keep working.
        typo_chance(float): The chance of an individual character in a
            word being mutated.
    """
    # The chance of an individual character in a word being mutated
    self.typo_chance = typo_chance

    # All neighboring keys on a standard QWERTY keyboard
    self.neighboring_keys = {
        'q': 'was',
        'w': 'qeasd',
        'e': 'wrsdf',
        'r': 'etdfg',
        't': 'ryfgh',
        'y': 'tughj',
        'u': 'yihjk',
        'i': 'uojkl',
        'o': 'ipkl',
        'p': 'ol',
        'a': 'qwszx',
        's': 'qwesz',
        'd': 'wersfc',
        'f': 'ertdgcv',
        'g': 'rtyfhvb',
        'h': 'tyugjbn',
        'j': 'yuihknm',
        'k': 'uiojlm',
        'l': 'iopk',
        'z': 'asx',
        'x': 'sdzc',
        'c': 'dfxv',
        'v': 'fgcb',
        'b': 'vghn',
        'n': 'bhjm',
        'm': 'njk',
        '.': 'l;/\',',
        '?': ';\'.',
        '!': '12@wq`',
        '-': '=[p0]'
      }

    # Working copy of the word currently being mutated
    self.word_as_list = []

  def mutate_sentence(self, sentence):
    """ Mutates an entire sentence by swapping, substituting, deleting, inserting,
        or changing the case of characters in the sentence's words.

    Args:
        sentence(str): The sentence to mutate
    Returns:
        mutated_sentence(str): The sentence after undergoing mutations. Words are
        re-joined with single spaces, so runs of whitespace are collapsed.
    """
    # Split on whitespace, mutate every word, and stitch the sentence back together
    return ' '.join(self.mutate(word) for word in sentence.split())

  def get_neighboring_keys(self, char, is_upper):
    """ Returns one random key adjacent to ``char`` on a QWERTY keyboard.

    Args:
        char(str): Lower-case character to look up.
        is_upper(bool): Whether the returned neighbor should be upper-cased.
    Raises:
        KeyError: If ``char`` has no entry in the neighboring-key map.
    """
    neighbor = random.choice(self.neighboring_keys[char])
    return neighbor.upper() if is_upper else neighbor

  def _pick_typo_char(self, char):
    """ Picks the character used for a substitution or insertion near ``char``. """
    # Characters without a keyboard-map entry (digits, commas, ...) always
    # fall back to the random pool instead of raising KeyError
    has_neighbors = char.lower() in self.neighboring_keys

    if has_neighbors and random.random() < self.NEARBY_CHANCE:
      return self.get_neighboring_keys(char.lower(), char.isupper())

    return random.choice(self.RANDOM_CHARS)

  def swap(self, char_index, word):
    """ Swaps the character at ``char_index`` with the one after it (or with
        the one before it when ``char_index`` is the last index of ``word``).
    """
    # A single character has nothing to swap with
    if len(word) < 2:
      return

    # Determine the index of the character being swapped with
    index_to_swap = char_index + 1

    if char_index == len(word) - 1: # If the last letter
      index_to_swap = len(word) - 2

    # Perform the swap
    chars = self.word_as_list
    chars[char_index], chars[index_to_swap] = chars[index_to_swap], chars[char_index]

  def substitute(self, char, char_index):
    """ Replaces the character at ``char_index`` with a typo character chosen
        relative to ``char`` (80% neighboring key, 20% random).
    """
    self.word_as_list[char_index] = self._pick_typo_char(char)

  def delete(self, char_index):
    """ Soft-deletes the character at ``char_index``; the marker is dropped
        when the word is re-assembled so the other indices stay valid.
    """
    self.word_as_list[char_index] = self.DELETED

  def insert(self, char_index, char):
    """ Inserts a typo character chosen relative to ``char`` (80% neighboring
        key, 20% random) in front of position ``char_index``.
    """
    self.word_as_list.insert(char_index, self._pick_typo_char(char))

  def case(self, char, char_index):
    """ Stores ``char`` with its case flipped at ``char_index``. """
    self.word_as_list[char_index] = char.lower() if char.isupper() else char.upper()

  def mutate(self, word):
    """ Mutates a single word by swapping, substituting, deleting, inserting,
        or changing the case of characters.

    Args:
        word(str): The word to mutate.
    Returns:
        mutated_word(str): The word after undergoing mutations. Value does not change
        if no mutations occur
    """
    # Make the word a list for easier manipulation; the helpers edit this list
    self.word_as_list = list(word)

    typo_types = list(self.TYPO_TYPE)

    # Swapping and deleting need at least two characters
    has_multiple_chars = len(word) >= 2

    # Walk from the last character to the first: an insertion only shifts
    # positions at or after the insertion point, so the indices that are
    # still to be visited remain valid.
    for char_index in range(len(word) - 1, -1, -1):
      if random.random() >= self.typo_chance:
        continue

      # Use the character currently in the slot (a swap at the last index may
      # already have moved a different character here)
      char = self.word_as_list[char_index]

      # Select a random typo type
      typo_type = random.choices(typo_types, weights=self.TYPO_WEIGHTS)[0]

      # Implement the randomly selected typo type
      if typo_type is self.TYPO_TYPE.SWAP:
        if has_multiple_chars:
          self.swap(char_index, word)

      elif typo_type is self.TYPO_TYPE.SUBSTITUTE:
        self.substitute(char, char_index)

      elif typo_type is self.TYPO_TYPE.DELETE:
        if has_multiple_chars:
          self.delete(char_index)

      elif typo_type is self.TYPO_TYPE.INSERT:
        self.insert(char_index, char)

      elif typo_type is self.TYPO_TYPE.CASE:
        self.case(char, char_index)

    # Drop the soft-deletion markers while re-assembling the word
    return ''.join(c for c in self.word_as_list if isinstance(c, str))

In [None]:
import torch
from nltk.tokenize import sent_tokenize
from torch.utils.data import Dataset

class SentenceDataset(Dataset):
  """Dataset of tokenized sentences, optionally augmented with typos.

  Every text is split into sentences with ``sent_tokenize``; each sentence is
  encoded with the supplied tokenizer (truncated to ``max_length`` tokens)
  and stored as a list of token ids.
  """

  def __init__(self, texts, tokenizer, max_length, augmentation=False, typo_chance=0.005):
    """
    Args:
        texts: Iterable of raw text strings.
        tokenizer: Object whose ``encode(text, max_length=..., truncation=True)``
            returns a list of token ids.
        max_length(int): Maximum number of tokens kept per sentence.
        augmentation(bool): When True, every sentence is passed through the
            typo mutator before it is tokenized.
        typo_chance(float): Per-character mutation chance handed to the mutator.
    """
    self.tokenizer = tokenizer
    # Keep the limit on the instance so tokenization does not depend on a
    # module-level variable of the same name
    self.max_length = max_length
    self.tokenized_sentences = []
    self.augmentation = augmentation
    self.mutator = SentenceMutator(texts, typo_chance)

    # Initialize the sentences in the dataset
    self.init_sentences(texts, typo_chance)

  def __len__(self):
    return len(self.tokenized_sentences)

  def __getitem__(self, idx):
    # Token ids are indices into an embedding table, so pin the integer dtype
    return torch.tensor(self.tokenized_sentences[idx], dtype=torch.long)

  def init_sentences(self, texts, typo_chance):
    """ Tokenizes every sentence of ``texts`` into ``self.tokenized_sentences``.

    Args:
        texts: Iterable of raw text strings.
        typo_chance(float): Unused here (the mutator was already configured
            with it); kept so existing callers keep working.
    """
    for text in texts:
      for sentence in sent_tokenize(text):
        if self.augmentation:
          # Mutate this sentence only, via the mutator's sentence-level API
          sentence = self.mutator.mutate_sentence(sentence)

        sentence_tokens = self.tokenizer.encode(
            sentence, max_length=self.max_length, truncation=True)

        # An empty encoding carries no training signal
        if sentence_tokens:
          self.tokenized_sentences.append(sentence_tokens)
        

In [None]:
import torch.nn as nn
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to token embeddings.

    Expects batch-first input of shape (batch, seq_len, d_model). Even
    embedding channels receive sin(pos / 10000^(2i / d_model)) and odd
    channels the matching cosine; dropout is applied to the sum.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """
        Args:
            d_model(int): Embedding dimension.
            dropout(float): Dropout probability applied after adding positions.
            max_len(int): Longest sequence length that can be encoded.
        """
        # Imported here so the class does not depend on the import order of
        # other notebook cells
        import math
        import torch

        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * (-math.log(10000.0) / d_model))

        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one fewer odd channel than even channel
        pe[0, :, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # A buffer follows the module across devices but is not a trained parameter
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch, seq_len, d_model) with seq_len <= max_len.
        Returns:
            Tensor of the same shape with positional encodings added.
        """
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)

In [None]:
from torch.nn import TransformerEncoderLayer, TransformerEncoder
import math

class TransformerModel(nn.Module):
    """Transformer-encoder language model that scores the next token.

    Inputs are batch-first: ``src`` has shape (batch, seq_len) and the output
    has shape (batch, seq_len, ntokens). ``PositionalEncoding`` is expected to
    be constructible as ``PositionalEncoding(d_model, dropout)`` and to
    operate on (batch, seq_len, d_model) tensors.
    """

    def __init__(self, max_length, ntokens, d_model, nhead, nhid, nlayers, dropout=0.1):
        """
        Args:
            max_length(int): Maximum sequence length (stored for reference).
            ntokens(int): Vocabulary size.
            d_model(int): Embedding dimension.
            nhead(int): Number of attention heads.
            nhid(int): Hidden size of each feed-forward sub-layer.
            nlayers(int): Number of encoder layers.
            dropout(float): Dropout rate for the positional encoder and the
                encoder layers.
        """
        # nn.Module must be initialized before sub-modules can be assigned
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model
        self.nhead = nhead
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=nhid,
            dropout=dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        # The embedding table is indexed by token id, so it needs one row per
        # vocabulary entry (not one per position)
        self.encoder = nn.Embedding(ntokens, d_model)
        self.decoder = nn.Linear(d_model, ntokens)

    def forward(self, src, src_mask=None):
        """
        Args:
            src: Long tensor of token ids, shape (batch, seq_len).
            src_mask: Optional (seq_len, seq_len) additive attention mask.
                Defaults to a causal mask so a position cannot attend to
                later positions.
        Returns:
            Tensor of shape (batch, seq_len, ntokens) with unnormalized scores.
        """
        if src_mask is None:
            seq_len = src.size(1)
            # -inf above the diagonal blocks attention to future positions
            src_mask = torch.triu(
                torch.full((seq_len, seq_len), float("-inf"), device=src.device),
                diagonal=1)

        # Process the src through the embedding and positional encoder
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)

        # Apply the transformer encoder
        output = self.transformer_encoder(src, src_mask)

        # Final linear layer to get predictions
        return self.decoder(output)
    
    

In [None]:
from transformers import GPT2Tokenizer

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set model hyperparameters
max_length = 256 # Set maximum sequence length for later padding
ntokens = tokenizer.vocab_size # Size of vocabulary (num of unique tokens)
d_model = 256 # Embedding dimension
nhead = 4 # Number of attention heads
nhid = 512 # Number of feedforward network hidden units
nlayers = 4 # Number of Transformer layers
# TransformerModel.__init__ reads `dropout` when it builds its positional
# encoder, so the name must be defined before the model is created
dropout = 0.01 # Dropout rate

# Initialize the model
model = TransformerModel(
    ntokens=ntokens, 
    max_length=max_length, 
    d_model=d_model, 
    nhead=nhead, 
    nhid=nhid, 
    nlayers=nlayers)

In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim

num_epochs = 5
# Sentences have different lengths and are not padded, so the default
# collate function can only stack them one at a time
batch_size = 1
learning_rate = 5e-5

# Set the path directory
file_path = '/content/drive/MyDrive/untitled.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    dataset = file.readlines()

sentence_dataset = SentenceDataset(dataset, tokenizer, max_length)
dataloader = DataLoader(sentence_dataset, batch_size=batch_size, shuffle=True)

criterion = nn.CrossEntropyLoss()
# torch.optim.Adam names its learning-rate argument `lr`
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    num_batches = 0
    for batch_idx, data in enumerate(dataloader):
        data = data.to(torch.long)  # Token ids, shape (batch, seq_len)

        # A next-token target needs at least two tokens in the sequence
        if data.size(1) < 2:
            continue

        # Next-word prediction: the target at each position is the following token
        inputs = data[:, :-1]
        targets = data[:, 1:]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss (reshape, because the shifted slices are not contiguous)
        loss = criterion(outputs.reshape(-1, ntokens), targets.reshape(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        num_batches += 1

    # Average over the batches actually trained on; guard against an empty epoch
    average_loss = total_train_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")