<a href="https://colab.research.google.com/github/e-olang/NLP/blob/main/Auto%20Completion/Fine-tuned%20Fill-Mask%20Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets transformers -q

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW, AutoTokenizer, AutoModelForMaskedLM
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset

In [3]:
# Load pre-trained model and tokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("eolang/SW-v1")
model = AutoModelForMaskedLM.from_pretrained("eolang/SW-v1")

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [4]:
# Custom dataset for loading your text data
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for text in texts:
            encodings = tokenizer(text, truncation=True, max_length=max_length, padding='max_length')
            self.input_ids.append(torch.tensor(encodings['input_ids']))
            self.attn_masks.append(torch.tensor(encodings['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

# Assume `texts` contains your training sentences.
# texts = ['Example sentence 1', 'Example sentence 2', ...]
# dataset = TextDataset(texts, tokenizer, max_length=50)
# loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [5]:
data = load_dataset('swahili')
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 42069
    })
    test: Dataset({
        features: ['text'],
        num_rows: 3371
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3372
    })
})

In [6]:
tr = data['train']['text']
ts = data['test']['text']
val = data['validation']['text']

In [7]:
texts = tr + ts + val
print(len(texts))

dataset = TextDataset(texts, tokenizer, max_length=50)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

48812


In [8]:
# Set up training configurations
epochs = 1
learning_rate = 5e-5
warmup_steps = 1e2
total_steps = len(loader) * epochs


optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)



In [9]:
# Training Loop
model.train()
for epoch in range(epochs):
    for i, (input_ids, attn_masks) in enumerate(loader):
        input_ids = input_ids.to(device)
        attn_masks = attn_masks.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attn_masks, labels=input_ids)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        if i % 100 == 0:
            print(f'Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}')


Epoch: 0, Iteration: 0, Loss: 3.1255974769592285
Epoch: 0, Iteration: 100, Loss: 1.4144315719604492
Epoch: 0, Iteration: 200, Loss: 0.8890809416770935
Epoch: 0, Iteration: 300, Loss: 0.3105986714363098
Epoch: 0, Iteration: 400, Loss: 1.0406700372695923
Epoch: 0, Iteration: 500, Loss: 0.5258834958076477
Epoch: 0, Iteration: 600, Loss: 0.18848632276058197
Epoch: 0, Iteration: 700, Loss: 0.19386625289916992
Epoch: 0, Iteration: 800, Loss: 0.26122918725013733
Epoch: 0, Iteration: 900, Loss: 0.25111138820648193
Epoch: 0, Iteration: 1000, Loss: 0.2983989417552948
Epoch: 0, Iteration: 1100, Loss: 0.27889081835746765
Epoch: 0, Iteration: 1200, Loss: 0.21425330638885498
Epoch: 0, Iteration: 1300, Loss: 0.2454027682542801
Epoch: 0, Iteration: 1400, Loss: 0.27069780230522156
Epoch: 0, Iteration: 1500, Loss: 0.07453617453575134
Epoch: 0, Iteration: 1600, Loss: 0.08860261738300323
Epoch: 0, Iteration: 1700, Loss: 0.23084421455860138
Epoch: 0, Iteration: 1800, Loss: 0.05512172356247902
Epoch: 0, Ite

In [17]:
def generate_text(prompt, max_length=50, num_return_sequences=1, temperature=1.0, top_k=50, top_p=0.95):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=1.0,  # higher value: more random, smaller value: more deterministic
            top_k=50,  # truncates to only consider top k words for next token
            top_p=0.95,  # nucleus sampling: limits next token selection to subset of vocab
            do_sample=True,  # Enable sampling
            num_beams=1  # Use single beam search. Increase if you want to use more beams.
        )

    # Decode and print the text
    generated_text = [tokenizer.decode(o, skip_special_tokens=True) for o in output]
    return generated_text


In [14]:
# Example usage
#prompt = "Tumefanya mabadiliko"
#predictions = generate_text(prompt, max_length=100, num_return_sequences=3)

In [18]:
predictions = generate_text(
    "tumefanya mabadiliko",
    max_length=100,
    num_return_sequences=3,
    temperature=2.0,
    top_k=40,
    top_p=0.85
)

In [19]:
# Display the generated texts
for i, pred in enumerate(predictions):
    print(f"Generated Text {i + 1}: {pred}\n")

Generated Text 1: tumefanya mabadiliko

Generated Text 2: tumefanya mabadiliko

Generated Text 3: tumefanya mabadiliko



----

In [10]:
# Model saving
#model.save_pretrained("path_to_save_model")
#tokenizer.save_pretrained("path_to_save_model")