# Install necessary packages (if not already installed)
# !pip install transformers torch

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset
import re
def load_nynorsk_sentences(limit=2000):
    dataset = load_dataset("NbAiLab/NCC",streaming=False)
    train_stream = dataset["train"]
    sentences = []
    for example in train_stream:
        if example.get("lang_fasttext") == "nn":
            text = example["text"]
            for s in re.split(r"[.!?]\\s+", text):
                if len(s.split()) > 3:
                    sentences.append(s.strip())
        if len(sentences) >= limit:
            break
    return sentences

nynorsk_sentences = load_nynorsk_sentences()
print("Number of Nynorsk sentences:", len(nynorsk_sentences))

Number of Nynorsk sentences: 2000


In [12]:
from torch.utils.data import Dataset

class NorwegianDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=512):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoding = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Squeeze to remove extra dimension (batch dim inside each sample)
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        return encoding
from torch.utils.data import DataLoader
from transformers import BertTokenizer


In [13]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Set pad_token explicitly if not already set (GPT-2 often uses the EOS token as pad)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)

# For dataset usage: If needed, create the dataset with GPT2's tokenizer
dataset = NorwegianDataset(nynorsk_sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [14]:
def tokenize_function(examples):
    # Adjust the key "text" based on how your dataset is structured.
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)



In [17]:
def generate_nynorsk_text(prompt, model, tokenizer, max_length=50):
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Create attention mask - all tokens are attended to
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
    
    # Generate text while explicitly setting the pad_token_id.
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.pad_token_id,  # explicitly using pad token id
        max_length=max_length,
        num_return_sequences=1
    )
    
    # Decode the generated ids to text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example usage:
prompt = "Eg likar å gjere "
generated_text = generate_nynorsk_text(prompt, model, tokenizer)
print("Generated text:", generated_text)

Generated text: Eg likar å gjere ikar å gjere ikar å gjere ikar å gjere ikar å gjere ikar å gjere 


In [18]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [19]:
# Function to generate text from a prompt
def generate_text(prompt, max_length=100):
    # Tokenize the input text
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate text
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
    
    # Decode the generated tokens into text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text


In [23]:
# Example usage:
prompt = "Hva er Nynorsk?"
output_text = generate_text(prompt, max_length=150)
print("Generated text:\n", output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text:
 Hva er Nynorsk?

[A]n I have no idea what you mean.
. . .
 (I'm not sure what I mean.)
,
-
I've been looking for a way to get to the bottom of this. I've got a lot of questions. But I'm going to try to answer them. And I'll try my best to make sure that I don't get caught up in the whole thing. So, I guess I can't say anything. (Laughs.) I think I know what it is. It's a very, very complicated thing, and I hope that you'll understand it. You know, it's not like I was trying to tell you that. That


In [8]:
# Save the model and tokenizer to a directory
save_directory = "../../ml-models/gpt2"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Model and tokenizer saved to ../../ml-models/gpt2
