Installing Datasets

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

Loading the tokenized csv dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/content/drive/MyDrive/AI_Thesis_Helper/tokenized_combined_text.csv')
print(df.head())

                                      tokenized_text
0  [22058, 684, 38514, 422, 2034, 3157, 572, 12, ...
1  [27275, 2890, 290, 9104, 278, 4307, 6169, 1361...
2  [45536, 913, 48862, 485, 272, 34600, 7663, 422...
3  [8199, 333, 46374, 9552, 2295, 12293, 20410, 2...
4  [45, 19930, 12, 40, 1797, 31, 3843, 379, 12449...


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

GPT2Dataset dataset

In [4]:
class GPT2Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        self.input_ids = df['tokenized_text'].apply(eval).tolist()  # Convert string lists to actual lists
        self.input_ids = [ids[:max_length] for ids in self.input_ids]  # Truncate sequences if needed

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        return {
            'input_ids': input_ids,
            'attention_mask': (input_ids != tokenizer.pad_token_id).long()
        }

Tokenizer and Dataset

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
dataset = GPT2Dataset(df, tokenizer)

# Create a DataLoader for batching
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [8]:
from transformers import GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

Loading Pre-trained model

In [9]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

Setting device to cuda and setting the hyperparameters

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(dataloader) * 1
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



Training Loop - 4 epochs

In [12]:
model.train()
for epoch in range(4):
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
        loss = outputs.loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")

Epoch 1 completed with loss: 1.678558111190796
Epoch 2 completed with loss: 1.6851688623428345
Epoch 3 completed with loss: 1.3869553804397583
Epoch 4 completed with loss: 1.5436750650405884


Saving the fine-tuned model

In [13]:
# Save the fine-tuned model
model.save_pretrained('fine-tuned-gpt2_final')
tokenizer.save_pretrained('fine-tuned-gpt2_final')

('fine-tuned-gpt2_final/tokenizer_config.json',
 'fine-tuned-gpt2_final/special_tokens_map.json',
 'fine-tuned-gpt2_final/vocab.json',
 'fine-tuned-gpt2_final/merges.txt',
 'fine-tuned-gpt2_final/added_tokens.json')

Small test

In [14]:
from transformers import pipeline

generator = pipeline('text-generation', model='fine-tuned-gpt2_final', tokenizer=tokenizer)
output = generator("Your input prompt here", max_length=100, num_return_sequences=1)
print(output[0]['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Your input prompt here   Here we introduce a new feature for the prompt field. It lets the user decide the prompt content. We can encode it as a question phrase or question, and we also give it a keyword for the input question. This generates an input prompt and generates an output prompt for the output question.
We show that the prompt field should have a corresponding prompt and output prompt in a language, and then provide the key
for each text in the input prompt. We also provide


Zipping and seeing the size of zip file

In [15]:
!zip -r fine_tuned_gpt2_final.zip fine-tuned-gpt2_final

  adding: fine-tuned-gpt2_final/ (stored 0%)
  adding: fine-tuned-gpt2_final/config.json (deflated 51%)
  adding: fine-tuned-gpt2_final/tokenizer_config.json (deflated 55%)
  adding: fine-tuned-gpt2_final/vocab.json (deflated 68%)
  adding: fine-tuned-gpt2_final/special_tokens_map.json (deflated 74%)
  adding: fine-tuned-gpt2_final/model.safetensors (deflated 7%)
  adding: fine-tuned-gpt2_final/merges.txt (deflated 53%)
  adding: fine-tuned-gpt2_final/generation_config.json (deflated 24%)


In [16]:
!du -sh /content/fine_tuned_gpt2_final.zip

442M	/content/fine_tuned_gpt2_final.zip
