In [7]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
class BilingualDataset(Dataset):
    """Word/definition pairs from a CSV, tokenized one example per row.

    Every usable row is rendered as ``"<igala>: <english>"`` and encoded with
    the supplied tokenizer, truncated to ``block_size`` token ids. Examples
    are NOT padded, so items have different lengths and must be batched with
    a collate function that pads.

    Args:
        tokenizer: Object exposing
            ``encode(text, truncation=..., max_length=...)`` and returning a
            list of token ids.
        file_path: Path to a CSV whose first column is read as ``igala`` and
            second as ``english``. The first line is read as data, not as a
            header (assumes the file has no header row — TODO confirm).
        block_size: Maximum number of token ids kept per example.
    """

    def __init__(self, tokenizer, file_path, block_size=128):
        self.tokenizer = tokenizer
        self.examples = []

        # Read every cell as a plain string. Without keep_default_na=False
        # pandas converts empty cells AND real words such as "null", "NA" or
        # "None" to NaN, which the f-string below would render as the literal
        # text "nan" in the training data.
        data = pd.read_csv(
            file_path,
            names=["igala", "english"],
            dtype=str,
            keep_default_na=False,
        )
        for igala_word, english_definition in zip(data["igala"], data["english"]):
            # Skip rows where either side is missing or blank; they would
            # only teach the model a dangling "word: " pattern.
            fields = (igala_word, english_definition)
            if any(not isinstance(f, str) or not f.strip() for f in fields):
                continue
            # Create a single string for each example
            text = f"{igala_word}: {english_definition}"
            tokenized_text = self.tokenizer.encode(
                text, truncation=True, max_length=block_size
            )
            self.examples.append(tokenized_text)

    def __len__(self):
        """Return the number of tokenized examples kept from the CSV."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [3]:

# Fetch the byte-level BPE tokenizer that matches the 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path='gpt2',
)


In [4]:

# Tokenize every row of the CSV into one training example.
file_path = 'igala.csv'  # Path to your CSV file
dataset = BilingualDataset(tokenizer=tokenizer, file_path=file_path)

In [5]:
# Create DataLoader
def _pad_batch(batch):
    """Right-pad a list of 1-D id tensors into one (batch, max_len) tensor.

    The dataset yields unpadded sequences of different lengths, which the
    default collate function cannot stack. GPT-2 has no dedicated padding
    token, so its end-of-text id is used as the filler value.
    """
    return torch.nn.utils.rnn.pad_sequence(
        batch, batch_first=True, padding_value=tokenizer.eos_token_id
    )


dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=_pad_batch)

In [6]:
# Load the pretrained 'gpt2' weights with the language-modelling head attached.
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path='gpt2',
)


In [22]:
# Fine-tuning
# NOTE(review): if this cell raises "ImportError: ... requires accelerate>=0.21.0"
# even though `pip list` shows a newer accelerate, the package was probably
# installed after the kernel started — restart the kernel and re-run (TODO confirm).

# GPT-2 ships without a padding token; reuse end-of-text so that examples of
# different lengths can be padded into a rectangular batch.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The dataset yields bare, unpadded id tensors. This collator pads each batch
# and adds `labels` (a copy of input_ids with padding positions set to -100)
# so the model computes a loss; mlm=False selects causal-LM behaviour.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [23]:
pip list


Package                   Version
------------------------- ---------------
accelerate                0.30.1
anyio                     4.3.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.14.0
beautifulsoup4            4.12.3
bleach                    6.1.0
certifi                   2024.2.2
cffi                      1.16.0
charset-normalizer        3.3.2
colorama                  0.4.6
comm                      0.2.1
contourpy                 1.2.0
cycler                    0.12.1
debugpy                   1.8.1
decorator                 5.1.1
defusedxml                0.7.1
executing                 2.0.1
fastjsonschema            2.19.1
filelock                  3.9.0
fonttools                 4.49.0
fqdn                      1.5.1
fsspec                    2024.5.0
h11                       0.14.0
httpcore   


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: C:\Users\OWNER\Desktop\llm\cuda\Scripts\python.exe -m pip install --upgrade pip


In [10]:
# Re-evaluate the compute device and print which one was selected.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu
