In [2]:
from datasets import load_dataset

# Load the dataset from your local JSONL file using the full path
dataset = load_dataset("json", data_files="/Users/nafey/Desktop/domain_data.jsonl")


# Check the dataset's content
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 20
    })
})


In [3]:
from transformers import GPT2Tokenizer
from datasets import load_dataset

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add pad token if not present
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the dataset
dataset = load_dataset("json", data_files="/Users/nafey/Desktop/domain_data.jsonl")

def tokenize_function(examples):
    # Concatenate prompt and completion for each example
    prompts = examples["prompt"]
    completions = examples["completion"]
    
    # Ensure both prompt and completion are lists of strings
    inputs = [prompt + tokenizer.eos_token + completion for prompt, completion in zip(prompts, completions)]
    
    # Tokenize with padding and truncation
    encoding = tokenizer(inputs, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    
    # Return the tokenized data (input_ids and attention_mask should now be tensors)
    return encoding

# Apply tokenization in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "completion"])

# Check the tokenized dataset
print(tokenized_dataset['train'][0])  # Check the first example


{'input_ids': [2061, 318, 2199, 271, 38438, 30, 50256, 9771, 271, 38438, 318, 281, 13097, 9856, 13693, 326, 13692, 319, 38265, 2667, 262, 7625, 1022, 4569, 1524, 3707, 290, 6393, 1204, 4678, 13, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 

In [4]:
print(tokenized_dataset['train'][0])


{'input_ids': [2061, 318, 2199, 271, 38438, 30, 50256, 9771, 271, 38438, 318, 281, 13097, 9856, 13693, 326, 13692, 319, 38265, 2667, 262, 7625, 1022, 4569, 1524, 3707, 290, 6393, 1204, 4678, 13, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 

In [5]:
from datasets import DatasetDict

# Assuming you have already tokenized the dataset and stored it in `tokenized_dataset`

# Manually split the 'train' data into train, validation, and test sets
train_dataset = tokenized_dataset["train"].select(range(0, 14))  # 70% for training
val_dataset = tokenized_dataset["train"].select(range(14, 17))   # 15% for validation
test_dataset = tokenized_dataset["train"].select(range(17, 20))  # 15% for testing

# Create a DatasetDict
split_dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Print the resulting DatasetDict
print(split_dataset)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3
    })
})


In [6]:
len(dataset['train'])

20

In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model and tokenizer
model_name = "gpt2"  # Replace with your desired model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Add pad token to the model's configuration if it's not already present
model.config.pad_token_id = tokenizer.pad_token_id


In [8]:
from datasets import DatasetDict
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel
from datasets import Dataset
import torch

# Load the dataset (assuming dataset is already loaded as 'dataset')
# Example: dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"})
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "prompt": ["What is AI?", "What is the capital of France?"],
        "completion": ["AI stands for Artificial Intelligence.", "The capital of France is Paris."]
    }),
    "test": Dataset.from_dict({
        "prompt": ["What is Python?", "What is machine learning?"],
        "completion": ["Python is a programming language.", "Machine learning is a subset of AI."]
    })
})

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token

# Tokenization function
def tokenize_function(examples):
    prompts = examples["prompt"]
    completions = examples["completion"]
    
    # Concatenate prompt and completion and tokenize them
    full_texts = [prompt + tokenizer.eos_token + completion for prompt, completion in zip(prompts, completions)]
    
    # Tokenize batch and ensure padding/truncation
    tokenized_batch = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",  # Ensure all are padded to the max length
        max_length=128,        # Adjust the max length as needed
        return_tensors="pt"    # Return as pytorch tensors
    )
    
    return tokenized_batch

# Apply tokenization on dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["prompt", "completion"]  # Remove original text columns after tokenization
)

# Manually split the 'train' data into train, validation, and test sets
train_dataset = tokenized_dataset["train"].select(range(0, 1))  # Example split: 70% for training
val_dataset = tokenized_dataset["train"].select(range(1, 2))    # Example split: 15% for validation
test_dataset = tokenized_dataset["test"].select(range(0, 1))    # Example split: 15% for testing

# Create a DatasetDict
split_dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

# Check sequence lengths for consistency between input_ids and attention_mask
for split in ['train', 'validation', 'test']:
    for idx in range(len(split_dataset[split])):
        example = split_dataset[split][idx]  # Access the example by index
        
        # Check if 'input_ids' and 'attention_mask' are lists
        if isinstance(example['input_ids'], list) and isinstance(example['attention_mask'], list):
            input_len = len(example['input_ids'])
            attention_len = len(example['attention_mask'])
            if input_len != attention_len:
                print(f"Inconsistent length at index {idx} in {split} split: input_ids={input_len}, attention_mask={attention_len}")
        else:
            print(f"Invalid data at index {idx} in {split} split: input_ids type={type(example['input_ids'])}, attention_mask type={type(example['attention_mask'])}")

# Define a data collator for language modeling (for causal models like GPT-2)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False  # Masked language modeling is False for causal models like GPT-2
)

# Convert the tokenized dataset to a PyTorch-friendly format
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["validation"]
test_dataset = split_dataset["test"]

# Print final datasets to check if the changes worked
print(train_dataset[0])  # Check the first training sample
print(validation_dataset[0])  # Check the first validation sample
print(test_dataset[0])  # Check the first testing sample

# Optionally, load a GPT-2 model for fine-tuning (you can skip this part if not fine-tuning)
model = GPT2LMHeadModel.from_pretrained("gpt2")


input_ids = torch.tensor(train_dataset[0]['input_ids'])
attention_mask = torch.tensor(train_dataset[0]['attention_mask'])

# Apply unsqueeze to add the batch dimension
output = model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
print(output)



Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

{'input_ids': [2061, 318, 9552, 30, 50256, 20185, 6296, 329, 35941, 9345, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, GPT2LMHeadModel, AutoTokenizer
from torch.optim import Adam
from tqdm import tqdm

# Load the dataset and tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# DataLoader: Prepare the DataLoader for batching the tokenized dataset
train_dataset = split_dataset['train']
validation_dataset = split_dataset['validation']
test_dataset = split_dataset['test']

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=2)
test_dataloader = DataLoader(test_dataset, batch_size=2)

# Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up the optimizer (AdamW is a good choice for transformers)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training loop
def train(model, train_dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        # Convert lists to tensors
        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).to(device)

        # Forward pass: Compute logits and loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss  # The loss is returned as a tuple, with the first element being the loss

        # Backward pass: Compute gradients
        loss.backward()

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update model parameters
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Return the average loss for the epoch
    return total_loss / len(train_dataloader)

# Define the validation loop
def validate(model, validation_dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc="Validating"):
            # Convert lists to tensors
            input_ids = torch.tensor(batch['input_ids']).to(device)
            attention_mask = torch.tensor(batch['attention_mask']).to(device)

            # Forward pass: Compute logits and loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item()

    # Return the average loss for the validation
    return total_loss / len(validation_dataloader)

# Define the testing loop
def test(model, test_dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Testing"):
            # Convert lists to tensors
            input_ids = torch.tensor(batch['input_ids']).to(device)
            attention_mask = torch.tensor(batch['attention_mask']).to(device)

            # Forward pass: Compute logits and loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item()

    return total_loss / len(test_dataloader)

# Training and evaluation loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    # Train the model
    train_loss = train(model, train_dataloader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}")
    
    # Validate the model
    val_loss = validate(model, validation_dataloader, device)
    print(f"Validation Loss: {val_loss:.4f}")
    
    # Save the model after every epoch
    torch.save(model.state_dict(), f"gpt2_finetuned_epoch_{epoch+1}.pth")

# Optionally, test the model on the test dataset
test_loss = test(model, test_dataloader, device)
print(f"Test Loss: {test_loss:.4f}")




Epoch 1/10


Training: 100%|███████████████████████████████████| 1/1 [00:01<00:00,  1.26s/it]


Train Loss: 11.2271


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00,  8.12it/s]


Validation Loss: 9.5173
Epoch 2/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.09it/s]


Train Loss: 8.8011


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.08it/s]


Validation Loss: 7.4628
Epoch 3/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.50it/s]


Train Loss: 6.7323


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.21it/s]


Validation Loss: 5.2942
Epoch 4/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]


Train Loss: 4.6647


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00,  9.44it/s]


Validation Loss: 3.6678
Epoch 5/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]


Train Loss: 3.2459


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.14it/s]


Validation Loss: 2.4349
Epoch 6/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]


Train Loss: 1.9577


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00,  9.17it/s]


Validation Loss: 1.6124
Epoch 7/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]


Train Loss: 0.8962


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.91it/s]


Validation Loss: 1.0473
Epoch 8/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.55it/s]


Train Loss: 0.9297


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.35it/s]


Validation Loss: 0.6694
Epoch 9/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  2.14it/s]


Train Loss: 0.5888


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.85it/s]


Validation Loss: 0.4585
Epoch 10/10


Training: 100%|███████████████████████████████████| 1/1 [00:00<00:00,  1.79it/s]


Train Loss: 0.4359


Validating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 10.28it/s]


Validation Loss: 0.3791


Testing: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 11.80it/s]

Test Loss: 0.2935





In [10]:
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json',
 './fine_tuned_gpt2/tokenizer.json')

In [11]:
# Save the fine-tuned model
output_dir = "./fine_tuned_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./fine_tuned_model


In [12]:
from transformers import GPT2LMHeadModel, AutoTokenizer

# Load the fine-tuned model
output_dir = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)

# Ensure the model is in evaluation mode
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [13]:
input_text = "what is calisnova?"

# Encode the input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate attention mask
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Set pad_token_id to eos_token_id if not defined
tokenizer.pad_token = tokenizer.eos_token

# Generate output with adjusted parameters
output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=150,  # Increase tokens for more elaborate response
    temperature=1.2,  # Adjust for more creativity
    top_p=0.9,  # Use nucleus sampling for more diversity
    top_k=50,  # Use top-k sampling
    do_sample=True,  # Enable sampling
    no_repeat_ngram_size=2  # Prevent repetition
)

# Decode and print the output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Output:", generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Output: what is calisnova? and a final one that is at the same time different for this kind of thing and not even the original one) I hope people will try them with different styles.


In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Save the fine-tuned model and tokenizer
model.save_pretrained('./finetuned_model')
tokenizer.save_pretrained('./finetuned_model')


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.json',
 './finetuned_model/merges.txt',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')