In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")




In [5]:
# Tokenize the essay text
input_text = "The general problem of simulating (or creating) intelligence has been broken down into sub-problems. These consist of particular traits or capabilities that researchers expect an intelligent system to display. The traits described below have received the most attention."

input_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0)

# Generate the title
generated_title = model.generate(input_ids, max_length=100, do_sample=True, top_p=0.95, top_k=20)
title = tokenizer.decode(generated_title[0], skip_special_tokens=True)

print("TITLE:")
print(title)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


TITLE:
The general problem of simulating (or creating) intelligence has been broken down into sub-problems. These consist of particular traits or capabilities that researchers expect an intelligent system to display. The traits described below have received the most attention. A number of traits were not included because of the difficulty of selecting a particular candidate.

1. Intelligence is an ability to perceive the world around us.

2. Intelligence is often thought to be derived from the ability to perceive things by chance.



# Fine Tuning Model

In [5]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Load the data from the CSV file
data = pd.read_csv("output.csv")
data = data.reset_index(drop=True).dropna()

# Split the data into train and validation sets
train_data = data.sample(frac=0.8, random_state=42)
val_data = data.drop(train_data.index)



# Create a GPT-2 tokenizer and a GPT-2 model for sequence classification
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

model = GPT2ForSequenceClassification.from_pretrained("gpt2")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print(train_data["passage"].dtype)


object


In [13]:
train_data["passage"] = train_data["passage"].fillna("").astype(str)
val_data["passage"] = val_data["passage"].fillna("").astype(str)

In [4]:
# Add special tokens to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Prepare the data for fine-tuning
train_inputs = tokenizer.batch_encode_plus(train_data["passage"].tolist(), max_length=512, pad_to_max_length=True, return_tensors="pt")
val_inputs = tokenizer.batch_encode_plus(val_data["passage"].tolist(), max_length=512, pad_to_max_length=True, return_tensors="pt")

train_labels = [tokenizer.encode(label, max_length=512, pad_to_max_length=True, truncation=True) for label in train_data["heading"]]
val_labels = [tokenizer.encode(label, max_length=512, pad_to_max_length=True, truncation=True) for label in val_data["heading"]]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [5]:
batch_size = 4

# Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    eval_steps=1000,
    per_device_train_batch_size=batch_size,
    save_steps=1000,
    num_train_epochs=3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': data['input_ids'], 'labels': data['labels']}
)

from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
trainer.train()


NameError: name 'train_dataset' is not defined

In [7]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

# Load the dataset from a CSV file
df = pd.read_csv("output.csv")
df = df.reset_index(drop=True).dropna()

# Define your custom dataset
class TitlesDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, dataset, max_length):
        self.data = []
        for i in range(len(dataset)):
            title = dataset.iloc[i]['heading']
            passage = dataset.iloc[i]['passage']
            inputs = tokenizer.encode(passage, add_special_tokens=True, max_length=max_length - 2, return_tensors="pt")
            labels = tokenizer.encode(title, add_special_tokens=False, return_tensors="pt")
            self.data.append({'input_ids': inputs, 'labels': labels})
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]


# Instantiate the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Prepare the dataset
dataset = TitlesDataset(tokenizer, df, max_length=512)
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])
data_collator = lambda data: {'input_ids': data['input_ids'], 'labels': data['labels']}

# Fine-tune the model
batch_size = 4

training_args = TrainingArguments(
    output_dir='./results',
    eval_steps=1000,
    per_device_train_batch_size=batch_size,
    save_steps=1000,
    num_train_epochs=3
)


loading file vocab.json from cache at C:\Users\sukri/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\vocab.json
loading file merges.txt from cache at C:\Users\sukri/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\sukri/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_in

In [8]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    data_collator=lambda data: {'input_ids': data['input_ids'], 'labels': data['labels']}
)
trainer.train()


***** Running training *****
  Num examples = 1238
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 930
  Number of trainable parameters = 124439808


  0%|          | 0/930 [00:00<?, ?it/s]

TypeError: 'DataLoader' object is not subscriptable