In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Use the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the pre-trained model from "CoffeeAddict93/gpt2-medium-modest-proposal"
model_name = "CoffeeAddict93/gpt2-medium-modest-proposal"
model = GPT2LMHeadModel.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

In [3]:
import pandas as pd
from datasets import Dataset

# Load your dataset
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/data.csv')

# Prepare the dataset for Hugging Face
df['text'] = df.apply(lambda row: (
    f"RFP details: {row['RFP details']}\n"
    f"Organization Name: {row['Organization Name']}\n"
    f"Address: {row['Address']}\n"
    f"Program description: {row['Program description']}\n"
    f"Introduction: {row['Introduction']}\n"
    f"Executive Summary: {row['Executive Summary']}\n"
    f"Methods & Approach: {row['Methods & Approach']}\n"
    f"Additional comments (optional): {row['Additional comments (optional)']}\n"
    f"Flag: {row['flag']}\n"
    f"Username: {row['username']}\n"
), axis=1)

# Convert to Hugging Face dataset
hf_dataset = Dataset.from_pandas(df[['text']])



In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1)  # 10% for validation

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['text']])
val_dataset = Dataset.from_pandas(val_df[['text']])

In [5]:
# Assign eos_token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    encodings = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    encodings['labels'] = encodings['input_ids']
    return encodings

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [6]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Add validation dataset
)

# Fine-tune the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,9.860779
2,No log,8.382278
3,No log,7.755583


TrainOutput(global_step=6, training_loss=6.150688171386719, metrics={'train_runtime': 44.1763, 'train_samples_per_second': 0.272, 'train_steps_per_second': 0.136, 'total_flos': 11144408334336.0, 'train_loss': 6.150688171386719, 'epoch': 3.0})

In [7]:
model.save_pretrained("/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal")

('/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal/tokenizer_config.json',
 '/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal/special_tokens_map.json',
 '/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal/vocab.json',
 '/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal/merges.txt',
 '/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal/added_tokens.json')

In [9]:
# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/MyDrive/Colab_Notebooks/proposal_gpt_model_result/fine-tuned-gpt2-medium-modest-proposal")

# Generate text
prompt = "What is the address of Alberta Innovates organization?"
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors='pt')
output = fine_tuned_model.generate(input_ids, max_length=150, num_return_sequences=1)

# Decode and print the result
print(fine_tuned_tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the address of Alberta Innovates organization?

Alberta Innovates is a non-profit organization that provides support to Alberta's small businesses.

What is the purpose of Alberta Innovates?

Alberta Innovates is a non-profit organization that provides support to Alberta's small businesses.

What is the Alberta Innovates website?

The Alberta Innovates website is a resource for Alberta's small businesses.

What is the Alberta Innovates website?

The Alberta Innovates website is a resource for Alberta's small businesses.

What is the Alberta Innovates website?

The Alberta Innovates website is a resource for Alberta's small businesses.

What is the Alberta Innovates website
