In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers
!pip install transformers datasets
!pip install torch
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline, AutoModelForSeq2SeqLM
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
import torch
import pandas as pd
import accelerate
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import gc

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/sentiment140.csv', encoding='latin-1', header=None, names=['polarity', 'id', 'date', 'query', 'user', 'text'])
df = df.sample(frac=0.0001, random_state=42)
texts = df['text'].tolist()

In [6]:
# Load the model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize the data in batches
def tokenize_batch(batch_texts, tokenizer, max_length=128):
    return tokenizer(batch_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Define the custom dataset class
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        text = self.texts[idx]
        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        item = {key: val.squeeze() for key, val in encodings.items()}
        item['labels'] = item['input_ids']
        return item

    def __len__(self):
        return len(self.texts)

# Create the dataset and dataloader
dataset = TweetDataset(texts, tokenizer)
dataloader = DataLoader(dataset, batch_size=10)

In [8]:
# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Remove padding
    logits = torch.tensor(logits, dtype=torch.float32)
    labels = torch.tensor(labels, dtype=torch.int64)

    # Mask to only consider non-padding tokens
    valid_labels = labels != tokenizer.pad_token_id

    # Flatten the logits and labels
    logits_flat = logits.view(-1, logits.size(-1))
    labels_flat = labels.view(-1)
    valid_labels_flat = valid_labels.view(-1)

    # Filter out padding tokens
    logits_flat = logits_flat[valid_labels_flat]
    labels_flat = labels_flat[valid_labels_flat]

    # Compute cross-entropy loss
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(logits_flat, labels_flat)
    perplexity = torch.exp(loss).item()

    # Calculate BLEU score
    smoothie = SmoothingFunction().method4
    references = [[tokenizer.decode(label, skip_special_tokens=True).split()] for label in labels.tolist()[:100]]
    candidates = [tokenizer.decode(pred, skip_special_tokens=True).split() for pred in predictions.tolist()[:100]]
    bleu = np.mean([sentence_bleu(ref, cand, smoothing_function=smoothie) for ref, cand in zip(references, candidates)])

    return {"perplexity": perplexity, "bleu": bleu}

In [9]:
# Define a function to evaluate the model before training
def evaluate_before_training(model, dataloader, compute_metrics):
    model.eval()
    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(model.device) for key, val in batch.items()}
            outputs = model(**inputs)
            all_logits.append(outputs.logits.cpu())
            all_labels.append(inputs['labels'].cpu())

    # Concatenate results for metrics computation
    all_logits = torch.cat(all_logits)
    all_labels = torch.cat(all_labels)
    return compute_metrics((all_logits.numpy(), all_labels.numpy()))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Evaluate before training
initial_metrics = evaluate_before_training(model, dataloader, compute_metrics)
print("Initial Metrics:", initial_metrics)

Initial Metrics: {'perplexity': 5393.50537109375, 'bleu': 0.01937126272009725}


In [10]:
# Adjust the model definition to use gradient checkpointing
model.gradient_checkpointing_enable()
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Gradient accumulation
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,  # Mixed precision training
    dataloader_num_workers=0,  # Disable prefetching
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # Using the same dataset for simplicity; ideally, split into train and eval
    compute_metrics=compute_metrics
)

# Clear CUDA cache
torch.cuda.empty_cache()
gc.collect()

trainer.train()

Epoch,Training Loss,Validation Loss,Perplexity,Bleu
1,6.5668,0.956665,5590.197266,0.015604
2,0.8966,0.733531,3373.66333,0.026747
3,0.7318,0.620142,4728.700684,0.030365
4,0.627,0.51793,7379.48291,0.048509
5,0.5338,0.428118,13644.066406,0.086142
6,0.4525,0.350628,20806.363281,0.126742
7,0.3851,0.28522,38118.910156,0.180453
8,0.3221,0.219132,124525.101562,0.248837
9,0.2698,0.177616,213896.546875,0.316585
10,0.2358,0.14457,419201.46875,0.399182


TrainOutput(global_step=200, training_loss=0.609971489906311, metrics={'train_runtime': 318.8438, 'train_samples_per_second': 10.036, 'train_steps_per_second': 0.627, 'total_flos': 209033625600000.0, 'train_loss': 0.609971489906311, 'epoch': 20.0})

In [11]:
# Save the trained model
save_directory = "/content/drive/MyDrive/saved_model"
trainer.save_model(save_directory)

In [13]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Generate a response
input_text = "The weather is bad today."
response = generator(input_text, max_length=50)
print(response[0]['generated_text'])

The weather is bad today. The humidity gets top-breezy, which sucks 
