In [None]:
!pip install transformers


In [None]:
!pip install -q transformers==4.9.2


In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch] accelerate -U


In [None]:
import transformers
import accelerate

print(transformers.__version__)
print(accelerate.__version__)


In [None]:
!pip install datasets

# Important libraries

In [None]:
import re
import torch
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import Dataset, DatasetDict


# Load Data Set 

In [None]:
path_to_file = 'shakespeare.txt'
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed, and the explicit encoding removes the dependency on the
# platform's default text encoding.
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# **Text Cleaning**
Library Used: re (Regular Expression operations)

In [None]:
def clean_text(text):
    """Normalise raw text to lowercase ASCII letters and single spaces.

    The substitutions run in order: carriage returns are deleted, newlines
    become spaces, every character that is neither an ASCII letter nor
    whitespace is removed, and runs of whitespace collapse to one space.
    The result is then lower-cased and returned.
    """
    substitutions = (
        (r'\r', ''),
        (r'\n', ' '),
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower()

# Run the cleaning pass over the full corpus string.
cleaned_text = clean_text(text)

# **Preprocessing and Tokenization**
Library Used: transformers from Hugging Face

 Uses GPT2Tokenizer from the transformers library to tokenize the cleaned text and convert tokens to token IDs.

In [None]:
# Load the pretrained tokenizer published under the 'gpt2' checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# GPT-2 ships without a padding token, so reuse the end-of-sequence token.
# Registering a brand-new '[PAD]' token instead would grow the tokenizer's
# vocabulary beyond the pretrained model's embedding matrix (the model is
# never resized in this notebook), leaving a pad id the model cannot embed.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Break the cleaned corpus into tokenizer tokens.
tokenized_text = tokenizer.tokenize(cleaned_text)

In [None]:
# Map each token to its integer id in the tokenizer's vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

In [None]:
import numpy as np

# Persist the token ids to disk in NumPy .npy format.
np.save('shakespeare_token_ids.npy', token_ids)

# **Feature Engineering**
Library Used: numpy

 Uses numpy to create input sequences of token IDs.

In [None]:
def create_input_sequences(token_ids, seq_length):
    """Split a flat token-id sequence into non-overlapping fixed-length rows.

    Args:
        token_ids: Flat sequence (list or 1-D array) of token ids.
        seq_length: Number of tokens per row; must be a positive integer.

    Returns:
        A 2-D numpy array of shape
        ``(len(token_ids) // seq_length, seq_length)``. Trailing tokens that
        do not fill a complete row are discarded.

    Raises:
        ValueError: If ``seq_length`` is not positive.
    """
    if seq_length <= 0:
        raise ValueError("seq_length must be a positive integer")
    # np.array copies, so the result never aliases the caller's data.
    ids = np.array(token_ids)
    n_rows = len(ids) // seq_length
    # Keep every complete chunk, including the final one when the input
    # length is an exact multiple of seq_length (a range() stop of
    # len - seq_length would silently drop that last chunk).
    return ids[:n_rows * seq_length].reshape(n_rows, seq_length)

# Number of tokens in each training example.
seq_length = 128
input_sequences = create_input_sequences(token_ids, seq_length)


# Persist the chunked sequences to disk in NumPy .npy format.
np.save('shakespeare_input_sequences.npy', input_sequences)

# **Data Splitting**
Library Used: scikit-learn

 Uses **train_test_split** from **scikit-learn** to split the data into training and validation sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sequences for validation; the fixed random_state makes
# the split repeatable across runs.
train_sequences, val_sequences = train_test_split(input_sequences, test_size=0.1, random_state=42)

# **Dataset Conversion**
Library Used: **datasets** from Hugging Face

Uses **Dataset** and **DatasetDict** from the **datasets** library to create datasets for training and validation.

In [None]:
# Wrap each split as a Dataset with a single 'input_ids' column
# (the arrays are converted to nested Python lists first).
train_dataset = Dataset.from_dict({'input_ids': train_sequences.tolist()})
val_dataset = Dataset.from_dict({'input_ids': val_sequences.tolist()})


# Bundle both splits under the 'train' / 'validation' keys that the Trainer
# setup later indexes into.
datasets = DatasetDict({'train': train_dataset,'validation': val_dataset})

# **Model Loading and Fine-Tuning**
Library Used: **transformers** from Hugging Face

Uses **GPT2LMHeadModel**, **DataCollatorForLanguageModeling**, **Trainer**, and **TrainingArguments** from the **transformers** library to load the GPT-2 model and fine-tune it on the Shakespeare dataset.

In [None]:
from transformers import GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


# Load the pretrained 'gpt2' checkpoint as a GPT2LMHeadModel.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)

In [None]:
# mlm=False turns off masked-language-modelling in the collator; batches are
# returned as PyTorch tensors ("pt").
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration.
# - Checkpoints go to ./gpt2-shakespeare-finetuned, every 1000 steps, keeping
#   at most 3 of them; an existing output directory is overwritten.
# - A per-device batch of 4 with 2 gradient-accumulation steps gives
#   8 sequences per device per optimizer update.
# - Training metrics are logged every 200 steps.
training_args = TrainingArguments(
    output_dir='./gpt2-shakespeare-finetuned',
    overwrite_output_dir=True,
    num_train_epochs=15,  
    per_device_train_batch_size=4,  
    gradient_accumulation_steps=2, 
    save_steps=1000, 
    save_total_limit=3,
    logging_steps=200,)


# NOTE(review): eval_dataset is supplied, but no evaluation schedule is set in
# training_args above - confirm whether periodic evaluation was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'])


# Run the fine-tuning loop (long-running).
trainer.train()

# **Text Generation**
Library Used: **transformers** from **Hugging Face**, **torch** for handling tensors.

Uses the **GPT2LMHeadModel** and **GPT2Tokenizer** from the **transformers** library along with **torch** for tensor operations to generate text based on a seed input.

In [None]:
import torch

def generate_text(model, tokenizer, seed_text, max_length=100, temperature=0.7, top_k=50, top_p=0.95,
                  repetition_penalty=1.2):
    """Sample a continuation of ``seed_text`` from ``model``.

    Args:
        model: Language model exposing ``eval()``, ``parameters()`` and
            ``generate()``.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        seed_text: Prompt string the generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k cutoff forwarded to ``model.generate``.
        top_p: Nucleus (top-p) cutoff forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to
            ``model.generate``. Defaults to 1.2, the value that used to be
            hard-coded, so existing callers see no change.

    Returns:
        The first generated sequence, decoded to a string with special
        tokens skipped.
    """
    model.eval()
    # Run on whichever device the model's weights already live on.
    device = next(model.parameters()).device
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    # A single un-padded prompt: every position is attended to.
    attention_mask = torch.ones_like(input_ids, dtype=torch.long)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        # The end-of-sequence id doubles as the padding id during generation.
        pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)



seed_text = "AGRIPPA. My dear friend, I have seen the heavens fall and"
# Prefer the GPU when one is present, but fall back to the CPU so this cell
# also runs on machines without CUDA (an unconditional model.cuda() raises
# there).
device = "cuda" if torch.cuda.is_available() else "cpu"
generated_text = generate_text(model.to(device), tokenizer, seed_text, max_length=100,  temperature=0.7, top_k=40, top_p=0.9)
print(generated_text)


# **Working with Layers**
**Embedding Layer**

When the input text is tokenized and converted to token IDs, the embedding layer maps these IDs to dense vectors.

**Role:** The input_ids are fed into the embedding layer to obtain initial embeddings.

**Transformer Layers**

The core of GPT-2 consists of multiple transformer layers, each performing two main operations:

**1. Self-Attention Mechanism:**

This mechanism allows the model to weigh the importance of different tokens in the sequence when producing the next token. Each transformer layer has multiple heads (multi-head self-attention) to capture various relationships in the data.

**2. Feed-Forward Network:**

 After self-attention, the data is passed through a feed-forward neural network for further processing.

 **Fine-Tuning with Transformer Layers**

During fine-tuning, the Trainer class handles the forward and backward passes through these transformer layers.

**Forward Pass:**

 For each batch of data, the input sequences are passed through the embedding layer and then through each of the transformer layers. The model computes the loss based on the difference between the predicted and actual next tokens.

**Backward Pass:**

 Gradients of the loss are back-propagated through the transformer layers, and the optimizer uses them to update the model parameters so as to minimize the loss.

 **Generating Text with Transformer Layers**

When generating text, the model uses the same transformer layers to predict the next token in the sequence.

**Self-Attention Mechanism:**

In each transformer layer, the self-attention mechanism calculates attention scores for the input sequence.

**Feed-Forward Network:**

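The attention outputs (the attention-weighted combination of token representations) are passed through a feed-forward network, which refines the hidden representation at each position; the representation at the last position is what the next token is predicted from.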

**Output Layer:**

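The final layer converts these hidden representations into probabilities over the vocabulary, from which the next token is sampled (in this notebook, with temperature, top-k and top-p filtering rather than always picking the single most likely token).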