# Part 2 Transformer (Text generation)

## Setting Up PyTorch Device for GPU or CPU

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'using device {device}')

using device cpu


In [2]:
import os

data_dir = 'data'
output_file = 'text_generation.txt'

def is_hidden(filepath):
    pass

with open(output_file, 'w') as outfile:
    for filename in os.listdir(data_dir):
        filepath = os.path.join(data_dir, filename)
        if os.path.isfile(filepath):  # Check if it's a file
            if not is_hidden(filepath):
                with open(filepath) as infile:
                    for line in infile:
                        if line.strip():
                            outfile.write(line)

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorWithPadding
from torch.utils.data import Dataset

# i use GPT-2-medium
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium') 
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# set padding token to assure consistent sequence lengths  
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

2024-05-25 00:28:43.475020: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 00:28:43.663937: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 00:28:43.664003: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 00:28:43.697904: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-25 00:28:43.761794: I tensorflow/core/platform/cpu_feature_guar

## Create a class to Fine-Tuning with custom dataset

In [4]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        with open(file_path, "r") as f:
            self.text = f.read().splitlines()
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        tokenized_input = self.tokenizer(
            self.text[idx],
            truncation=True,
            padding="max_length",
            max_length = 128,
            return_tensors = "pt"
        )
        tokenized_input['labels'] = tokenized_input['input_ids']
        return tokenized_input

## Load data

In [5]:
data = CustomDataset(tokenizer, "text_generation.txt", 128)

## Create a data collator that will dynamically pad the sequences
    Badged data to the maximum sequence lenght in a batch it pads each batch to the length of the longest sequence in that batch saving processing complexities by passing the tokenizer to data collator with padding we assure that padding is done correctly according to the tokenizer setting

In [6]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training arguments and trainer

In [7]:
training_args = TrainingArguments(
    per_device_train_batch_size = 4,
    num_train_epochs = 10,
    learning_rate = 1e-4,
    output_dir = './results',
    logging_dir = './logs',
    logging_steps = 10,
    load_best_model_at_end = False,
    evaluation_strategy="no",
    remove_unused_columns = False,
    push_to_hub= False
)



In [8]:
trainer = Trainer(
    model = model,
    args=training_args,
    train_dataset = data,
    eval_dataset= None,
    data_collator = data_collator
)

In [9]:
trainer.train()

Step,Training Loss
10,1.826
20,0.7601
30,0.4963
40,0.3464
50,0.3004
60,0.1825
70,0.1817
80,0.1385
90,0.1126
100,0.0889


TrainOutput(global_step=250, training_loss=0.20630556225776672, metrics={'train_runtime': 2407.3611, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.104, 'total_flos': 225209918423040.0, 'train_loss': 0.20630556225776672, 'epoch': 10.0})

# Save the model and tokenizer

In [10]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.json',
 './saved_model/merges.txt',
 './saved_model/added_tokens.json')

# Now i will Test GPT2

In [11]:
def generate_text(prompt, max_length=200):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        temperature=1.5,
        num_beams=5,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


## Gradio Interface

## Load the saved model and tokenizer before launching the Gradio interface

In [12]:
# Gradio interface
import gradio as gr

def gradio_generate_text(prompt):
    return generate_text(prompt)

# Load the saved model and tokenizer before launching the Gradio interface
tokenizer = GPT2Tokenizer.from_pretrained('./saved_model')
model = GPT2LMHeadModel.from_pretrained('./saved_model')
model.to(device)

iface = gr.Interface(
    fn=gradio_generate_text, 
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."), 
    outputs=gr.Textbox(lines=10), 
    title="GPT-2 Text Generation",
    description="Enter a prompt and the model will generate a continuation."
)



# Launch the interface

In [13]:
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [20]:
def generate_text(prompt, max_length=50, temperature=0.7):
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    # Generate text
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=temperature, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated text and return it
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Refine the prompt strategy
prompt = "the Kingdom of Morocco"
generated_text = generate_text(prompt, max_length=150, temperature=0.8)
print("Generated Text:\n", generated_text)



Generated Text:
 the Kingdom of Morocco is a country located in North Africa. It is bordered by the Atlantic Ocean and the Mediterranean Sea to the west and north, Algeria to the east, and Western Sahara to the south.


In [21]:
prompt = "moroccan festivals"
generated_text = generate_text(prompt, max_length=150, temperature=0.8)
print("Generated Text:\n", generated_text)

Generated Text:
 moroccan festivals such as the Festival of World Sacred Music in Fez, the Mawazine World Music Festival in Rabat, and the Gnaoua World Music Festival in Essaouira attract artists and performers from around the world.


In [22]:
prompt = "explain to me how people live in morocco"
generated_text = generate_text(prompt, max_length=150, temperature=0.8)
print("Generated Text:\n", generated_text)

Generated Text:
 explain to me how people live in morocco, a country with a rich history and cultural heritage that continues to attract visitors from around the world. From bustling medinas to tranquil rural villages, Moroccans lead lives shaped by centuries of history, religion, and geography.


In [23]:
prompt = "Moroccan football"
generated_text = generate_text(prompt, max_length=150, temperature=0.8)
print("Generated Text:\n", generated_text)

Generated Text:
 Moroccan football traces its roots back to the early 20th century when the sport was introduced by European colonizers. The Moroccan Football Federation (FRMF) was founded in 1955, marking a significant milestone in the development of organized football in the country.
