In [1]:
# Install Necessary Libraries
!pip install transformers torch accelerate flask flask-ngrok




In [2]:
# Load Pre-trained Model and Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add a padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize Your Data
data = ["Sample text", "Another example text"]
tokenized_data = tokenizer(data, return_tensors='pt', padding=True, truncation=True)

# Add labels to the dataset (input_ids can be used as labels for language modeling)
tokenized_data['labels'] = tokenized_data.input_ids.detach().clone()

# Convert tokenized data to Dataset format
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

train_dataset = TextDataset(tokenized_data)


In [3]:
# Define the Fine-Tuning Process
from transformers import Trainer, TrainingArguments

# Resize token embeddings to account for new pad token
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset          # training dataset
)

trainer.train()


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Step,Training Loss


TrainOutput(global_step=20, training_loss=9.795569610595702, metrics={'train_runtime': 31.3761, 'train_samples_per_second': 1.275, 'train_steps_per_second': 0.637, 'total_flos': 61240320000.0, 'train_loss': 9.795569610595702, 'epoch': 20.0})

In [4]:
# Function to Generate Multiple Unique Texts
def generate_text(prompt, num_return_sequences=5):
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=100,
        num_return_sequences=num_return_sequences,
        do_sample=True,       # Enable sampling
        temperature=1.0,      # Increase temperature for more diversity
        top_k=50,             # Limits the sampling pool to top_k tokens
        top_p=0.9,            # Use nucleus sampling; limits to tokens with top_p cumulative probability
        repetition_penalty=1.2 # Apply a penalty to reduce repetitive output
    )
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

# Example Usage
unique_texts = generate_text("Once upon a time", num_return_sequences=5)
for i, text in enumerate(unique_texts):
    print(f"Generated text {i+1}: {text}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text 1: Once upon a time the times I did not like my own job (and even if they'd been good) and as in all things that were necessary.
If you have any more questions then please do this: We believe there was one person who would let me write about something called "Hollow Men's Workout." And it worked really well for three days, because no matter how hard anyone tried to stop him from writing he got into pretty bad form so at least their body can help take
Generated text 2: Once upon a time, with all our knowledge of the human condition that we can imagine and what it is impossible to do because in this very moment everything becomes clear. But there are those who will not believe or even understand these things."
When he spoke at last night's funeral Mass one thing was made painfully evident: The Prophet Muhammad has written an unsearchable book about his life by saying some words so far removed from God himself; but as I have said before they were just more than mere
Generat

In [None]:
# Create a Simple Flask App
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

@app.route('/generate', methods=['POST'])
def generate():
    prompt = request.json['input']
    generated_text = generate_text(prompt)
    return jsonify({'generated_text': generated_text})

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
Exception in thread Thread-7:
Traceback (most recent call last):
  File "C:\Users\kadam\anaconda3\Lib\site-packages\urllib3\connection.py", line 199, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kadam\anaconda3\Lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    raise err
  File "C:\Users\kadam\anaconda3\Lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\kadam\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 789, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kadam\anaconda3\Lib\site-packages\