# GPT 2 for Test Issues


In [None]:
import pandas as pd

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Load the 'gpt2' checkpoint and its tokenizer.
# NOTE(review): train() and generate_text() further down load their own
# copies, so these two globals look unused by the later cells - confirm.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

## Anderer Ansatz von Kaggle Notebook

https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners

In [None]:
import re

In [None]:
def cleaning(s):
    """Normalize a piece of raw text for use in the training corpus.

    The value is coerced with ``str()`` and then processed in this order:

    1. whitespace followed by a non-word character -> single space
    2. non-word character + comma + whitespace -> single space
    3. digit runs are removed
    4. whitespace runs are collapsed to one space
    5. the characters ``! @ # $ _`` are removed
    6. the standalone tokens ``https`` and ``co`` (URL residue such as
       ``https://t.co/...``) are removed

    Args:
        s: Any object; it is converted to ``str`` first.

    Returns:
        The cleaned string.
    """
    s = str(s)
    # Raw strings: '\s', '\W', '\d' are invalid escapes in normal literals
    # and trigger a SyntaxWarning on recent Python versions.
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W,\s', ' ', s)
    s = re.sub(r'\d+', '', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[!@#$_]', '', s)
    # Only drop "https" / "co" when they are whole tokens. A plain
    # str.replace("co", "") also mangled ordinary words
    # ("coffee" -> "ffee", "cool" -> "ol").
    s = re.sub(r'\b(?:https|co)\b', '', s)
    # NOTE(review): this is a *literal* replacement of the four characters
    # "[\w*"; it looks like a regex was intended. Kept as-is rather than
    # guessing the pattern.
    s = s.replace(r"[\w*", " ")
    return s

In [None]:
# Load the fine-tuning reviews; the CSV is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv('fine_tuning_data.csv', encoding="ISO-8859-1")


# Drop rows with missing values and make sure the text column holds strings.
df = df.dropna()
df['text'] = df['text'].astype(str)

print(df.shape)
print(df.head())

# Write all reviews into one plain-text corpus file.
# - UTF-8: transformers' TextDataset opens its input file as UTF-8, so a
#   Latin-1 encoded corpus breaks on any non-ASCII character.
# - One review per line: without a separator the last word of a review is
#   fused with the first word of the next one.
# - `with`: the file is closed even if writing fails part-way.
with open('reviews.txt', 'w', encoding='utf-8') as text_data:
    text_data.writelines(f"{article}\n" for article in df['text'])

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Create a ``TextDataset`` for the text file at ``file_path``.

    Args:
        file_path: Path of the plain-text training file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Value forwarded as ``TextDataset``'s ``block_size``.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (off by default).

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 checkpoint on a plain-text file and save the result.

    Args:
        train_file_path: Path of the text file used as training data.
        model_name: Checkpoint name or path given to ``from_pretrained``.
        output_dir: Directory that receives the tokenizer, the checkpoints
            and the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.

    Returns:
        None. The trained model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir can be loaded
    # with from_pretrained() for both.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-trained weights; trainer.save_model() below is
    # called after training with the same TrainingArguments.output_dir.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never passed on, so the
        # caller's value was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Fine-tuning settings - adjust these before running the training cell.
train_file_path = "reviews.txt"  # text corpus written by the data-preparation cell
model_name = 'gpt2'  # checkpoint name handed to from_pretrained() inside train()
output_dir = 'result'  # train() saves tokenizer and model here; generate_text() loads from "result"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 1.0
save_steps = 100  # passed to train() as save_steps

In [None]:
# Run the fine-tuning with the settings defined in the previous cell.
# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

In [5]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="result"):
    """Sample one continuation of ``sequence`` and print it.

    Args:
        sequence: Prompt text; it is converted to ``str`` before tokenizing.
        max_length: Passed to ``model.generate`` as ``max_length``.
        model_path: Directory (or checkpoint name) from which both the model
            and the tokenizer are loaded. Defaults to ``"result"``, the
            directory that was previously hard-coded here.

    Returns:
        None. The decoded text is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() needs to tell prompt tokens from padding when
    # pad_token_id equals eos_token_id.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

# Prompt to continue, and the value handed to generate_text() as max_length.
sequence = "I was at a nice restaurant and the food was"
max_len = 100
print("starting")
generate_text(sequence, max_len)  # prints one sampled continuation of the prompt

starting
I was at a nice restaurant and the food was good. He gave me the same thing as the waitress but in English instead.

I tried all the ingredients in the restaurant, but they all came in a box. The salad was terrible and the pasta was terrible. It was a great deal and I really like the salad. I ordered the beef stock for the second time in 2 weeks. It was horrible. This is one of the better sushi at the place. I got my lunch,


### versuch anders


In [None]:
# df_gpt2 = pd.read_csv('fine_tuning_data.csv')
# print(df_gpt2.shape)


# reviews = df_gpt2['text'].tolist()  
# reviews = [review.strip() for review in reviews]  
# reviews = [review for review in reviews if review]

# # Join all reviews into a single string for tokenization, separated by single spaces

# data = ' '.join(reviews)


# # Tokenize the dataset
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# input_ids = tokenizer.encode(data, return_tensors='pt')

# # Finetune the model on the dataset
# model = GPT2LMHeadModel.from_pretrained('gpt2')
# model.train()
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# for i in range(1000):
#     outputs = model(input_ids, labels=input_ids)
#     loss = outputs[0]
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()

# # Generate a fake review for a restaurant
# keyword = 'restaurant'
# input_text = 'I went to the ' + keyword + ' and '
# input_ids = tokenizer.encode(input_text, return_tensors='pt')
# generated = model.generate(input_ids, do_sample=True, max_length=100)
# generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)

# print(generated_text)


# GPT 3

In [None]:
import json
import openai

# How to fine-tune a GPT-3 model for specific prompts

I'm constantly looking for ways to automate the work with support requests. An idea has been to fine-tune a GPT-3 model to answer common support-related questions.

**Here's how you can fine-tune a GPT-3 model with Python with your own data.**

In this walkthrough, we'll fine-tune a GPT-3 model to answer common support-related questions.

Detailed step-by-step instructions for this repo in this blog post: https://norahsakal.com/blog/fine-tune-gpt3-model

# Define OpenAI API keys

In [None]:
# Read the OpenAI API key from a local file instead of hard-coding it.
# strip(): a trailing newline in the file would otherwise become part of
# the key.
with open('apikey_openai.txt', 'r') as f:
    api_key = f.read().strip()

openai.api_key = api_key

# Create training data

Make sure to end each `prompt` with a suffix. According to the [OpenAI API reference](https://beta.openai.com/docs/guides/fine-tuning "fine-tuning reference"), you can use ` ->`.

Also, make sure to end each `completion` with a suffix as well; I'm using `.\n`.

In [None]:
# Placeholder training examples: every prompt ends with " ->" and every
# completion starts with a space and ends with ".\n".
data_file = [
    {"prompt": "Prompt ->", "completion": " Ideal answer.\n"},
    {"prompt": "Prompt ->", "completion": " Ideal answer.\n"},
]

In [None]:
# Show the training examples before they are written to disk.
print(data_file)

# Save dict as JSONL

Training data need to be a JSONL document.
JSONL file is a newline-delimited JSON file.
More info about JSONL: https://jsonlines.org/

In [None]:
file_name = "training_data.jsonl"

# JSONL: one JSON-encoded training example per line.
with open(file_name, 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in data_file)

print("Done")
print(file_name)

# Check JSONL file

In [None]:
!openai tools fine_tunes.prepare_data -f training_data.jsonl

# Upload file to your OpenAI account

In [None]:
# Upload the JSONL training file. The handle is opened in a `with` block so
# it is closed once the upload call returns (it was previously left open).
with open(file_name, "rb") as training_file:
    upload_response = openai.File.create(
      file=training_file,
      purpose='fine-tune'
    )
upload_response

# Save file ID

In [None]:
# Keep the ID of the uploaded file; it is passed as training_file when the
# fine-tune is created.
file_id = upload_response.id
file_id

# Fine-tune a model

The default model is **Curie**. 

If you'd like to use **DaVinci** instead, then add it as a base model to fine-tune:

```openai.FineTune.create(training_file=file_id, model="davinci")```

In [None]:
# Create the fine-tune job for the uploaded training file (no model argument
# is given, so the API's default base model is used).
fine_tune_response = openai.FineTune.create(training_file=file_id)
fine_tune_response

# Check fine-tune progress

Check the progress with `openai.FineTune.list_events(id=fine_tune_response.id)` and get a list of all the fine-tuning events

In [None]:
# List the events recorded so far for this fine-tune job.
fine_tune_events = openai.FineTune.list_events(id=fine_tune_response.id)
fine_tune_events

Check the progress with `openai.FineTune.retrieve(id=fine_tune_response.id)` and get an object with the fine-tuning job data

In [None]:
# Fetch the current state of this fine-tune job by its ID.
retrieve_response = openai.FineTune.retrieve(id=fine_tune_response.id)
retrieve_response

# Save fine-tuned model

### Troubleshooting fine_tuned_model as null
During the fine-tuning process, the **fine_tuned_model** key may not be immediately available in the fine_tune_response object returned by `openai.FineTune.create()`.

To check the status of your fine-tuning process, you can call the `openai.FineTune.retrieve()` function and pass in the **fine_tune_response.id**. This function will return a JSON object with information about the training status, such as the current epoch, the current batch, the training loss, and the validation loss.

After the fine-tuning process is complete, you can check the status of all your fine-tuned models by calling `openai.FineTune.list()`. This will list all of your fine-tunes and their current status.

Once the fine-tuning process is complete, you can retrieve the fine_tuned_model key by calling the `openai.FineTune.retrieve()` function again and passing in the fine_tune_response.id. This will return a JSON object with the key fine_tuned_model and the ID of the fine-tuned model that you can use for further completions.

### Option 1

If `fine_tune_response.fine_tuned_model != None` then the key **fine_tuned_model** is available from the fine_tune_response object

In [None]:
# Option 1: the create() response already carries the model name.
# `is not None` is the identity check for the None singleton.
if fine_tune_response.fine_tuned_model is not None:
    fine_tuned_model = fine_tune_response.fine_tuned_model

### Option 2

If `fine_tune_response.fine_tuned_model == None:` you can get the **fine_tuned_model** by listing all fine-tune jobs

In [None]:
# Option 2: look the model name up in the list of all fine-tune jobs.
if fine_tune_response.fine_tuned_model is None:
    fine_tune_list = openai.FineTune.list()
    # Select this job by its ID. Taking element [0] returns whichever job
    # happens to be first in the account's list, which is a different
    # fine-tune as soon as more than one exists.
    fine_tuned_model = next(
        job.fine_tuned_model
        for job in fine_tune_list['data']
        if job.id == fine_tune_response.id
    )

### Option 3

If `fine_tune_response.fine_tuned_model == None:` you can get the **fine_tuned_model** key by retrieving the fine-tune job

In [None]:
# Option 3: re-fetch this job by ID and read the model name from it.
# `is None` is the identity check for the None singleton.
if fine_tune_response.fine_tuned_model is None:
    fine_tuned_model = openai.FineTune.retrieve(id=fine_tune_response.id).fine_tuned_model

# Test the new model on a new prompt

Remember to end the prompt with the same suffix as we used in the training data; ` ->`:

In [None]:
# Placeholder prompt; it ends with the same " ->" suffix as the training prompts.
new_prompt = "NEW PROMPT ->"

In [None]:
# Ask the fine-tuned model to complete the new prompt and show the text of
# the first returned choice.
# NOTE(review): no stop sequence is passed although the training completions
# end with ".\n" - confirm whether a stop argument should be added here.
answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt,
  max_tokens=10, # Change amount of tokens for longer completion
  temperature=0
)
answer['choices'][0]['text']