In [1]:
from datasets import load_dataset
import pandas as pd
from pathlib import Path

DATA_PATH = Path('data/finetune_gpt/')
DATA_PATH.mkdir(parents=True, exist_ok=True)
dataset = load_dataset("statworx/haiku", cache_dir=DATA_PATH / 'model_cache')

updated_data = [{'text': item['text'], 'keywords': item['keywords']} for item in dataset['train']]
data_df = pd.DataFrame(updated_data)
work_data = data_df.dropna(subset=['text', 'keywords'])
work_data = work_data.drop_duplicates(subset=['text']).reset_index(drop=True)
work_data['text'] = work_data['text'].str.replace(' / ', ' ')
work_data = work_data[:15000]

In [2]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from pathlib import Path

class FineTuner:
    def __init__(self, 
                 model_name='gpt2', 
                 cache_dir='model_cache',
                 data_path=DATA_PATH):
        self.data_path = Path(data_path)
        
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=str(self.data_path / cache_dir))
        self.model = GPT2LMHeadModel.from_pretrained(model_name, cache_dir=str(self.data_path / cache_dir))

    def prepare_data(self, df):
        
        df['input'] = df.apply(lambda row: f"{row['keywords']} {self.tokenizer.eos_token}", axis=1)
        
        df['output'] = df.apply(lambda row: f"{row['text']} {self.tokenizer.eos_token}", axis=1)

        print(df)
        
        dataset_path = self.data_path / 'train_dataset.txt'

        with dataset_path.open('w', encoding='utf-8') as file:
            for input_text, target_text in zip(df['input'], df['output']):
                file.write(input_text + ' ' + target_text + '\n')
        return dataset_path

    def fine_tune(self, 
                  dataset_path, 
                  output_name='fine_tuned_model', 
                  num_train_epochs=16, 
                  per_device_train_batch_size=4, 
                  learning_rate=5e-5, 
                  save_steps=10_000):

        train_dataset = TextDataset(
            tokenizer=self.tokenizer,
            file_path=str(dataset_path),
            block_size=256
        )

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False
        )

        training_args = TrainingArguments(
            output_dir=str(self.data_path / output_name),
            overwrite_output_dir=True,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            save_steps=save_steps,
            learning_rate=learning_rate,
            save_total_limit=2,
            logging_dir=str(self.data_path / 'logs')
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
        )

        trainer.train()
        
        self.model.save_pretrained(str(self.data_path / output_name))
        self.tokenizer.save_pretrained(str(self.data_path / output_name))




In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from pathlib import Path

class TextGenerator:
    def __init__(self, model_name='fine_tuned_model', data_path=DATA_PATH):
        model_path = Path(data_path) / model_name
        self.tokenizer = GPT2Tokenizer.from_pretrained(str(model_path))
        self.model = GPT2LMHeadModel.from_pretrained(str(model_path))
        self.model.eval()

    def generate_text(self, 
                    keywords: str,
                    max_length=120, 
                    num_return_sequences=1, 
                    temperature=1.0, 
                    top_k=0, 
                    top_p=1.0, 
                    do_sample=False):

        prompt_text = f"{keywords} {self.tokenizer.eos_token} "
        
        encoded_input = self.tokenizer.encode(prompt_text, return_tensors='pt')
        
        outputs = self.model.generate(
            encoded_input,
            max_length=max_length + len(encoded_input[0]),
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=do_sample,
            no_repeat_ngram_size=2
        )

        for output in outputs:
            print(output)
        
        all_texts = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        
        prompt_length = len(self.tokenizer.decode(encoded_input[0], skip_special_tokens=True))
        trimmed_texts = [text[prompt_length:] for text in all_texts]
        
        return {
            "full_texts": all_texts,
            "generated_texts": trimmed_texts
        }

In [4]:
finetuner = FineTuner()
dataset_path = finetuner.prepare_data(work_data)

                                                    text       keywords  \
0      Delicate savage. You'll never hold the cinder....         cinder   
1      A splash and a cry. Words pulled from the rive...  the riverside   
2      Steamy, mist rising. Rocks receiving downward ...    mist rising   
3      You were broken glass. But I touched you even ...   broken glass   
4      Eyes dance with firelight. The Moon and I are ...     eyes dance   
...                                                  ...            ...   
14995  So y'all just gonna. Force me to listen to thi...      listen to   
14996  Starlin Castro beat. That throw just like Addi...            his   
14997  I said the moment. I stopped having fun with i...     the moment   
14998  If y'all start acting. Like shit again after t...         ya mom   
14999  People worry, too. Much nowadays they forget. ...    they forget   

                             input  \
0             cinder <|endoftext|>   
1      the riverside <|

In [5]:
finetuner.fine_tune(dataset_path, output_name='fine_tuned_model_gpt_2')



Step,Training Loss
500,3.1462
1000,2.7916
1500,2.6226
2000,2.4642
2500,2.3652
3000,2.2543
3500,2.1737
4000,2.1001
4500,2.0425
5000,1.9951


In [6]:
u_keywords = work_data['keywords'].unique().tolist()
u_keywords[:10]

['cinder',
 'the riverside',
 'mist rising',
 'broken glass',
 'eyes dance',
 'haiku',
 'quit friend',
 'wind warms',
 'lion limped',
 'bloody scalpel']

In [8]:
keywords = u_keywords[5]

generator = TextGenerator(
    model_name='fine_tuned_model_gpt_2',
    data_path=DATA_PATH
)
generated_texts = generator.generate_text(
    keywords=keywords,
    max_length=16,
    num_return_sequences=3,
    do_sample=True,
    temperature=0.8,
    top_k=10,
    top_p=0.8
)
for i, text in enumerate(generated_texts['generated_texts']):
    print(f"Generated Text {i+1}: {text}")

for i, text in enumerate(generated_texts['full_texts']):
    print(f"Full Text {i+1}: {text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([ 3099, 28643,   220, 50256,   220, 22503,   416,   317, 23013,    13,
          383,  3670,   286,   428,   387, 28643,    13,   632,   338,  1790,
           11])
tensor([ 3099, 28643,   220, 50256,   220, 22503,   416, 32840,  2584,    84,
           13,   383,   938,  6827, 21784,   340,   477,   510,    13,   314,
         2630])
tensor([ 3099, 28643,   220, 50256,   220, 22503,   416,   317,    13,   311,
           13, 32801,    11,   428,   387, 28643,   318,    13, 23762,   290,
         1479])
Generated Text 1:  Written by Aoi. The title of this haiku. It's short,
Generated Text 2:  Written by Ikkyu. The last sentence sums it all up. I wrote
Generated Text 3:  Written by A. S. Dawson, this haiku is. Beautiful and free
Full Text 1: haiku   Written by Aoi. The title of this haiku. It's short,
Full Text 2: haiku   Written by Ikkyu. The last sentence sums it all up. I wrote
Full Text 3: haiku   Written by A. S. Dawson, this haiku is. Beautiful and free
