In [3]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Load CSV data into a DataFrame
df = pd.read_csv('../../data/data-extracted.csv')

# Create prompts and responses
def create_prompt(row):
    return f"Extract the {row['entity_name']} from the following text: {row['extracted_text']}"

def create_response(row):
    return row['entity_value']

df['prompt'] = df.apply(create_prompt, axis=1)
df['response'] = df.apply(create_response, axis=1)

# Convert DataFrame to Dataset
dataset = Dataset.from_pandas(df[['prompt', 'response']])

# Split dataset into training and evaluation
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Initialize Tokenizer and Model
model_name = 'gpt2'  # or 'distilgpt2' for a smaller model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set pad_token to eos_token if it does not exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the Dataset
def tokenize_function(examples):
    prompts = tokenizer(examples['prompt'], padding='max_length', truncation=True, max_length=512)
    responses = tokenizer(examples['response'], padding='max_length', truncation=True, max_length=128)
    
    # Ensure labels are correctly aligned with inputs
    labels = responses['input_ids']
    
    # Align the lengths of inputs and labels
    max_length = len(prompts['input_ids'][0])
    labels = [l[:max_length] + [tokenizer.pad_token_id] * (max_length - len(l)) for l in labels]
    
    return {
        'input_ids': prompts['input_ids'],
        'attention_mask': prompts['attention_mask'],
        'labels': labels
    }

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['prompt', 'response'])

# DataLoader to handle batching
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=2, collate_fn=default_data_collator)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=2, collate_fn=default_data_collator)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,   # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    save_total_limit=3,
    fp16=True,
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=default_data_collator,  # Automatically pads the data
)

# Start Training
trainer.train()

# Save the Fine-Tuned Model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

# Example Usage
from transformers import pipeline

# Load the fine-tuned model
generator = pipeline('text-generation', model='./fine_tuned_gpt2', tokenizer=model_name)

# Example prompt
prompt = "Extract the item_weight from the following text: NATURE PROP@S' DESODORISER LE LINGE. QUI PERMETIENT LE NETTOYAGE & SEC DES TACHES RECALITRANTES SUR TOUTES IES SURFACES (MOQUETTE TAPIS, PARQUET...).ELLE EST AUST EFRICACE P @INGREDIENT : BENTONITE 100%) DOSAGE CONSEELLE : SELON USAGE. @ 5000 g STOCKAGE : DENG ON EMPRIAGE FORMS, A FABIL OR IN OR PROCOUNTIONS : TENER BARS BE PORDE DES ENTANTS INGREDIENT MENAGER ANTISION LOT: PNSON202021:003 80103 7585 ** PROP: LABORATORE PROPOS'NATURE MULTI-USAGE SOMEARIES TERRE DE 100% NATUREL DE ARGLLE 100% PURE ET NATUREILE, LA TEREE SOMMIERES PRESENTE DES PROPRIETES ABSORBANTES"

response = generator(prompt, max_length=50)
print(response)



Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/33 [00:00<?, ?it/s]

Checkpoint destination directory ./checkpoint-10 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 8.3566, 'learning_rate': 1.7575757575757576e-05, 'epoch': 0.89}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 3.4077682495117188, 'eval_runtime': 1.7637, 'eval_samples_per_second': 5.67, 'eval_steps_per_second': 2.835, 'epoch': 0.98}


Checkpoint destination directory ./checkpoint-20 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 1.2943, 'learning_rate': 1.1515151515151517e-05, 'epoch': 1.78}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.08404259383678436, 'eval_runtime': 1.4338, 'eval_samples_per_second': 6.974, 'eval_steps_per_second': 3.487, 'epoch': 1.96}


Checkpoint destination directory ./checkpoint-30 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0754, 'learning_rate': 5.4545454545454545e-06, 'epoch': 2.67}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.041352976113557816, 'eval_runtime': 1.8493, 'eval_samples_per_second': 5.407, 'eval_steps_per_second': 2.704, 'epoch': 2.93}
{'train_runtime': 237.3229, 'train_samples_per_second': 1.125, 'train_steps_per_second': 0.139, 'train_loss': 2.952164835099018, 'epoch': 2.93}


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Extract the item_weight from the following text: NATURE PROP@S' DESODORISER LE LINGE. QUI PERMETIENT LE NETTOYAGE & SEC DES TACHES RECALITRANTES SUR TOUTES IES SURFACES (MOQUETTE TAPIS, PARQUET...).ELLE EST AUST EFRICACE P @INGREDIENT : BENTONITE 100%) DOSAGE CONSEELLE : SELON USAGE. @ 5000 g STOCKAGE : DENG ON EMPRIAGE FORMS, A FABIL OR IN OR PROCOUNTIONS : TENER BARS BE PORDE DES ENTANTS INGREDIENT MENAGER ANTISION LOT: PNSON202021:003 80103 7585 ** PROP: LABORATORE PROPOS'NATURE MULTI-USAGE SOMEARIES TERRE DE 100% NATUREL DE ARGLLE 100% PURE ET NATUREILE, LA TEREE SOMMIERES PRESENTE DES PROPRIETES ABSORBANTES"}]


In [5]:
response = generator("How are you?", max_length=50)
print(response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'How are you?'}]
