In [1]:
import pandas as pd
from transformers_ import CamembertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
from datasets import Dataset, load_metric
from transformers_ import CamembertConfig


# Load and prepare the dataset
train_df = pd.read_csv('data/train.csv')
validation_df = pd.read_csv('data/validation.csv')

2024-03-31 15:39:25.734066: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 15:39:25.734088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 15:39:25.734724: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-31 15:39:25.738473: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Initialize the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
config = CamembertConfig.from_pretrained('camembert-base')
decoder_start_token_id = tokenizer.cls_token_id
config.decoder_start_token_id = decoder_start_token_id

print("Tokenizer initialized successfully!")


In [3]:
# Preprocess the data
def preprocess_data(examples):
    inputs = examples['text']
    targets = examples['titles']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
train_dataset = Dataset.from_pandas(train_df).map(preprocess_data, batched=True)
validation_dataset = Dataset.from_pandas(validation_df).map(preprocess_data, batched=True)

print("Data preprocessed successfully!")


In [5]:
# Load the model
model = EncoderDecoderModel.from_encoder_decoder_pretrained('camembert-base', 'camembert-base')
# Explicitly set the decoder_start_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id
# Ensure the pad_token_id is also set for the model
model.config.pad_token_id = tokenizer.pad_token_id
# It's also a good idea to check if eos_token_id needs to be set for your model
model.config.eos_token_id = tokenizer.eos_token_id

print("Model loaded successfully!")

Some weights of CamembertForCausalLM were not initialized from the model checkpoint at camembert-base and are newly initialized: ['roberta.encoder.layer.0.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.0.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.0.crossattention.output.dense.bias', 'roberta.encoder.layer.0.crossattention.output.dense.weight', 'roberta.encoder.layer.0.crossattention.self.key.bias', 'roberta.encoder.layer.0.crossattention.self.key.weight', 'roberta.encoder.layer.0.crossattention.self.query.bias', 'roberta.encoder.layer.0.crossattention.self.query.weight', 'roberta.encoder.layer.0.crossattention.self.value.bias', 'roberta.encoder.layer.0.crossattention.self.value.weight', 'roberta.encoder.layer.1.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.1.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.1.crossattention.output.dense.bias', 'roberta.encoder.layer.1.crossattention.output.dense.weight', 'roberta.encoder

Model loaded successfully!


In [6]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

print("Training arguments initialized successfully!")

Training arguments initialized successfully!


In [7]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
print("Trainer initialized successfully!")


Trainer initialized successfully!


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [8]:
# Train the model
trainer.train()

print("Training completed successfully!")

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Epoch,Training Loss,Validation Loss
1,1.2004,1.044489
2,0.919,0.979631
3,0.832,0.967372


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Training completed successfully!


In [11]:
# Save the fine-tuned model & tokenizer
model_save_path = "./my_fine_tuned_model"
tokenizer_save_path = "./my_fine_tuned_tokenizer"

# Saving the model
model.save_pretrained(model_save_path)

# Saving the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)


('./my_fine_tuned_tokenizer/tokenizer_config.json',
 './my_fine_tuned_tokenizer/special_tokens_map.json',
 './my_fine_tuned_tokenizer/sentencepiece.bpe.model',
 './my_fine_tuned_tokenizer/added_tokens.json')