In [34]:
from transformers import EncoderDecoderModel, CamembertTokenizer

# Local directories from which the fine-tuned model and its tokenizer are loaded.
model_load_path = "./my_fine_tuned_model"
tokenizer_load_path = "./my_fine_tuned_tokenizer"

# Load the CamemBERT tokenizer from its saved directory. The summarization
# helper defined later in this notebook uses it both to encode the input text
# and to decode the generated token ids.
tokenizer = CamembertTokenizer.from_pretrained(tokenizer_load_path)

# Load the fine-tuned encoder-decoder model that generates the summaries.
model = EncoderDecoderModel.from_pretrained(model_load_path)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import pandas as pd

In [7]:
# Validation split; later cells read its "text" and "titles" columns.
validation_df = pd.read_csv('data/validation.csv')

In [8]:
from rouge_score import rouge_scorer

# Scorer restricted to the ROUGE-L metric.
# NOTE(review): rouge_score's default tokenizer appears to keep only [a-z0-9]
# characters — confirm this is acceptable for accented (French) text.
scorer = rouge_scorer.RougeScorer(['rougeL'])



In [21]:
from tqdm import tqdm

In [30]:
def transformer_summary(text: pd.core.series.Series):
    """Generate one summary per entry of ``text`` with the fine-tuned model.

    Relies on the module-level ``tokenizer`` and ``model`` objects, which must
    already be loaded when this function is called.

    Args:
        text: Series of documents to summarize. Each entry is encoded
            separately and truncated to at most 512 tokens.

    Returns:
        A list of ``[position, summary]`` pairs, where ``position`` is the
        0-based position of the entry in ``text`` (not its index label) and
        ``summary`` is the decoded generation with special tokens removed.
    """
    progress = tqdm(text, total=len(text))
    results = []
    for position, document in enumerate(progress):
        # Encode a single document as a batch of one.
        encoded = tokenizer.encode(
            document,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        generated = model.generate(encoded)
        # generate() returns a batch; the only sequence is at index 0.
        summary = tokenizer.decode(generated[0], skip_special_tokens=True)
        results.append([position, summary])
    return results

In [None]:
# Generate one summary per validation article; the result is a list of
# [position, summary] pairs in input order.
transformer_summaries = transformer_summary(validation_df["text"])

In [None]:
# Compute the ROUGE-L F-measure of each generated summary against its reference title.
# Titles and summaries are paired by position: transformer_summary() numbers its
# results 0..n-1 in input order, so looking them up by DataFrame index label is
# only correct for a default RangeIndex. This also avoids Series.iteritems(),
# which was removed in pandas 2.0.
assert len(transformer_summaries) == len(validation_df), \
    "expected exactly one generated summary per validation row"
transformer_rouge = [
    # RougeScorer.score takes (target, prediction); .fmeasure is the F-score field.
    scorer.score(title, summary)['rougeL'].fmeasure
    for title, (_, summary) in zip(validation_df['titles'], transformer_summaries)
]
    

In [None]:
# Mean of the per-example ROUGE-L F-measures.
# Raises ZeroDivisionError if transformer_rouge is empty.
avg_rouge_score_transformer = sum(transformer_rouge) / len(transformer_rouge)
print("Average Rouge-L F-Score with transformer: ", avg_rouge_score_transformer)

In [31]:
# Summarize the test articles with the same helper used for validation.
# The recorded run took roughly 15 minutes for 1500 rows.
test_df = pd.read_csv('data/test_text.csv')
transformer_summaries_test = transformer_summary(test_df["text"])

100%|██████████| 1500/1500 [14:44<00:00,  1.70it/s]


In [32]:
# Turn the [position, summary] pairs into a two-column frame and write it out.
# "ID" is the 0-based row position assigned by transformer_summary, not a
# column read from the test file.
transformer_submission_df = pd.DataFrame(transformer_summaries_test, columns=['ID', 'titles'])
transformer_submission_df.to_csv('transformer_submission.csv', index=False)
