In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

model_name = "t5-small"
output_dir = "t5-small-model"

model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('t5-small-model\\tokenizer_config.json',
 't5-small-model\\special_tokens_map.json',
 't5-small-model\\spiece.model',
 't5-small-model\\added_tokens.json')

FINE TUNING THE MODEL

In [10]:
import pandas as pd
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, T5ForConditionalGeneration, T5Tokenizer

dataset = pd.read_csv('merged_business.csv')  

#initialize your tokenizer and model based on the downloaded model
tokenizer = T5Tokenizer.from_pretrained("t5-small-model")
model = T5ForConditionalGeneration.from_pretrained("t5-small-model")

#define the preprocess_function
def preprocess_function(row):
    article = row['articles']
    summary = row['summaries']

    #tokenize the article and summary
    inputs = tokenizer("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    labels = tokenizer(summary, return_tensors="pt", max_length=150, truncation=True)

    return {
        'input_ids': inputs['input_ids'].squeeze(),
        'attention_mask': inputs['attention_mask'].squeeze(),
        'labels': labels['input_ids'].squeeze(),
    }

#apply the preprocess_function to tokenize the datasets
tokenized_datasets = dataset.apply(preprocess_function, axis=1)

#define the number of training examples based on your dataset size
num_train_examples = int(len(tokenized_datasets) * 0.8)  # 80% for training

#split your dataset into training and testing
train_dataset = tokenized_datasets.iloc[:num_train_examples]
test_dataset = tokenized_datasets.iloc[num_train_examples:]

#data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#fine-tuning arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="fine-tuned-t5-small-model",  
    overwrite_output_dir=True,  
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    save_total_limit=3,
    num_train_epochs=3,
    save_steps=10_000,
    eval_steps=10_000,
    logging_dir="./logs",
    logging_steps=100,
    predict_with_generate=True,
)

#create a Seq2Seq Trainer
trainer = Seq2SeqTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=test_dataset,  
    tokenizer=tokenizer,
)

#start the fine-tuning process
trainer.train()


100%|██████████| 30/30 [05:22<00:00, 10.74s/it]

{'train_runtime': 322.3023, 'train_samples_per_second': 0.745, 'train_steps_per_second': 0.093, 'train_loss': 1.9931191762288412, 'epoch': 3.0}





TrainOutput(global_step=30, training_loss=1.9931191762288412, metrics={'train_runtime': 322.3023, 'train_samples_per_second': 0.745, 'train_steps_per_second': 0.093, 'train_loss': 1.9931191762288412, 'epoch': 3.0})

CHECKING GENERATION OF ONE ARTICLE

In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

#load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("fine-tuned-t5-small-model")  # Use the correct path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained("fine-tuned-t5-small-model")  # Use the same path as above

#input article
input_article =  "Ad sales boost Time Warner profit, Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL. Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. TimeWarner also has to restate 2000 and 2003 results following a probe by the US Securities Exchange Commission (SEC), which is close to concluding. Time Warner's fourth quarter profits were slightly better than analysts' expectations. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn. Our financial performance was strong, meeting or exceeding all of our full-year objectives and greatly enhancing our flexibility, chairman and chief executive Richard Parsons said. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins. TimeWarner is to restate its accounts as part of efforts to resolve an inquiry into AOL by US market regulators. It has already offered to pay $300m to settle charges, in a deal that is under review by the SEC. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake."
# Tokenize the input article
input_ids = tokenizer("summarize: " + input_article, return_tensors="pt", max_length=512, truncation=True)

#generate a summary
summary_ids = model.generate(input_ids.input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)

#decode the generated summary
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

#print the generated summary
print("Generated Summary:")
print(generated_summary)


Generated Summary:
time Warner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. its profits were buoyed by one-off gains which offset a profit dip. it lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.


GENERATING SUMMARIES OF ALL THE ARTICLES

In [13]:
import os
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

#define the path to the input CSV file containing articles
input_csv_file = "business_articles.csv"

#define the path to the output CSV file with predicted summaries
output_csv_file = "business_summaries_t5.csv"

#define the maximum chunk size for processing (adjust as needed)
chunk_size = 25

#initialize a T5 model and tokenizer (use the path to your fine-tuned model)
model_name = "fine-tuned-t5-small-model"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

#initialize lists to store articles and predicted summaries
article_chunks = []
predicted_summary_chunks = []

#read the input CSV in chunks and generate summaries
total_articles = 0
for chunk in pd.read_csv(input_csv_file, chunksize=chunk_size):
    articles = chunk["Actual"].tolist()
    summaries = []

    for i, article in enumerate(articles, 1):
        input_text = "summarize: " + article
        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
        total_articles += 1
        print(f"Processed {total_articles} articles. {len(articles) - i} articles remaining.")

    article_chunks.extend(articles)
    predicted_summary_chunks.extend(summaries)

#read the existing CSV file with actual summaries
existing_data = pd.read_csv(output_csv_file)

#create a DataFrame with articles and predicted summaries
output_data = pd.DataFrame({"Actual": existing_data["Actual"], "Predicted": predicted_summary_chunks})

#save the DataFrame to the output CSV file
output_data.to_csv(output_csv_file, index=False, encoding="utf-8")

print(f"Predicted summaries appended to {output_csv_file}.")


Processed 1 articles. 24 articles remaining.
Processed 2 articles. 23 articles remaining.
Processed 3 articles. 22 articles remaining.
Processed 4 articles. 21 articles remaining.
Processed 5 articles. 20 articles remaining.
Processed 6 articles. 19 articles remaining.
Processed 7 articles. 18 articles remaining.
Processed 8 articles. 17 articles remaining.
Processed 9 articles. 16 articles remaining.
Processed 10 articles. 15 articles remaining.
Processed 11 articles. 14 articles remaining.
Processed 12 articles. 13 articles remaining.
Processed 13 articles. 12 articles remaining.
Processed 14 articles. 11 articles remaining.
Processed 15 articles. 10 articles remaining.
Processed 16 articles. 9 articles remaining.
Processed 17 articles. 8 articles remaining.
Processed 18 articles. 7 articles remaining.
Processed 19 articles. 6 articles remaining.
Processed 20 articles. 5 articles remaining.
Processed 21 articles. 4 articles remaining.
Processed 22 articles. 3 articles remaining.
Proc

BERT SCORE

In [14]:
import pandas as pd
from evaluate import load

bertscore = load("bertscore")

df = pd.read_csv("business_summaries_t5.csv")

#extract actual and predicted values from the DataFrame
predictions = df["Predicted"].tolist()
references = df["Actual"].tolist()

#compute BERT scores for each pair of actual and predicted values
results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")

#extract individual BERT scores (precision, recall, F1) for each pair
precision = results["precision"]
recall = results["recall"]
f1 = results["f1"]

#create a new DataFrame to store the BERT scores
output_df = pd.DataFrame({"Actual": references, "Predicted": predictions, "Precision": precision, "Recall": recall, "F1": f1})

#save the BERT scores to a new CSV file
output_df.to_csv("bert_scores_output.csv", index=False)

#calculate the final cumulative BERT scores
final_precision = sum(precision) / len(precision)
final_recall = sum(recall) / len(recall)
final_f1 = sum(f1) / len(f1)

#print the final cumulative BERT scores
print("Final Cumulative BERT Scores:")
print("Precision:", final_precision)
print("Recall:", final_recall)
print("F1 Score:", final_f1)


Final Cumulative BERT Scores:
Precision: 0.86978520154953
Recall: 0.7744840514659882
F1 Score: 0.8189856839179993


BLEU SCORE

In [16]:
import pandas as pd
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu

#load BLEU score function from NLTK
def compute_bleu_score(prediction, references):
    return sentence_bleu(references, prediction)

#read data from CSV file
df = pd.read_csv("business_summaries_t5.csv")

#extract actual and predicted values from the DataFrame
predictions = df["Predicted"].tolist()
references_list = df["Actual"].apply(lambda x: [reference.strip() for reference in x.split(',')]).tolist()

#compute BLEU scores for each pair of actual and predicted values
bleu_scores = [compute_bleu_score(prediction, references) for prediction, references in zip(predictions, references_list)]

#create a new DataFrame to store the BLEU scores
output_df = pd.DataFrame({"Actual": df["Actual"], "Predicted": predictions, "BLEU Score": bleu_scores})

#save the BLEU scores to a new CSV file
output_df.to_csv("bleu_scores_output.csv", index=False)

#calculate the average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)

#print the average BLEU score
print("Average BLEU Score:", average_bleu_score)


Average BLEU Score: 0.6647678816485887


ROUGE SCORE

In [15]:
import pandas as pd
from evaluate import load

rouge = load('rouge')

df = pd.read_csv("business_summaries_t5.csv")

#extract actual and predicted values from the DataFrame
predictions = df["Predicted"].tolist()
references = df["Actual"].tolist()

#compute ROUGE scores for each pair of actual and predicted values
results = rouge.compute(predictions=predictions, references=references)

#save the ROUGE scores to a new CSV file
output_df = pd.DataFrame({"Actual": references, "Predicted": predictions, "ROUGE-1": results['rouge1'], "ROUGE-2": results['rouge2'], "ROUGE-L": results['rougeL'], "ROUGE-Lsum": results['rougeLsum']})

output_df.to_csv("rouge_scores_output.csv", index=False)

#print the ROUGE scores
print("ROUGE Scores:")
print("ROUGE-1:", results['rouge1'])
print("ROUGE-2:", results['rouge2'])
print("ROUGE-L:", results['rougeL'])
print("ROUGE-Lsum:", results['rougeLsum'])


ROUGE Scores:
ROUGE-1: 0.340177111036897
ROUGE-2: 0.2352473718909039
ROUGE-L: 0.2493709198443118
ROUGE-Lsum: 0.24992097166199087


METEOR SCORE

In [17]:
import pandas as pd
from nltk.translate.meteor_score import single_meteor_score
from nltk.tokenize import word_tokenize

#define the path to the CSV file containing actual and predicted summaries
csv_file_path = "business_summaries_t5.csv"

#load METEOR score function from NLTK
def compute_meteor_score(prediction, reference):
    return single_meteor_score(reference, prediction)

#read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

#extract actual and predicted values
actual_summaries = df["Actual"].tolist()
predicted_summaries = df["Predicted"].tolist()

#initialize a list to store METEOR scores
meteor_scores = []

#tokenize actual and predicted summaries and compute METEOR scores
for actual, predicted in zip(actual_summaries, predicted_summaries):
    actual_tokens = word_tokenize(actual)
    predicted_tokens = word_tokenize(predicted)
    meteor_scores.append(compute_meteor_score(predicted_tokens, actual_tokens))

#calculate the average METEOR score
average_meteor_score = sum(meteor_scores) / len(meteor_scores)

print(f"Average METEOR Score: {average_meteor_score}")


Average METEOR Score: 0.20869762086067684


SAVING ALL SCORES TO THE DATASET

In [18]:
import pandas as pd
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

#define the path to the CSV file containing actual and predicted summaries
csv_file_path = "business_summaries_t5.csv"

#load BERTScore and ROUGE
bertscore = load("bertscore")
rouge = load('rouge')

#read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

#extract actual and predicted values
actual_summaries = df["Actual"].tolist()
predicted_summaries = df["Predicted"].tolist()

#initialize lists to store scores
bert_precision_scores = []
bert_recall_scores = []
bert_f1_scores = []
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []
rouge_lsum_scores = []
bleu_scores = []
meteor_scores = []

#compute scores for each pair of actual and predicted summaries
for actual, predicted in zip(actual_summaries, predicted_summaries):
    #compute BERTScores
    bert_results = bertscore.compute(predictions=[predicted], references=[actual], model_type="distilbert-base-uncased")
    bert_precision_scores.append(bert_results["precision"])
    bert_recall_scores.append(bert_results["recall"])
    bert_f1_scores.append(bert_results["f1"])

    #compute ROUGE scores
    rouge_result = rouge.compute(predictions=[predicted], references=[actual])

    rouge_1_scores.append(rouge_result['rouge1'])
    rouge_2_scores.append(rouge_result['rouge2'])
    rouge_l_scores.append(rouge_result['rougeL'])
    rouge_lsum_scores.append(rouge_result['rougeLsum'])

    #compute BLEU score
    references = [actual.split()]  #assuming the reference is a list of words
    predicted_tokens = predicted.split()
    bleu_score = sentence_bleu(references, predicted_tokens)
    bleu_scores.append(bleu_score)

    #compute METEOR score
    try:
        meteor_score = single_meteor_score(references[0], predicted)  #use the METEOR function
        meteor_scores.append(meteor_score)
    except Exception as e:
        print(f"Error calculating METEOR score: {e}")
        meteor_scores.append(0.0)  #assign a default score of 0.0 for errors

#create a new DataFrame to store all the scores
scores_df = pd.DataFrame({
    "Actual": actual_summaries,
    "Predicted": predicted_summaries,
    "BERT Precision": bert_precision_scores,
    "BERT Recall": bert_recall_scores,
    "BERT F1": bert_f1_scores,
    "ROUGE-1 F1": rouge_1_scores,
    "ROUGE-2 F1": rouge_2_scores,
    "ROUGE-L F1": rouge_l_scores,
    "ROUGE-Lsum F1": rouge_lsum_scores,
    "BLEU Score": bleu_scores,
    "METEOR Score": meteor_scores
})

#calculate and print the average METEOR score
average_meteor_score = sum(meteor_scores) / len(meteor_scores)
print(f"Average METEOR Score: {average_meteor_score}")

#save all the scores to a new CSV file
output_csv = "all_scores_Q2.csv"
scores_df.to_csv(output_csv, index=False)

print(f"All scores saved to {output_csv}")


Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): time Warner profits jumped 76% to $1.13bn (£600m) for the three months to December. it said fourth quarter sales rose 2% to $11.1bn from $10.9bn. its profits were buoyed by one-off gains which offset a profit dip.
Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): in late trading in new york, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. the u.s. government's willingness to curb spending and rising household savings are factors which may help to reduce it. concerns about the deficit concerns about China remain.
Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): state-owned Rosneft bought the Yugansk unit for $9.3bn in a sale forced by Russia to part settle a $27.5bn tax claim against Yukos. the company has said it intends to take action against menatep to recover some of the tax cl

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): the demand for $280bn (£155bn) - filed by the Clinton administration in 1999 - was rejected in a 2-1 decision. the case could not be brought under federal anti-racketeering laws.
Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): the firm's revenue nearly tripled in the fourth quarter of 2004, exceeding $86m (£46m) the firm's $17m profit for the quarter was dwarfed by the $204m announced by rival Google earlier in the week.
Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): millions of Indonesians use kerosene for basic cooking. government has said it wants to curb fuel subsidies. critics argue cutting subsidies will hurt poorer families.
Error calculating METEOR score: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): the two firms signed a Memorandum of Understanding. they expect to seal a f