In [6]:
import pandas as pd

# Read CSV file with 'ISO-8859-1' encoding
df = pd.read_csv('../Dataset/train.csv', encoding='ISO-8859-1')


In [9]:
# Clean and convert to UTF-8
def clean_text(text):
    # Remove unsupported characters
    cleaned_text = ''.join(char for char in text if ord(char) < 128)
    # Encode to UTF-8
    utf8_text = cleaned_text.encode('utf-8', errors='ignore').decode('utf-8')
    return utf8_text

# Apply cleaning function to specific columns in DataFrame
df['summary'] = df['summary'].apply(clean_text)


In [10]:
# Save cleaned DataFrame to a new CSV file in UTF-8 format
df.to_csv('cleaned_file.csv', index=False, encoding='utf-8')


In [None]:
import pandas as pd
from transformers import T5Tokenizer

# Load data from CSV file
df = pd.read_csv('train.csv')  # Replace 'your_file.csv' with the path to your CSV file

# Extract articles and summaries from the DataFrame
articles = df['article'].tolist()
summaries = df['summary'].tolist()

# Tokenize articles and summaries
tokenizer = T5Tokenizer.from_pretrained('t5-large')
tokenized_articles = tokenizer(articles, truncation=True, padding=True)
tokenized_summaries = tokenizer(summaries, truncation=True, padding=True)

# Create tokenized dataset
tokenized_dataset = {
    'input_ids': tokenized_articles.input_ids,
    'attention_mask': tokenized_articles.attention_mask,
    'labels': tokenized_summaries.input_ids,
}


In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load your CSV file using pandas
df = pd.read_csv('train.csv', encoding='ISO-8859-1')

# Create a custom dataset using the 'article' column
custom_dataset = Dataset.from_pandas(df)

# Tokenize the custom dataset
tokenizer = T5Tokenizer.from_pretrained('t5-large')

def tokenize_function(example):
    inputs = tokenizer(example['article'], padding='max_length', truncation=True, return_tensors='pt')
    inputs['labels'] = inputs.input_ids.clone()
    return inputs

tokenized_dataset = custom_dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='Results',          # Output directory for the model
    num_train_epochs=3,            # Number of training epochs
    per_device_train_batch_size=1, # Batch size per device during training
    save_steps=500,                # Save model every 500 steps
    save_total_limit=2,            # Only last 2 models are saved. Older ones are deleted.
)

# Load pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained('t5-large')

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = 'Results'  # Path to the directory where your trained model is saved
model = T5ForConditionalGeneration.from_pretrained("/content/Results/checkpoint-1500")
tokenizer = T5Tokenizer.from_pretrained('t5-large')


In [None]:
article = "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.Â 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking? Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches . Cynthia Corbertt, a human factors researcher with the Federal Aviation Administration, that it conducts tests on how quickly passengers can leave a plane. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch. While most airlines stick to a pitch of 31 inches or above, some fall below this. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches. British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31."

# Tokenize the article
inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=1024, truncation=True)

# Generate the summary
summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Generated Summary:", summary)


In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load your trained model and tokenizer
model_path = 'Results'  # Path to the directory where your trained model is saved
model = T5ForConditionalGeneration.from_pretrained("/content/Results/checkpoint-1500")
tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=1024)

# Load articles from the CSV file
df = pd.read_csv('/content/train.csv', encoding='utf-8')  # Replace 'your_input_file.csv' with your CSV file path
articles = df['article'].tolist()

# Generate summaries for each article
summaries = []
for article in articles:
    inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)

# Add the summaries to the DataFrame
df['summary'] = summaries

# Save the DataFrame with summaries to a new CSV file
df.to_csv('output_file.csv', index=False, encoding='utf-8')  # 'output_file.csv' is the name of the new CSV file


In [1]:
import pandas as pd
from evaluate import load

# Load the BERT score model
bertscore = load("bertscore")

# Read data from CSV file
df = pd.read_csv("../Dataset/final_q3.csv")

# Extract actual and predicted values from the DataFrame
predictions = df["Predicted"].tolist()
references = df["Actual"].tolist()

# Compute BERT scores for each pair of actual and predicted values
results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")

# Extract individual BERT scores (precision, recall, F1) for each pair
precision = results["precision"]
recall = results["recall"]
f1 = results["f1"]

# Create a new DataFrame to store the BERT scores
output_df = pd.DataFrame({"Actual": references, "Predicted": predictions, "Precision": precision, "Recall": recall, "F1": f1})

# Save the BERT scores to a new CSV file
output_df.to_csv("bert_scores_output.csv", index=False)

# Calculate the final cumulative BERT scores
final_precision = sum(precision) / len(precision)
final_recall = sum(recall) / len(recall)
final_f1 = sum(f1) / len(f1)

# Print the final cumulative BERT scores
print("Final Cumulative BERT Scores:")
print("Precision:", final_precision)
print("Recall:", final_recall)
print("F1 Score:", final_f1)


  from .autonotebook import tqdm as notebook_tqdm


Final Cumulative BERT Scores:
Precision: 0.7828612419366836
Recall: 0.8422643060684204
F1 Score: 0.8110984798669815


In [2]:
import pandas as pd
from evaluate import load

# Load the ROUGE score model
rouge = load('rouge')

# Read data from CSV file
df = pd.read_csv("../Dataset/final_q3.csv")

# Extract actual and predicted values from the DataFrame
predictions = df["Predicted"].tolist()
references = df["Actual"].tolist()

# Compute ROUGE scores for each pair of actual and predicted values
results = rouge.compute(predictions=predictions, references=references)

# Save the ROUGE scores to a new CSV file
output_df = pd.DataFrame({"Actual": references, "Predicted": predictions, "ROUGE-1": results['rouge1'], "ROUGE-2": results['rouge2'], "ROUGE-L": results['rougeL'], "ROUGE-Lsum": results['rougeLsum']})

output_df.to_csv("rouge_scores_output.csv", index=False)

# Print the ROUGE scores
print("ROUGE Scores:")
print("ROUGE-1:", results['rouge1'])
print("ROUGE-2:", results['rouge2'])
print("ROUGE-L:", results['rougeL'])
print("ROUGE-Lsum:", results['rougeLsum'])


ROUGE Scores:
ROUGE-1: 0.36774779748743025
ROUGE-2: 0.1667095171355023
ROUGE-L: 0.235075239718949
ROUGE-Lsum: 0.23484841763519826


In [3]:
import pandas as pd
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu

# Load BLEU score function from NLTK
def compute_bleu_score(prediction, references):
    return sentence_bleu(references, prediction)

# Read data from CSV file
df = pd.read_csv("../Dataset/final_q3.csv")

# Extract actual and predicted values from the DataFrame
predictions = df["Predicted"].tolist()
references_list = df["Actual"].apply(lambda x: [reference.strip() for reference in x.split(',')]).tolist()

# Compute BLEU scores for each pair of actual and predicted values
bleu_scores = [compute_bleu_score(prediction, references) for prediction, references in zip(predictions, references_list)]

# Create a new DataFrame to store the BLEU scores
output_df = pd.DataFrame({"Actual": df["Actual"], "Predicted": predictions, "BLEU Score": bleu_scores})

# Save the BLEU scores to a new CSV file
output_df.to_csv("bleu_scores_output.csv", index=False)

# Calculate the average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print the average BLEU score
print("Average BLEU Score:", average_bleu_score)


Average BLEU Score: 0.2909516874985539
