In [1]:
!pip install transformers torch



In [2]:
import pandas as pd
from tqdm import tqdm
import torch
import re
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Read the spreadsheet with the source texts and their summaries
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(input_file)

# Accumulator for the per-row scores.
# NOTE(review): the name says "SummaC", but the scores appended to this list are
# cosine similarities between BERT embeddings - confirm the naming is intended.
summac_scores = list()

# Prefer the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT tokenizer and encoder; the encoder is placed on the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to calculate BERT embeddings
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input is
    tokenized with truncation at 512 tokens, so anything beyond that limit does
    not contribute to the embedding.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array: ``last_hidden_state`` averaged over dim 1 (the token
        axis), moved to the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    # Inference only: disable autograd so no gradient graph is built (and kept
    # alive) for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).cpu().numpy()

# Score every row: cosine similarity between the embedding of the abstract and
# the embedding of its generated summary.
skipped_rows = 0  # rows where the abstract or the summary is missing
for index, row in tqdm(df.iterrows(), total=len(df), desc="Calculating Scores"):
    abstract = row['Abstract']
    generated_summary = row['Abstract Summary']

    # A missing cell (NaN/None) would be turned into the literal text "nan" by
    # str() and produce a meaningless score; record NaN for the row instead.
    if pd.isna(abstract) or pd.isna(generated_summary):
        summac_scores.append(np.nan)
        skipped_rows += 1
        continue

    # Strip non-ASCII characters from the abstract (the summary is used as is)
    abstract = re.sub(r'[^\x00-\x7F]+', '', str(abstract))
    generated_summary = str(generated_summary)

    # Get embeddings for original text and generated summary
    original_text_embedding = get_bert_embeddings(abstract)
    generated_summary_embedding = get_bert_embeddings(generated_summary)

    # cosine_similarity returns a matrix; [0][0] picks the single pairwise value
    cos_sim = cosine_similarity(original_text_embedding, generated_summary_embedding)[0][0]

    summac_scores.append(cos_sim)

# One score per row, in row order (NaN where text was missing)
df['SummaC_Score'] = summac_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary_SummaC_score_abstract_gpt.xlsx"
df.to_excel(output_file, index=False)

# Report skipped rows, then the average over the rows that were actually scored
if skipped_rows:
    print("Rows skipped (missing text):", skipped_rows)
print("Average SummaC Score:", np.nanmean(summac_scores))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Calculating Scores: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:58<00:0

Average SummaC Score: 0.9453633


In [3]:
import pandas as pd
from tqdm import tqdm
import torch
import re
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Read the spreadsheet with the source texts and their summaries
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(input_file)

# Accumulator for the per-row scores.
# NOTE(review): the name says "SummaC", but the scores appended to this list are
# cosine similarities between BERT embeddings - confirm the naming is intended.
summac_scores = list()

# Prefer the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT tokenizer and encoder; the encoder is placed on the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to calculate BERT embeddings
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input is
    tokenized with truncation at 512 tokens, so anything beyond that limit does
    not contribute to the embedding.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array: ``last_hidden_state`` averaged over dim 1 (the token
        axis), moved to the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    # Inference only: disable autograd so no gradient graph is built (and kept
    # alive) for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).cpu().numpy()

# Score every row: cosine similarity between the embedding of the claims text
# and the embedding of its generated summary.
skipped_rows = 0  # rows where the claims text or the summary is missing
for index, row in tqdm(df.iterrows(), total=len(df), desc="Calculating Scores"):
    claims = row['Claims']
    generated_summary = row['Claims Summary']

    # A missing cell (NaN/None) would be turned into the literal text "nan" by
    # str() and produce a meaningless score; record NaN for the row instead.
    if pd.isna(claims) or pd.isna(generated_summary):
        summac_scores.append(np.nan)
        skipped_rows += 1
        continue

    # Strip non-ASCII characters from the claims text (the summary is used as is)
    claims = re.sub(r'[^\x00-\x7F]+', '', str(claims))
    generated_summary = str(generated_summary)

    # Get embeddings for original text and generated summary
    original_text_embedding = get_bert_embeddings(claims)
    generated_summary_embedding = get_bert_embeddings(generated_summary)

    # cosine_similarity returns a matrix; [0][0] picks the single pairwise value
    cos_sim = cosine_similarity(original_text_embedding, generated_summary_embedding)[0][0]

    summac_scores.append(cos_sim)

# One score per row, in row order (NaN where text was missing)
df['SummaC_Score'] = summac_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary_SummaC_score_Claims_gpt.xlsx"
df.to_excel(output_file, index=False)

# Report skipped rows, then the average over the rows that were actually scored
if skipped_rows:
    print("Rows skipped (missing text):", skipped_rows)
print("Average SummaC Score:", np.nanmean(summac_scores))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Calculating Scores: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [02:04<00:0

Average SummaC Score: 0.9404963


In [4]:
import pandas as pd
from tqdm import tqdm
import torch
import re
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Read the spreadsheet with the source texts and their summaries
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(input_file)

# Accumulator for the per-row scores.
# NOTE(review): the name says "SummaC", but the scores appended to this list are
# cosine similarities between BERT embeddings - confirm the naming is intended.
summac_scores = list()

# Prefer the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT tokenizer and encoder; the encoder is placed on the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to calculate BERT embeddings
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input is
    tokenized with truncation at 512 tokens, so anything beyond that limit does
    not contribute to the embedding.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array: ``last_hidden_state`` averaged over dim 1 (the token
        axis), moved to the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    # Inference only: disable autograd so no gradient graph is built (and kept
    # alive) for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).cpu().numpy()

# Score every row: the 'Abstract Summary' and 'Claims Summary' columns are
# joined into one reference text and compared against the
# 'Summary(Abstract Summary+ Claim Summary)' column.
skipped_rows = 0  # rows with no reference text or no summary
for index, row in tqdm(df.iterrows(), total=len(df), desc="Calculating Scores"):
    abstract = row['Abstract Summary']
    claims = row['Claims Summary']
    generated_summary = row['Summary(Abstract Summary+ Claim Summary)']

    # Keep only the parts that are present (a missing cell would otherwise be
    # stringified to the literal text "nan") and strip non-ASCII characters.
    source_parts = [
        re.sub(r'[^\x00-\x7F]+', '', str(part))
        for part in (abstract, claims)
        if not pd.isna(part)
    ]

    # Nothing to compare: record NaN instead of scoring placeholder text
    if not source_parts or pd.isna(generated_summary):
        summac_scores.append(np.nan)
        skipped_rows += 1
        continue

    # With both parts present this equals abstract + ' ' + claims.
    # get_bert_embeddings truncates at 512 tokens, so a long combined text is cut.
    original_text = ' '.join(source_parts)
    generated_summary = str(generated_summary)

    # Get embeddings for the combined reference text and the generated summary
    original_text_embedding = get_bert_embeddings(original_text)
    generated_summary_embedding = get_bert_embeddings(generated_summary)

    # cosine_similarity returns a matrix; [0][0] picks the single pairwise value
    cos_sim = cosine_similarity(original_text_embedding, generated_summary_embedding)[0][0]

    summac_scores.append(cos_sim)

# One score per row, in row order (NaN where text was missing)
df['SummaC_Score'] = summac_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary(Abstract Summary+ Claim Summary)_SummaC_gpt.xlsx"
df.to_excel(output_file, index=False)

# Report skipped rows, then the average over the rows that were actually scored
if skipped_rows:
    print("Rows skipped (missing text):", skipped_rows)
print("Average SummaC Score:", np.nanmean(summac_scores))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Calculating Scores: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [01:17<00:0

Average SummaC Score: 0.9511976


In [5]:
import pandas as pd
from tqdm import tqdm
import torch
import re
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Read the spreadsheet with the source texts and their summaries
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(input_file)

# Accumulator for the per-row scores.
# NOTE(review): the name says "SummaC", but the scores appended to this list are
# cosine similarities between BERT embeddings - confirm the naming is intended.
summac_scores = list()

# Prefer the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT tokenizer and encoder; the encoder is placed on the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to calculate BERT embeddings
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input is
    tokenized with truncation at 512 tokens, so anything beyond that limit does
    not contribute to the embedding.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array: ``last_hidden_state`` averaged over dim 1 (the token
        axis), moved to the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    # Inference only: disable autograd so no gradient graph is built (and kept
    # alive) for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).cpu().numpy()

# Score every row: the 'Summary(Abstract Summary+ Claim Summary)' column is the
# reference text, compared against the
# 'Summary of Summary(Abstract Summary+ Claim Summary)' column.
skipped_rows = 0  # rows where the reference text or the summary is missing
for index, row in tqdm(df.iterrows(), total=len(df), desc="Calculating Scores"):
    combined_summary = row['Summary(Abstract Summary+ Claim Summary)']
    generated_summary = row['Summary of Summary(Abstract Summary+ Claim Summary)']

    # A missing cell (NaN/None) would be turned into the literal text "nan" by
    # str() and produce a meaningless score; record NaN for the row instead.
    if pd.isna(combined_summary) or pd.isna(generated_summary):
        summac_scores.append(np.nan)
        skipped_rows += 1
        continue

    # Strip non-ASCII characters from the reference text (the summary is used as is)
    combined_summary = re.sub(r'[^\x00-\x7F]+', '', str(combined_summary))
    generated_summary = str(generated_summary)

    # Get embeddings for the reference text and the generated summary
    original_text_embedding = get_bert_embeddings(combined_summary)
    generated_summary_embedding = get_bert_embeddings(generated_summary)

    # cosine_similarity returns a matrix; [0][0] picks the single pairwise value
    cos_sim = cosine_similarity(original_text_embedding, generated_summary_embedding)[0][0]

    summac_scores.append(cos_sim)

# One score per row, in row order (NaN where text was missing)
df['SummaC_Score'] = summac_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary of Summary(Abstract Summary+ Claim Summary)_SummaC_gpt.xlsx"
df.to_excel(output_file, index=False)

# Report skipped rows, then the average over the rows that were actually scored
if skipped_rows:
    print("Rows skipped (missing text):", skipped_rows)
print("Average SummaC Score:", np.nanmean(summac_scores))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Calculating Scores: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:56<00:0

Average SummaC Score: 0.55537665


In [6]:
import pandas as pd
from tqdm import tqdm
import torch
import re
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Read the spreadsheet with the source texts and their summaries
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(input_file)

# Accumulator for the per-row scores.
# NOTE(review): the name says "SummaC", but the scores appended to this list are
# cosine similarities between BERT embeddings - confirm the naming is intended.
summac_scores = list()

# Prefer the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT tokenizer and encoder; the encoder is placed on the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to calculate BERT embeddings
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input is
    tokenized with truncation at 512 tokens, so anything beyond that limit does
    not contribute to the embedding.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array: ``last_hidden_state`` averaged over dim 1 (the token
        axis), moved to the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    # Inference only: disable autograd so no gradient graph is built (and kept
    # alive) for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'].mean(dim=1).cpu().numpy()

# Score every row: the 'Abstract' and 'Claims' columns are joined into one
# reference text and compared against the
# 'Summary Of (Abstrct+Claim) As single input' column.
skipped_rows = 0  # rows with no reference text or no summary
for index, row in tqdm(df.iterrows(), total=len(df), desc="Calculating Scores"):
    abstract = row['Abstract']
    claims = row['Claims']
    generated_summary = row['Summary Of (Abstrct+Claim) As single input']

    # Keep only the parts that are present (a missing cell would otherwise be
    # stringified to the literal text "nan") and strip non-ASCII characters.
    source_parts = [
        re.sub(r'[^\x00-\x7F]+', '', str(part))
        for part in (abstract, claims)
        if not pd.isna(part)
    ]

    # Nothing to compare: record NaN instead of scoring placeholder text
    if not source_parts or pd.isna(generated_summary):
        summac_scores.append(np.nan)
        skipped_rows += 1
        continue

    # With both parts present this equals abstract + ' ' + claims.
    # get_bert_embeddings truncates at 512 tokens, so a long combined text is cut.
    original_text = ' '.join(source_parts)
    generated_summary = str(generated_summary)

    # Get embeddings for original text and generated summary
    original_text_embedding = get_bert_embeddings(original_text)
    generated_summary_embedding = get_bert_embeddings(generated_summary)

    # cosine_similarity returns a matrix; [0][0] picks the single pairwise value
    cos_sim = cosine_similarity(original_text_embedding, generated_summary_embedding)[0][0]

    summac_scores.append(cos_sim)

# One score per row, in row order (NaN where text was missing)
df['SummaC_Score'] = summac_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary Of (Abstrct+Claim) As single input_SummaC_gpt.xlsx"
df.to_excel(output_file, index=False)

# Report skipped rows, then the average over the rows that were actually scored
if skipped_rows:
    print("Rows skipped (missing text):", skipped_rows)
print("Average SummaC Score:", np.nanmean(summac_scores))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Calculating Scores: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [02:00<00:0

Average SummaC Score: 0.91378164
