In [3]:
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Workbook that this cell scores; it is read into a DataFrame in one go
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(io=input_file)

# Accumulator for the per-row N-gram scores (one entry per DataFrame row)
ngram_scores = list()

# Define a function to compute N-gram overlap
def ngram_overlap(text1, text2, n=2):
    """Score how many word n-grams two texts have in common.

    Both texts are vectorized together with scikit-learn's ``CountVectorizer``
    using word n-grams of exactly size ``n``. The score is the dot product of
    the two n-gram count vectors, i.e. every n-gram present in both texts
    contributes (count in text1) * (count in text2).

    NOTE: this is a raw product of counts, not a clipped or normalized
    overlap, so it is unbounded and grows with the length of the texts.

    Args:
        text1: First text, as a string.
        text2: Second text, as a string.
        n: Size of the word n-grams; used as ``ngram_range=(n, n)``.

    Returns:
        The integer score, or 0 when neither text produces any n-gram.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` for any failure other
            than an empty vocabulary.
    """
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    try:
        ngram_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError as exc:
        # fit_transform raises "empty vocabulary" when neither text yields a
        # single n-gram (e.g. both are one word long); nothing is shared then,
        # so report 0 instead of aborting the caller's loop.
        if "empty vocabulary" in str(exc):
            return 0
        raise
    # Element-wise product of the two sparse count rows, summed: the same
    # value as their dot product, without converting them to dense arrays.
    return int(ngram_matrix[0].multiply(ngram_matrix[1]).sum())

# Score every row: compare the 'Abstract Summary' column with the 'Abstract'
# column, showing a progress bar while doing so.
non_ascii = re.compile(r'[^\x00-\x7F]+')  # compiled once, reused for every row
row_pairs = zip(df['Abstract'], df['Abstract Summary'])
for abstract, summary in tqdm(row_pairs, total=len(df), desc="Calculating Scores"):
    # Drop non-ASCII characters from the abstract. str() also turns a missing
    # value (NaN) into the literal text "nan".
    # NOTE(review): the summary is not cleaned the same way, so non-ASCII
    # characters can make otherwise identical words differ - confirm intent.
    original_text = non_ascii.sub('', str(abstract))
    generated_summary = str(summary)

    # Bi-gram score between the generated summary and the original text
    ngram_scores.append(ngram_overlap(generated_summary, original_text, 2))

# Add the scores to the DataFrame
df['N-gram_Score'] = ngram_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary_Scores_Ngram_abstract_gpt.xlsx"
df.to_excel(output_file, index=False)

# Print the average score; an empty sheet has no scores to average, so report
# nan instead of dividing by zero.
average_score = sum(ngram_scores) / len(ngram_scores) if ngram_scores else float('nan')
print("Average N-gram Score:", average_score)


Calculating Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:01<00:00, 952.34it/s]


Average N-gram Score: 70.64294478527607


In [4]:
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Workbook that this cell scores; it is read into a DataFrame in one go
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(io=input_file)

# Accumulator for the per-row N-gram scores (one entry per DataFrame row)
ngram_scores = list()

# Define a function to compute N-gram overlap
def ngram_overlap(text1, text2, n=2):
    """Score how many word n-grams two texts have in common.

    Both texts are vectorized together with scikit-learn's ``CountVectorizer``
    using word n-grams of exactly size ``n``. The score is the dot product of
    the two n-gram count vectors, i.e. every n-gram present in both texts
    contributes (count in text1) * (count in text2).

    NOTE: this is a raw product of counts, not a clipped or normalized
    overlap, so it is unbounded and grows with the length of the texts.

    Args:
        text1: First text, as a string.
        text2: Second text, as a string.
        n: Size of the word n-grams; used as ``ngram_range=(n, n)``.

    Returns:
        The integer score, or 0 when neither text produces any n-gram.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` for any failure other
            than an empty vocabulary.
    """
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    try:
        ngram_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError as exc:
        # fit_transform raises "empty vocabulary" when neither text yields a
        # single n-gram (e.g. both are one word long); nothing is shared then,
        # so report 0 instead of aborting the caller's loop.
        if "empty vocabulary" in str(exc):
            return 0
        raise
    # Element-wise product of the two sparse count rows, summed: the same
    # value as their dot product, without converting them to dense arrays.
    return int(ngram_matrix[0].multiply(ngram_matrix[1]).sum())

# Score every row: compare the 'Claims Summary' column with the 'Claims'
# column, showing a progress bar while doing so.
non_ascii = re.compile(r'[^\x00-\x7F]+')  # compiled once, reused for every row
row_pairs = zip(df['Claims'], df['Claims Summary'])
for claims, summary in tqdm(row_pairs, total=len(df), desc="Calculating Scores"):
    # Drop non-ASCII characters from the claims. str() also turns a missing
    # value (NaN) into the literal text "nan".
    # NOTE(review): the summary is not cleaned the same way, so non-ASCII
    # characters can make otherwise identical words differ - confirm intent.
    original_text = non_ascii.sub('', str(claims))
    generated_summary = str(summary)

    # Bi-gram score between the generated summary and the original text
    ngram_scores.append(ngram_overlap(generated_summary, original_text, 2))

# Add the scores to the DataFrame
df['N-gram_Score'] = ngram_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary_Scores_Ngram_claims_gpt.xlsx"
df.to_excel(output_file, index=False)

# Print the average score; an empty sheet has no scores to average, so report
# nan instead of dividing by zero.
average_score = sum(ngram_scores) / len(ngram_scores) if ngram_scores else float('nan')
print("Average N-gram Score:", average_score)


Calculating Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:04<00:00, 362.68it/s]


Average N-gram Score: 2998.90245398773


In [5]:
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Workbook that this cell scores; it is read into a DataFrame in one go
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(io=input_file)

# Accumulator for the per-row N-gram scores (one entry per DataFrame row)
ngram_scores = list()

# Define a function to compute N-gram overlap
def ngram_overlap(text1, text2, n=2):
    """Score how many word n-grams two texts have in common.

    Both texts are vectorized together with scikit-learn's ``CountVectorizer``
    using word n-grams of exactly size ``n``. The score is the dot product of
    the two n-gram count vectors, i.e. every n-gram present in both texts
    contributes (count in text1) * (count in text2).

    NOTE: this is a raw product of counts, not a clipped or normalized
    overlap, so it is unbounded and grows with the length of the texts.

    Args:
        text1: First text, as a string.
        text2: Second text, as a string.
        n: Size of the word n-grams; used as ``ngram_range=(n, n)``.

    Returns:
        The integer score, or 0 when neither text produces any n-gram.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` for any failure other
            than an empty vocabulary.
    """
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    try:
        ngram_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError as exc:
        # fit_transform raises "empty vocabulary" when neither text yields a
        # single n-gram (e.g. both are one word long); nothing is shared then,
        # so report 0 instead of aborting the caller's loop.
        if "empty vocabulary" in str(exc):
            return 0
        raise
    # Element-wise product of the two sparse count rows, summed: the same
    # value as their dot product, without converting them to dense arrays.
    return int(ngram_matrix[0].multiply(ngram_matrix[1]).sum())

# Score every row: compare the 'Summary(Abstract Summary+ Claim Summary)'
# column with the 'Abstract Summary' and 'Claims Summary' columns joined by a
# space, showing a progress bar while doing so.
non_ascii = re.compile(r'[^\x00-\x7F]+')  # compiled once, reused for every row
row_triples = zip(
    df['Abstract Summary'],
    df['Claims Summary'],
    df['Summary(Abstract Summary+ Claim Summary)'],
)
for abstract_summary, claims_summary, combined_summary in tqdm(
        row_triples, total=len(df), desc="Calculating Scores"):
    # Drop non-ASCII characters from both reference texts. str() also turns a
    # missing value (NaN) into the literal text "nan".
    abstract_summary = non_ascii.sub('', str(abstract_summary))
    claims_summary = non_ascii.sub('', str(claims_summary))

    # The reference text is the two cleaned summaries separated by one space.
    # NOTE(review): the combined summary is not cleaned the same way, so
    # non-ASCII characters can make otherwise identical words differ -
    # confirm intent.
    original_text = abstract_summary + ' ' + claims_summary
    generated_summary = str(combined_summary)

    # Bi-gram score between the generated summary and the reference text
    ngram_scores.append(ngram_overlap(generated_summary, original_text, 2))

# Add the scores to the DataFrame
df['N-gram_Score'] = ngram_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary(Abstract Summary+ Claim Summary)_Ngram_gpt.xlsx"
df.to_excel(output_file, index=False)

# Print the average score; an empty sheet has no scores to average, so report
# nan instead of dividing by zero.
average_score = sum(ngram_scores) / len(ngram_scores) if ngram_scores else float('nan')
print("Average N-gram Score:", average_score)


Calculating Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:02<00:00, 607.32it/s]


Average N-gram Score: 309.44233128834355


In [6]:
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Workbook that this cell scores; it is read into a DataFrame in one go
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(io=input_file)

# Accumulator for the per-row N-gram scores (one entry per DataFrame row)
ngram_scores = list()

# Define a function to compute N-gram overlap
def ngram_overlap(text1, text2, n=2):
    """Score how many word n-grams two texts have in common.

    Both texts are vectorized together with scikit-learn's ``CountVectorizer``
    using word n-grams of exactly size ``n``. The score is the dot product of
    the two n-gram count vectors, i.e. every n-gram present in both texts
    contributes (count in text1) * (count in text2).

    NOTE: this is a raw product of counts, not a clipped or normalized
    overlap, so it is unbounded and grows with the length of the texts.

    Args:
        text1: First text, as a string.
        text2: Second text, as a string.
        n: Size of the word n-grams; used as ``ngram_range=(n, n)``.

    Returns:
        The integer score, or 0 when neither text produces any n-gram.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` for any failure other
            than an empty vocabulary.
    """
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    try:
        ngram_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError as exc:
        # fit_transform raises "empty vocabulary" when neither text yields a
        # single n-gram (e.g. both are one word long); nothing is shared then,
        # so report 0 instead of aborting the caller's loop.
        if "empty vocabulary" in str(exc):
            return 0
        raise
    # Element-wise product of the two sparse count rows, summed: the same
    # value as their dot product, without converting them to dense arrays.
    return int(ngram_matrix[0].multiply(ngram_matrix[1]).sum())

# Score every row: compare the
# 'Summary of Summary(Abstract Summary+ Claim Summary)' column with the
# 'Summary(Abstract Summary+ Claim Summary)' column, showing a progress bar
# while doing so.
non_ascii = re.compile(r'[^\x00-\x7F]+')  # compiled once, reused for every row
row_pairs = zip(
    df['Summary(Abstract Summary+ Claim Summary)'],
    df['Summary of Summary(Abstract Summary+ Claim Summary)'],
)
for combined_summary, summary in tqdm(row_pairs, total=len(df), desc="Calculating Scores"):
    # Drop non-ASCII characters from the reference text. str() also turns a
    # missing value (NaN) into the literal text "nan".
    # NOTE(review): the summary-of-summary is not cleaned the same way, so
    # non-ASCII characters can make otherwise identical words differ -
    # confirm intent.
    original_text = non_ascii.sub('', str(combined_summary))
    generated_summary = str(summary)

    # Bi-gram score between the generated summary and the reference text
    ngram_scores.append(ngram_overlap(generated_summary, original_text, 2))

# Add the scores to the DataFrame
df['N-gram_Score'] = ngram_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary of Summary(Abstract Summary+ Claim Summary)_Ngram_gpt.xlsx"
df.to_excel(output_file, index=False)

# Print the average score; an empty sheet has no scores to average, so report
# nan instead of dividing by zero.
average_score = sum(ngram_scores) / len(ngram_scores) if ngram_scores else float('nan')
print("Average N-gram Score:", average_score)


Calculating Scores: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:01<00:00, 1009.71it/s]


Average N-gram Score: 8.634355828220858


In [7]:
import pandas as pd
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Workbook that this cell scores; it is read into a DataFrame in one go
input_file = "GPT3.5_All_Summaries_Mergedd.xlsx"
df = pd.read_excel(io=input_file)

# Accumulator for the per-row N-gram scores (one entry per DataFrame row)
ngram_scores = list()

# Define a function to compute N-gram overlap
def ngram_overlap(text1, text2, n=2):
    """Score how many word n-grams two texts have in common.

    Both texts are vectorized together with scikit-learn's ``CountVectorizer``
    using word n-grams of exactly size ``n``. The score is the dot product of
    the two n-gram count vectors, i.e. every n-gram present in both texts
    contributes (count in text1) * (count in text2).

    NOTE: this is a raw product of counts, not a clipped or normalized
    overlap, so it is unbounded and grows with the length of the texts.

    Args:
        text1: First text, as a string.
        text2: Second text, as a string.
        n: Size of the word n-grams; used as ``ngram_range=(n, n)``.

    Returns:
        The integer score, or 0 when neither text produces any n-gram.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` for any failure other
            than an empty vocabulary.
    """
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    try:
        ngram_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError as exc:
        # fit_transform raises "empty vocabulary" when neither text yields a
        # single n-gram (e.g. both are one word long); nothing is shared then,
        # so report 0 instead of aborting the caller's loop.
        if "empty vocabulary" in str(exc):
            return 0
        raise
    # Element-wise product of the two sparse count rows, summed: the same
    # value as their dot product, without converting them to dense arrays.
    return int(ngram_matrix[0].multiply(ngram_matrix[1]).sum())

# Score every row: compare the 'Summary Of (Abstrct+Claim) As single input'
# column with the 'Abstract' and 'Claims' columns joined by a space, showing a
# progress bar while doing so.
non_ascii = re.compile(r'[^\x00-\x7F]+')  # compiled once, reused for every row
row_triples = zip(
    df['Abstract'],
    df['Claims'],
    df['Summary Of (Abstrct+Claim) As single input'],
)
for abstract, claims, summary in tqdm(row_triples, total=len(df), desc="Calculating Scores"):
    # Drop non-ASCII characters from both source texts. str() also turns a
    # missing value (NaN) into the literal text "nan".
    abstract = non_ascii.sub('', str(abstract))
    claims = non_ascii.sub('', str(claims))

    # The original text is the cleaned abstract and claims separated by one
    # space.
    # NOTE(review): the summary is not cleaned the same way, so non-ASCII
    # characters can make otherwise identical words differ - confirm intent.
    original_text = abstract + ' ' + claims
    generated_summary = str(summary)

    # Bi-gram score between the generated summary and the original text
    ngram_scores.append(ngram_overlap(generated_summary, original_text, 2))

# Add the scores to the DataFrame
df['N-gram_Score'] = ngram_scores

# Save the updated DataFrame to a new Excel file
output_file = "Summary Of (Abstrct+Claim) As single input_Ngram_gpt.xlsx"
df.to_excel(output_file, index=False)

# Print the average score; an empty sheet has no scores to average, so report
# nan instead of dividing by zero.
average_score = sum(ngram_scores) / len(ngram_scores) if ngram_scores else float('nan')
print("Average N-gram Score:", average_score)


Calculating Scores: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1630/1630 [00:04<00:00, 378.23it/s]


Average N-gram Score: 942.9846625766871
