In [3]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import spacy
import nltk
# Fetch every NLTK resource this notebook relies on, so it runs on a fresh
# environment: 'stopwords' backs the stop-word filter, and word_tokenize needs
# the Punkt tokenizer data (packaged as 'punkt_tab' in newer NLTK releases and
# as 'punkt' in older ones). Already-present packages are skipped by NLTK.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
class SummarizationBenchmark:
    """Collection of metrics comparing a summary against its source document.

    The constructor loads a transformer encoder, a ROUGE scorer, a spaCy
    pipeline and the English stop-word set once; the individual methods then
    compute embedding similarity, ROUGE, keyword/entity overlap and LDA topics.
    """

    def __init__(self, model_name="allenai/scibert_scivocab_uncased"):
        """Load all models and resources used by the metrics.

        Args:
            model_name: Model identifier handed to ``from_pretrained`` for both
                the encoder and its tokenizer.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.nlp = spacy.load("en_core_web_sm")
        # Built once so that per-token stop-word checks are cheap set lookups.
        self.stop_words = set(stopwords.words('english'))

    def get_embedding(self, text):
        """Encode ``text`` and mean-pool the encoder's last hidden state.

        Args:
            text: Input passed to the tokenizer; anything beyond 512 tokens is
                truncated.

        Returns:
            A 2-D numpy array with one pooled embedding per input text.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state

        attention = inputs.get("attention_mask")
        if attention is None:
            # No mask available: fall back to a plain mean over all positions.
            return hidden.mean(dim=1).cpu().numpy()

        # Average over real tokens only. Padding positions (mask == 0) would
        # otherwise skew the mean when texts of different lengths are batched.
        mask = attention.unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()

    def semantic_similarity(self, doc, summary):
        """Return the cosine similarity between the pooled embeddings of
        ``doc`` and ``summary``."""
        doc_emb = self.get_embedding(doc)
        sum_emb = self.get_embedding(summary)
        return cosine_similarity(doc_emb, sum_emb)[0][0]

    def calculate_rouge(self, doc, summary):
        """Score ``summary`` against ``doc`` with the configured ROUGE scorer.

        ``doc`` is passed as the scorer's first argument and ``summary`` as the
        second.

        Returns:
            Dict mapping each score name to its F-measure.
        """
        scores = self.rouge_scorer.score(doc, summary)
        return {key: value.fmeasure for key, value in scores.items()}

    def extract_keywords(self, text):
        """Return the text of every alphabetic, non-stop-word spaCy token, in
        order of appearance (duplicates are kept)."""
        return [token.text for token in self.nlp(text) if not token.is_stop and token.is_alpha]

    def preprocess_text(self, text):
        """Lower-case and tokenize ``text``, keeping alphabetic tokens longer
        than three characters that are not English stop words.

        Note: ``word_tokenize`` needs NLTK's tokenizer data to be installed.
        """
        tokens = word_tokenize(text.lower())
        # Use the set cached in __init__ instead of reloading the stop-word
        # list from disk (and scanning it linearly) for every single token.
        return [
            token for token in tokens
            if token.isalpha() and len(token) > 3 and token not in self.stop_words
        ]

    def topic_modeling(self, doc, summary, num_topics=5):
        """Fit an LDA model on the document/summary pair.

        Args:
            doc: Source document text.
            summary: Summary text.
            num_topics: Number of LDA components to fit.

        Returns:
            Tuple ``(doc_topics, sum_topics, topics)``: the topic distribution
            of the document, the topic distribution of the summary, and one
            ``"Topic i: w1, w2, ..."`` string per topic listing up to its ten
            highest-weighted terms.

        Raises:
            ValueError: Propagated from the vectorizer when no terms survive
                preprocessing (empty vocabulary).
        """
        # The vectorizer expects strings, so join the filtered tokens back up.
        texts = [' '.join(self.preprocess_text(text)) for text in (doc, summary)]

        # max_df must be the float 1.0 ("terms in up to 100% of the texts").
        # The integer 1 means "terms in at most ONE text", which would throw
        # away exactly the vocabulary that document and summary share.
        vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')
        vectorized_data = vectorizer.fit_transform(texts)

        lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=100)
        lda_model.fit(vectorized_data)

        # One transform yields both rows: row 0 = document, row 1 = summary.
        doc_topics, sum_topics = lda_model.transform(vectorized_data)

        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for index, weights in enumerate(lda_model.components_):
            # argsort is ascending; the reversed slice takes the ten largest.
            top_words = [feature_names[j] for j in weights.argsort()[:-11:-1]]
            topics.append(f"Topic {index}: {', '.join(top_words)}")

        return doc_topics, sum_topics, topics

    def entity_overlap(self, doc, summary):
        """Return the fraction of distinct entity strings in ``doc`` that also
        occur among the entities of ``summary`` (0 when ``doc`` has none)."""
        doc_entities = {ent.text for ent in self.nlp(doc).ents}
        sum_entities = {ent.text for ent in self.nlp(summary).ents}
        if not doc_entities:
            return 0
        return len(doc_entities & sum_entities) / len(doc_entities)

    def benchmark(self, document, summary):
        """Run the enabled metrics for one document/summary pair.

        Returns:
            Dict of metric name -> value. Only ``'semantic_similarity'`` is
            computed at present; the other metrics are disabled below.
        """
        results = {}
        results['semantic_similarity'] = self.semantic_similarity(document, summary)
        # Disabled metrics, kept for reference. Uncommenting a line adds the
        # corresponding key to the returned dict.
        # NOTE(review): 'conciseness' and 'keyword_overlap' divide by a count
        # taken from the document and would fail on an empty document;
        # 'topic_similarity' sums over every (document topic, summary topic)
        # pair rather than matching topics index by index - confirm intent.
        # results['rouge_scores'] = self.calculate_rouge(document, summary)
        # results['conciseness'] = len(summary.split()) / len(document.split())
        # results['keyword_overlap'] = len(set(self.extract_keywords(document)) & set(self.extract_keywords(summary))) / len(set(self.extract_keywords(document)))
        # doc_topics, sum_topics, topics = self.topic_modeling(document, summary)
        # results['topic_similarity'] = sum(min(dt, st) for dt in doc_topics for st in sum_topics)
        # results['entity_overlap'] = self.entity_overlap(document, summary)
        # results['topics'] = topics
        return results

In [25]:
# Example usage: run the benchmark on one placeholder document/summary pair.
document, summary = "document here", "summary here"
benchmark = SummarizationBenchmark()
results = benchmark.benchmark(document, summary)
def score_metrics(m, num_topics=5):
    """Combine benchmark metrics into one weighted score clamped to [0, 1].

    Weights: semantic_similarity 0.35, rouge_scores 0.15 (mean of the dict's
    values), conciseness 0.1, topic_similarity 0.25 (divided by
    ``num_topics``), keyword_overlap 0.05, entity_overlap 0.1.

    Metrics absent from ``m`` are skipped and the remaining weights are
    renormalised, so a partial result dict still yields a score on the same
    0-1 scale instead of raising ``KeyError``. When all six metrics are
    present the result is the plain weighted sum.

    Args:
        m: Dict of metric name -> value; unknown keys are ignored.
        num_topics: Divisor applied to ``topic_similarity``.

    Returns:
        The combined score as a float in [0, 1]; 0.0 if no known metric is
        present.
    """
    def mean_of_values(scores):
        return sum(scores.values()) / len(scores) if scores else 0.0

    # metric key -> (weight, function turning the raw entry into a number)
    components = {
        'semantic_similarity': (0.35, float),
        'rouge_scores': (0.15, mean_of_values),
        'conciseness': (0.1, float),
        'topic_similarity': (0.25, lambda value: float(value) / num_topics),
        'keyword_overlap': (0.05, float),
        'entity_overlap': (0.1, float),
    }
    available = [
        (weight, convert(m[key]))
        for key, (weight, convert) in components.items()
        if key in m
    ]
    if not available:
        return 0.0

    total = sum(weight * value for weight, value in available)
    # Only rescale when something is missing, so a complete dict is scored
    # exactly by the weighted sum.
    if len(available) < len(components):
        total /= sum(weight for weight, _ in available)
    return float(min(1.0, max(0.0, total)))
# Aggregate the example's metrics into a single score.
score_metrics(results)

{'semantic_similarity': np.float32(0.820957)}