In [2]:
!pip install keybert
!pip install bert-score

Collecting bert-score




  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
     ---------------------------------------- 61.1/61.1 kB 1.6 MB/s eta 0:00:00
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


# KeyBERT

In [3]:
import pandas as pd
import numpy as np
from keybert import KeyBERT
from bert_score import BERTScorer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [5]:
# Location of the serialized sentence-embedding model that a later cell loads.
model_sentece_path = "C:/Users/Krlozz/Documents/Tesis/TesisFinal/Model/sentence_bert_model"

# Article table used by the extraction loop, which reads the columns
# id_article, abstract, keywords and num_keywords from it.
article_clean = 'C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData/article_clean.csv'
data = pd.read_csv(filepath_or_buffer=article_clean)

In [7]:
# Load the sentence-embedding model object from disk; it is used both as the
# KeyBERT backend and to encode keyword lists for the cosine similarity.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
model_sentece = joblib.load(model_sentece_path)

In [8]:
# Keyword extractor that uses the locally loaded sentence-embedding model.
keybert_model = KeyBERT(model=model_sentece)

# BERTScore evaluator. With baseline rescaling enabled the scores are not
# confined to [0, 1]; the summary table at the end shows negative minima.
scorer = BERTScorer(
    model_type="bert-base-multilingual-cased",
    lang='en',
    rescale_with_baseline=True,
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [9]:
# One output row per article is appended to this list by the extraction loop.
results = list()

# Candidate keyphrases span one to three words (passed as keyphrase_ngram_range).
ngram_range = (1, 3)

In [11]:
# Extract keyphrases from every abstract with KeyBERT and compare them with the
# reference keywords using BERTScore (P/R/F1) and the cosine similarity of the
# mean keyword embeddings. Each article contributes one 7-element row:
# [id_article, keywords, generated_keywords, Precision, Recall, F1, simi_cosine].

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every row to a list left over from an earlier run.
results = []

for _, row in data.iterrows():
    article_id = row['id_article']
    abstract = row['abstract']
    # Missing keywords become "" and a missing count becomes 0, so the checks
    # below can rely on plain truthiness / equality.
    original_keywords = row['keywords'] if not pd.isna(row['keywords']) else ""
    n_keywords = row['num_keywords'] if not pd.isna(row['num_keywords']) else 0

    # No abstract to read, or no keywords requested: keep the article in the
    # output but leave the generated keywords and every metric empty.
    if pd.isna(abstract) or n_keywords == 0:
        results.append([article_id, original_keywords, None, None, None, None, None])
        continue

    # NOTE(review): use_maxsum and use_mmr are both enabled. They select
    # different diversification strategies, so presumably only one of them
    # takes effect (and nr_candidates / diversity each belong to one strategy)
    # -- confirm against the KeyBERT documentation and drop the unused flags.
    generated_keywords = keybert_model.extract_keywords(
        abstract,
        keyphrase_ngram_range=ngram_range,
        top_n=int(n_keywords),
        use_maxsum=True,
        use_mmr=True,
        nr_candidates=20,
        diversity=0.65
    )
    # Keep only the first element of each returned item (the phrase itself).
    generated_keywords = [kw[0] for kw in generated_keywords]

    # Score only when both sides are non-empty. If extraction produced no
    # phrases there is nothing to embed, and averaging an empty embedding array
    # would yield NaN / a shape the similarity call cannot handle.
    if original_keywords and generated_keywords:
        reference_keywords = original_keywords.split(", ")
        # BERTScore compares the two keyword lists as comma-joined strings.
        P, R, F1 = scorer.score([", ".join(generated_keywords)], [", ".join(reference_keywords)])
        P, R, F1 = P.item(), R.item(), F1.item()
        generated_embeddings = model_sentece.encode(generated_keywords)
        original_embeddings = model_sentece.encode(reference_keywords)
        # Collapse each keyword set to its mean embedding and compare the two
        # resulting vectors; [0][0] unwraps the 1x1 similarity matrix.
        simi_cosine = cosine_similarity(
            np.mean(generated_embeddings, axis=0).reshape(1, -1),
            np.mean(original_embeddings, axis=0).reshape(1, -1)
        )[0][0]
    else:
        P, R, F1, simi_cosine = None, None, None, None

    results.append([article_id, original_keywords, ", ".join(generated_keywords), P, R, F1, simi_cosine])

In [12]:
# Header for the rows collected in `results`; the order matches the
# 7-element lists appended by the extraction loop.
columns = [
    'id_article',
    'keywords',
    'generated_keywords',
    'Precision',
    'Recall',
    'F1',
    'simi_cosine',
]
results_df = pd.DataFrame(data=results, columns=columns)

In [13]:
# Show the per-article results table (the notebook view elides the middle rows).
results_df

Unnamed: 0,id_article,keywords,generated_keywords,Precision,Recall,F1,simi_cosine
0,85133492759,"audio signals design process, experimental des...","audio reactive nature, art design research, fo...",0.326585,0.414834,0.370737,0.663579
1,85133293730,"facility layout problem, indicators, resilienc...","areas business resilience, type weakness cause...",0.483017,0.522297,0.503239,0.753503
2,85132518705,"confidence distribution, confidence interval, ...","powerful confidence distributions, family cond...",0.366932,0.384881,0.376815,0.817440
3,85112575431,"competitiveness, corporate social responsibili...","green innovation competitiveness, corporate so...",0.386677,0.541415,0.462127,0.729455
4,85109263966,"food industry, global supply chain, supply cha...","influenced supplier sustainability, logics exi...",0.089249,0.271203,0.177125,0.616395
...,...,...,...,...,...,...,...
37673,12371997,,"produce allergenic pollen, grasses limited, so...",,,,
37674,84918742422,,"pigs histamine shock, complex protect guinea, ...",,,,
37675,34347185703,,"reagents consumed titrimetric, unavoidable con...",,,,
37676,33947340215,,"sulfated ash method, standardized time tempera...",,,,


In [14]:
# Persist the per-article results as CSV; the DataFrame index is not written.
output_path = "C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData/generated_keywords_articles.csv"
results_df.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file was written.
print(f"Resultados guardados en: {output_path}")

Resultados guardados en: C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData/generated_keywords_articles.csv


In [19]:
# Names of the numeric score columns in results_df.
# NOTE(review): this rebinds `columns`, which earlier in the notebook held the
# full seven-column header of results_df -- re-running that earlier cell
# changes what this name refers to.
columns = ['Precision', 'Recall', 'F1', 'simi_cosine']

In [20]:
# Score columns to summarize. They are listed here instead of being read from
# the global `columns`, because this notebook binds that name to two different
# lists (the full results header and the metric subset); depending on which
# cell ran last, the summary would otherwise aggregate the wrong columns.
metric_columns = ['Precision', 'Recall', 'F1', 'simi_cosine']

# Mean / min / max of each score column, ignoring the missing values of
# articles that were not scored. The display labels in "Metric" are in the
# same order as metric_columns.
metrics_final = {
    "Metric": ["Precision", "Recall", "F1 Score", "Cosine Similarity"],
    "Average Value": [results_df[column].mean(skipna=True) for column in metric_columns],
    "Minimum Value": [results_df[column].min(skipna=True) for column in metric_columns],
    "Maximum Value": [results_df[column].max(skipna=True) for column in metric_columns],
}

In [21]:
# Turn the metric summary dict into a table: one row per metric, one column per statistic.
summary_df = pd.DataFrame(metrics_final)

In [22]:
# Show the aggregate metrics table.
summary_df

Unnamed: 0,Metric,Average Value,Minimum Value,Maximum Value
0,Precision,0.230507,-0.375034,1.0
1,Recall,0.32288,-0.299159,1.0
2,F1 Score,0.275835,-0.265591,1.0
3,Cosine Similarity,0.635526,0.043009,1.0
