In [None]:
!pip install bertopic

# BERTopic

In [18]:
from bertopic import BERTopic
import pandas as pd
import joblib
import numpy as np

In [19]:
# All files used by this notebook live under a single project folder.
project_root = "C:/Users/Krlozz/Documents/Tesis/TesisFinal"
processed_dir = project_root + "/ProcessedData"
model_dir = project_root + "/Model"

# Tables (CSV).
data_path = processed_dir + "/data_groups.csv"
article_clean_path = processed_dir + "/article_clean.csv"
output_csv_path = processed_dir + "/group_recommendations.csv"

# Serialized models and fit outputs.
sentence_bert_model_path = model_dir + "/sentence_bert_model"
bertopic_model_path = model_dir + "/bertopic_model"
topics_path = model_dir + "/topics_TB.pkl"
probs_path = model_dir + "/probs_TB.pkl"

In [20]:
# Load the two input tables.
# `data`: one row per group; its 'keywords', 'index_novelty', 'id_group' and
#         'authors' columns are read by the scoring loop further down.
# `articles`: provides the 'abstract' column the topic model is fitted on.
data = pd.read_csv(data_path)
articles = pd.read_csv(article_clean_path)

In [21]:
# Restore the sentence-embedding model from disk. joblib.load unpickles the
# file, so only load artefacts that come from a trusted source.
model_sentence = joblib.load(sentence_bert_model_path)

In [5]:
# Documents for topic modelling: every non-null abstract as a plain list.
# NOTE(review): if any abstract is null, list positions no longer line up with
# the row positions of `articles`.
abstracts = articles['abstract'].dropna().tolist()

In [6]:
# Configure the topic model so that it embeds documents with the sentence
# model loaded above and also returns per-document topic probabilities.
# NOTE(review): no random seed is set anywhere in this cell; presumably the
# fitted topics can differ between runs — confirm whether that matters here.
model = BERTopic(
    verbose=True,
    language="multilingual",
    embedding_model=model_sentence,
    min_topic_size=10,  # passed to BERTopic as the minimum topic size
    calculate_probabilities=True
)

In [7]:
# Fit the topic model on the abstracts. `topics` is later iterated as a
# collection of topic ids (id -1 is skipped when scoring); `probs` is only
# persisted. The logged run below took roughly half an hour, most of it spent
# embedding the documents.
topics, probs = model.fit_transform(abstracts)

2024-12-02 22:21:27,041 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1178 [00:00<?, ?it/s]

2024-12-02 22:46:32,238 - BERTopic - Embedding - Completed ✓
2024-12-02 22:46:32,238 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-02 22:47:06,097 - BERTopic - Dimensionality - Completed ✓
2024-12-02 22:47:06,097 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-02 22:53:27,265 - BERTopic - Cluster - Completed ✓
2024-12-02 22:53:27,286 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-02 22:53:33,861 - BERTopic - Representation - Completed ✓


In [22]:
# Persist the fitted model and both fit outputs so they can be reloaded
# without repeating the fit. The value displayed by the cell is the return
# value of the last dump call.
joblib.dump(model, bertopic_model_path)
joblib.dump(topics, topics_path)
joblib.dump(probs, probs_path)

['C:/Users/Krlozz/Documents/Tesis/TesisFinal/Model/probs_TB.pkl']

In [23]:
# Report where each artefact was written, one line per artefact.
print(
    f"Modelo BERTopic guardado en: {bertopic_model_path}",
    f"Variable 'topics' guardada en: {topics_path}",
    f"Variable 'probs' guardada en: {probs_path}",
    sep="\n",
)

Modelo BERTopic guardado en: C:/Users/Krlozz/Documents/Tesis/TesisFinal/Model/bertopic_model
Variable 'topics' guardada en: C:/Users/Krlozz/Documents/Tesis/TesisFinal/Model/topics_TB.pkl
Variable 'probs' guardada en: C:/Users/Krlozz/Documents/Tesis/TesisFinal/Model/probs_TB.pkl


In [24]:
# Reload the fitted topic model under a separate name; the scoring loop below
# uses `model_bertopic`, not `model`. joblib.load unpickles the file, so only
# load trusted artefacts.
model_bertopic = joblib.load(bertopic_model_path)

In [25]:
# Reload the fit outputs saved above (rebinding `topics` and `probs`).
topics = joblib.load(topics_path)
probs = joblib.load(probs_path)

In [26]:
# Confirm that both variables were restored, one message per line.
print("Variable 'topics' cargada.", "Variable 'probs' cargada.", sep="\n")

Variable 'topics' cargada.
Variable 'probs' cargada.


In [27]:
# Accumulator for one result record per group, filled by the scoring loop.
recommendations = list()

In [28]:
# Score every topic against every group and keep the 20 best topics per group.
#
# For a group, the score of a topic is
#     max over (group keyword, topic keyword) pairs of dot(embedding, embedding)
# multiplied by the group's `index_novelty`.
#
# NOTE(review): the similarity is a raw dot product of the encoder outputs, not
# a cosine similarity; the stored scores depend on embedding norms. Left as-is
# so previously exported scores stay comparable — confirm this is intended.
# NOTE(review): np.dot is applied to the encoder's tensor output; this presumes
# the tensors live on the CPU — confirm if the model is ever moved to a GPU.

# Start from an empty accumulator so re-running this cell cannot append a
# second copy of every group.
recommendations = []

# Topic keywords and their embeddings do not depend on the group, so they are
# computed once here instead of once per (group, topic) pair.
# Sorting the ids makes the tie order of equal scores deterministic.
topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
topic_keywords_by_id = {}
topic_embedding_by_id = {}
for topic_id in topic_ids:
    topic_keywords = model_bertopic.get_topic(topic_id)
    topic_keywords_by_id[topic_id] = topic_keywords
    topic_embedding_by_id[topic_id] = model_sentence.encode(
        [kw for kw, _ in topic_keywords], convert_to_tensor=True
    )

for idx, row in data.iterrows():
    group_keywords = row['keywords'].split(',') if pd.notna(row['keywords']) else []
    index_novelty = row['index_novelty']
    id_group = row['id_group']
    authors = row['authors']

    if not group_keywords:
        # No keywords for this group: nothing can be scored. Keep the row (with
        # empty results) so the output still has one record per input group.
        recommendations.append({
            "id_group": id_group,
            "authors": authors,
            "index_novelty": index_novelty,
            "topics": [],
            "score_final": []
        })
        continue

    group_embedding = model_sentence.encode(group_keywords, convert_to_tensor=True)

    topic_info = []
    for topic_id in topic_ids:
        topic_embedding = topic_embedding_by_id[topic_id]

        # Best-matching keyword pair between the group and the topic.
        similarity = np.dot(group_embedding, topic_embedding.T).max().item()

        final_score = similarity * (index_novelty)
        topic_info.append((topic_id, final_score))

    # Highest score first; keep the top 20 topics.
    topic_info = sorted(topic_info, key=lambda x: x[1], reverse=True)[:20]

    recommended_topics = [topic_keywords_by_id[topic_id] for topic_id, _ in topic_info]
    scores = [score for _, score in topic_info]

    recommendations.append({
        "id_group": id_group,
        "authors": authors,
        "index_novelty": index_novelty,
        "topics": recommended_topics,
        "score_final": scores
    })

In [29]:
# One row per group record; the 'topics' and 'score_final' cells hold lists.
recommendations_df = pd.DataFrame(recommendations)

In [46]:
# Preview the per-group topic recommendations.
recommendations_df

Unnamed: 0,id_group,authors,index_novelty,topics,score_final
0,group1,"7403483234, 57200218104, 6603155100",0.164957,"[[(accidents, 0.059561232558306254), (traffic,...","[7.579230178692796, 7.159059091004564, 6.65685..."
1,group2,"24734179100, 7003513941, 7005237183",0.228035,"[[(pigs, 0.05495710726959359), (piglets, 0.034...","[10.740213204328938, 10.740213204328938, 8.469..."
2,group3,"24734179100, 7005237183, 7102187693",0.417438,"[[(epilepsy, 0.06778589132666105), (seizures, ...","[11.450527233615503, 11.165373317306148, 11.16..."
3,group4,"7003513941, 7005237183, 7102187693",0.227795,"[[(pigs, 0.05495710726959359), (piglets, 0.034...","[10.728905346782083, 10.728905346782083, 8.460..."
4,group5,"24734179100, 7003513941, 7102187693",0.225247,"[[(pigs, 0.05495710726959359), (piglets, 0.034...","[10.608903936976272, 10.608903936976272, 8.365..."
...,...,...,...,...,...
17619,group17673,"50361055400, 57437439400, 57438019100, 5722298...",0.257743,"[[(farms, 0.04311967722380283), (livestock, 0....","[9.627254444952994, 9.083607675762776, 8.87781..."
17620,group17674,"50361055400, 45961087900, 57437439400, 5743801...",0.278523,"[[(wound, 0.0483006429538537), (healing, 0.042...","[12.709334961936293, 10.403442507507862, 10.17..."
17621,group17675,"50361055400, 45961087900, 57437439400, 5722298...",0.276853,"[[(wound, 0.0483006429538537), (healing, 0.042...","[12.633114451383333, 10.341050910952616, 10.11..."
17622,group17676,"50361055400, 45961087900, 57438019100, 5722298...",0.276853,"[[(wound, 0.0483006429538537), (healing, 0.042...","[12.633114451383333, 10.341050910952616, 10.11..."


In [47]:
# Export the per-group topic recommendations without the index column. The
# list-valued 'topics' column is written as text; the second part of this
# notebook parses it back with ast.literal_eval.
recommendations_df.to_csv(output_csv_path, index=False)
print(f"Recomendaciones guardadas en: {output_csv_path}")

Recomendaciones guardadas en: C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData/group_recommendations.csv


# Recommendations for groups

In [32]:
import ast
from keybert import KeyBERT

In [33]:
# Both files of this section sit in the processed-data folder of the project.
processed_data_dir = "C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData"

# Per-group topic recommendations produced by the first part of the notebook.
input_csv_path = processed_data_dir + "/group_recommendations.csv"
# Final table with one key-phrase string per group.
output_csv_path_recommendations = processed_data_dir + "/recommendations.csv"

In [34]:
# Read the per-group topic recommendations back from CSV.
# NOTE: this rebinds `data`, which earlier in the notebook held the group table
# read from data_path.
data = pd.read_csv(input_csv_path)

In [35]:
# Preview the reloaded table; 'topics' and 'score_final' are now plain strings.
data

Unnamed: 0,id_group,authors,index_novelty,topics,score_final
0,group1,"7403483234, 57200218104, 6603155100",0.164957,"[[('accidents', 0.059561232558306254), ('traff...","[7.579230178692796, 7.159059091004564, 6.65685..."
1,group2,"24734179100, 7003513941, 7005237183",0.228035,"[[('pigs', 0.05495710726959359), ('piglets', 0...","[10.740213204328938, 10.740213204328938, 8.469..."
2,group3,"24734179100, 7005237183, 7102187693",0.417438,"[[('epilepsy', 0.06778589132666105), ('seizure...","[11.450527233615503, 11.165373317306148, 11.16..."
3,group4,"7003513941, 7005237183, 7102187693",0.227795,"[[('pigs', 0.05495710726959359), ('piglets', 0...","[10.728905346782083, 10.728905346782083, 8.460..."
4,group5,"24734179100, 7003513941, 7102187693",0.225247,"[[('pigs', 0.05495710726959359), ('piglets', 0...","[10.608903936976272, 10.608903936976272, 8.365..."
...,...,...,...,...,...
17619,group17673,"50361055400, 57437439400, 57438019100, 5722298...",0.257743,"[[('farms', 0.04311967722380283), ('livestock'...","[9.627254444952994, 9.083607675762776, 8.87781..."
17620,group17674,"50361055400, 45961087900, 57437439400, 5743801...",0.278523,"[[('wound', 0.0483006429538537), ('healing', 0...","[12.709334961936293, 10.403442507507862, 10.17..."
17621,group17675,"50361055400, 45961087900, 57437439400, 5722298...",0.276853,"[[('wound', 0.0483006429538537), ('healing', 0...","[12.633114451383333, 10.341050910952616, 10.11..."
17622,group17676,"50361055400, 45961087900, 57438019100, 5722298...",0.276853,"[[('wound', 0.0483006429538537), ('healing', 0...","[12.633114451383333, 10.341050910952616, 10.11..."


In [36]:
# Key-phrase extractor used by extract_phrases_with_keybert below. No model is
# passed, so KeyBERT falls back to its own default embedding model.
# NOTE(review): that default is presumably not the sentence model used for the
# topics above — confirm this mismatch is acceptable.
kw_model = KeyBERT()

In [37]:
# Key-phrase settings: phrase length bounds in words, and number of phrases requested.
ngram_range, top_n = (1, 3), 20

In [39]:
def extract_phrases_with_keybert(topics_column, ngram_range, top_n, model=None):
    """Turn each group's recommended topics into one comma-separated phrase string.

    Each entry of ``topics_column`` is the text form of a list of topics, where
    a topic is a list of ``(keyword, weight)`` pairs. The keywords of all topics
    of an entry are joined into one text, key phrases are extracted from that
    text, and the distinct phrases are joined with ``", "``.

    Args:
        topics_column: Iterable of strings parseable by ``ast.literal_eval``.
            Entries that are not strings (e.g. NaN from a CSV) or are blank
            produce an empty result string instead of raising.
        ngram_range: ``(min_n, max_n)`` forwarded as ``keyphrase_ngram_range``.
        top_n: Number of key phrases requested per entry.
        model: Optional object exposing ``extract_keywords`` with the KeyBERT
            signature. Defaults to the notebook-level ``kw_model``.

    Returns:
        A list of strings, one per entry of ``topics_column``, in input order.
        Phrases keep the order in which the extractor returned them, so the
        output is the same on every run.

    Raises:
        ValueError / SyntaxError: if a non-blank entry is not a valid literal.
    """
    extractor = kw_model if model is None else model

    # Many groups share exactly the same topics text; extract phrases once per
    # distinct text and reuse the result.
    phrases_by_topics_str = {}
    recommendations = []

    for topics_str in topics_column:
        if not isinstance(topics_str, str) or not topics_str.strip():
            recommendations.append("")
            continue

        if topics_str not in phrases_by_topics_str:
            topics = ast.literal_eval(topics_str)

            combined_keywords = [kw for topic in topics for kw, _ in topic]
            text = " ".join(combined_keywords)

            if text.strip():
                keyphrases = extractor.extract_keywords(
                    text,
                    keyphrase_ngram_range=ngram_range,
                    top_n=top_n,
                    use_maxsum=True,
                    # The candidate pool must be at least as large as top_n,
                    # otherwise max-sum selection cannot return top_n phrases.
                    nr_candidates=max(20, top_n),
                    # NOTE(review): per the KeyBERT docs, `diversity` is only
                    # used together with use_mmr=True; kept to leave the call
                    # unchanged.
                    diversity=0.80
                )
            else:
                keyphrases = []

            # dict.fromkeys removes duplicates while keeping first-seen order
            # (a set would give a different order from run to run).
            unique_phrases = dict.fromkeys(phrase for phrase, _ in keyphrases)
            phrases_by_topics_str[topics_str] = ", ".join(unique_phrases)

        recommendations.append(phrases_by_topics_str[topics_str])

    return recommendations

In [40]:
# Add one key-phrase string per group, derived from that group's 'topics' text.
data['recommendations_group'] = extract_phrases_with_keybert(data['topics'], ngram_range=ngram_range, top_n=top_n)

In [41]:
# Keep only the columns that go into the final export, in this order.
final_data = data[['id_group', 'authors', 'recommendations_group', 'index_novelty']]

In [42]:
# Preview the final per-group recommendation table.
final_data

Unnamed: 0,id_group,authors,recommendations_group,index_novelty
0,group1,"7403483234, 57200218104, 6603155100","earthquakes, accident driving vehicle, acciden...",0.164957
1,group2,"24734179100, 7003513941, 7005237183","pigs, livestock, breeds, animals pathogenic, p...",0.228035
2,group3,"24734179100, 7005237183, 7102187693","patients health, medical diseases, epileptic c...",0.417438
3,group4,"7003513941, 7005237183, 7102187693","pigs, livestock, breeds, animals pathogenic, p...",0.227795
4,group5,"24734179100, 7003513941, 7102187693","pigs, livestock, breeds, animals pathogenic, p...",0.225247
...,...,...,...,...
17619,group17673,"50361055400, 57437439400, 57438019100, 5722298...","cattle, livestock, cattle herds herd, herds he...",0.257743
17620,group17674,"50361055400, 45961087900, 57437439400, 5743801...","sperm cows, cattle, herd bovine, painful disab...",0.278523
17621,group17675,"50361055400, 45961087900, 57437439400, 5722298...","sperm cows, cattle, herd bovine, painful disab...",0.276853
17622,group17676,"50361055400, 45961087900, 57438019100, 5722298...","sperm cows, cattle, herd bovine, painful disab...",0.276853


In [45]:
# Write the final table without the index column and report the destination.
final_data.to_csv(output_csv_path_recommendations, index=False)
print(f"Archivo guardado en: {output_csv_path_recommendations}")

Archivo guardado en: C:/Users/Krlozz/Documents/Tesis/TesisFinal/ProcessedData/recommendations.csv
