In [None]:
!pip install langdetect
!python -m spacy download de_core_news_sm
!pip install --upgrade openai


Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import os
import re
import nltk
import numpy as np
import spacy
import unicodedata
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from langdetect import detect
import pandas as pd

In [None]:
from openai import OpenAI



In [None]:
# Make sure the NLTK stop-word corpora are available locally.
nltk.download('stopwords')

# --- Global parameters -------------------------------------------------------
# LDA settings
n_topics, n_iter, learning_decay = 5, 2500, 0.6
# TF-IDF vocabulary limits
max_features, max_df, min_df = 5000, 0.75, 2
# Number of words shown per topic
n_top_words = 100

# --- spaCy pipelines ---------------------------------------------------------
nlp_en = spacy.load('en_core_web_sm')
nlp_de = spacy.load('de_core_news_sm')
# Raise the per-document character limit on both pipelines.
nlp_en.max_length = nlp_de.max_length = 5_000_000

# Extended, corpus-specific stop words (boilerplate, places, months and very
# frequent domain terms) that are removed in addition to the NLTK lists.
extra_stopwords = {
    'al', 'biomarker', 'fluorescent', 'canis', 'keywords', 'conclusion', 'dogmammary', 'method',
    'lupus', 'carotid', 'subject', 'paper', 'study', 'neoplasia', 'hdmi', 'granuloma', 'august',
    'carcinoma', 'authors', 'discussion', 'microscopy', 'fluorescence', 'bratislava', 'slicing',
    'asthma', 'ptx', 'pathologist', 'myocardium', 'slide', 'transfection', 'malignancy', 'https',
    'immunohistochemistry', 'benign', 'metastasis', 'pathology', 'gmbh', 'harvest', 'phosphatase',
    'stage', 'clinics', 'centrifugation', 'department', 'publication', 'january', 'model', 'biomedicine',
    'email', 'procedure', 'receptor', 'microvascular', 'oncology', 'abstract', 'result', 'gct',
    'osteoporosis', 'march', 'clinic', 'copyright', 'cancer', 'hsp', 'states', 'tumor', 'april',
    'osteoblast', 'chemoresistance', 'nephropathy', 'hospital', 'moscow', 'analysis', 'aminelevulinic',
    'biophysic', 'medication', 'staining', 'et', 'war', 'therapie', 'malignant', 'canine', 'ukraine',
    'stroke', 'laboratory', 'graph', 'none', 'rights', 'caninemammary', 'reserved', 'tehran', 'radiotherapy',
    'veterinary', 'diagnostic', 'ischaemic', 'grade', 'granule', 'microscope', 'tumour', 'united',
    'overnight', 'group', 'histology', 'lysate', 'metastatic', 'dog', 'november', 'february', 'sclerose',
    'december', 'breastcancer', 'psf', 'figure', 'reperfusion', 'craiova', 'methodology', 'tables',
    'october', 'mammary', 'data', 'expression', 'diagnosis', 'lysis', 'doi', 'association', 'biopsy',
    'denmark', 'research', 'dose', 'micropapillary', 'year', 'preprint', 'dept', 'histopathology', 'mellitus',
    'phenotype', 'risk', 'sample', 'fibrosis', 'clinical', 'antibody', 'aortic', 'torino', 'biomarkers',
    'tumorexpression', 'significant', 'manchester', 'genetic', 'stockholm', 'measurement', 'progression',
    'patienten', 'survival', 'therapy', 'lungcancer', 'control', 'gfp', 'september', 'response', 'www', 'msc',
    'invasive', 'fig', 'trial', 'iran', 'blotting', 'introduction', 'inflammation', 'gene', 'protein',
    'slovenia', 'southampton', 'journal', 'treatment', 'freiburg', 'cell', 'cardiologist', 'pmcid', 'results',
    'smoker', 'june', 'galway', 'tsklinikum', 'immunohisto', 'imaging', 'license', 'level', 'author',
    'protocol', 'tissue', 'university', 'experiment', 'incubation', 'epilepsy', 'cirrhosis', 'july', 'volunteer',
    'chemotherapy', 'breast', 'perra', 'ppix', 'may', 'immune', 'purification', 'mutation', 'mri', 'pmid',
    'technique', 'histochemical', 'effect', 'psoriasis', 'vienna', 'lille', 'surgery', 'table', 'methods',
    'http', 'target', 'associate', 'network', 'detection', 'dataset', 'performance', 'value', 'set', 'anti', 'accuracy',
    'predict', 'pathway', 'signal', 'segmentation', 'detect', 'annotation', 'tool', 'therapeutic', 'information',
    'demonstrate', 'dna', 'section', 'score', 'deep', 'identify', 'drug', 'negative', 'medical', 'background',
    'induce', 'enhance', 'melanoma', 'medicine', 'promote', 'status', 'lung', 'performance',
    'positive', 'perform', 'disclosure', 'ihc', 'cation', 'test', 'mitotic',
    'stain', 'mouse', 'lesion', 'classification', 'learning', 'algorithm',
    'digital', 'design', 'macrophage', 'classi', 'molecular', 'node',
    'center', 'training', 'rate', 'tnbc', 'area', 'subtype', 'wang',
    'specific', 'nuclear', 'application', 'evaluate', 'present', 'train',
    'signi', 'role', 'lymph', 'radiomic', 'growth', 'mean', 'primary',
    'nanoparticle', 'specimen', 'tme', 'marker', 'cohort', 'difference',
    'inhibitor', 'assess', 'immunotherapy', 'change', 'tion', 'image', 'label', 'process',
    'evolution', 'technology'
}

# Generic English and German stop words shipped with NLTK.
stopwords_en = set(stopwords.words('english'))
stopwords_de = set(stopwords.words('german'))

# Single combined set: NLTK English + NLTK German + the custom list above.
stop_words = stopwords_en | stopwords_de | extra_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:


def _normalize_abstract(text):
    """Turn one raw 'abstract' cell into a single clean line ('' for non-strings)."""
    if not isinstance(text, str):
        # Empty / NaN cells: keep a placeholder so positions still match the rows.
        return ""
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'[\r\n\t]+', ' ', text)
    # Drop every character whose Unicode category starts with 'C'
    # (control, format, unassigned, ...).
    text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')
    # Allowlist of characters; anything else becomes a space. The German
    # letters ä/ö/Ä/Ö/ß are kept alongside ü/Ü so German words are not split
    # apart before they reach the later preprocessing step.
    text = re.sub(r'[^a-zA-Z0-9áéíóúÁÉÍÓÚäöüÄÖÜßñÑ.,;:()\-\'\" ]+', ' ', text)
    # Collapse runs of whitespace and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()


def load_text_from_dataset(dataset_path):
    """Read the 'abstract' column of an Excel file and return normalized texts.

    Args:
        dataset_path: Path of the Excel workbook; it must contain an
            'abstract' column.

    Returns:
        A list with one string per row, in row order. Cells that are not
        strings yield "" so the list length equals the number of rows.

    Raises:
        KeyError: If the workbook has no 'abstract' column.
    """
    df = pd.read_excel(dataset_path)
    return [_normalize_abstract(text) for text in df['abstract'].tolist()]

def clean_text_remove_metadata(text):
    """Remove metadata-like content (e-mails, URLs, headings) from a text.

    Multi-line input is filtered line by line: a line is dropped when it
    contains an e-mail address or a URL, is shorter than 30 characters, or
    consists of more than 50% uppercase characters.

    Single-line input (no newline at all) has no line structure to filter on,
    so dropping "the line" would discard the whole document because of one
    URL or e-mail token. In that case only the offending tokens are removed;
    the length and uppercase checks still apply to what remains.

    Args:
        text: The raw text.

    Returns:
        The surviving lines joined by single spaces ('' if nothing survives).
    """
    lines = text.split('\n')
    single_line = len(lines) == 1
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        if single_line:
            # Strip just the e-mail / URL tokens and keep the surrounding prose.
            stripped = re.sub(r'\S+@\S+', ' ', line)
            stripped = re.sub(r'http\S+', ' ', stripped)
            if stripped != line:
                line = ' '.join(stripped.split())
        elif re.search(r'\S+@\S+', line) or re.search(r'http\S+', line):
            # Lines carrying e-mails or URLs are skipped entirely.
            continue
        if len(line) < 30:  # Skip short lines
            continue
        if sum(1 for c in line if c.isupper()) > len(line) * 0.5:  # Skip mostly-uppercase lines
            continue
        cleaned_lines.append(line)
    return ' '.join(cleaned_lines)

def preprocess_text(text):
    """Clean, lowercase and lemmatize a text, returning the kept lemmas.

    The text is passed through clean_text_remove_metadata, every character
    that is not an ASCII/German letter or whitespace is replaced by a space,
    and the lowercased result is run through the English spaCy pipeline.
    Lemmas that are in stop_words or have two characters or fewer are dropped.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    body = clean_text_remove_metadata(text)
    # Replace non-alphabetic characters, then lowercase.
    letters_only = re.sub(r'[^a-zA-ZäöüÄÖÜß\s]', ' ', body).lower()

    kept_lemmas = []
    for token in nlp_en(letters_only):
        lemma = token.lemma_
        if lemma not in stop_words and len(lemma) > 2:
            kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

def train_lda(X):
    """Fit and return a LatentDirichletAllocation model on the matrix X.

    The number of topics, the iteration limit and the learning decay come from
    the module-level n_topics, n_iter and learning_decay; random_state is
    fixed to 42 and n_jobs is -1.

    NOTE(review): scikit-learn describes learning_decay as a parameter of the
    online learning method, and no learning_method is set here. Confirm the
    value has the intended effect.
    """
    lda_settings = {
        'n_components': n_topics,
        'max_iter': n_iter,
        'learning_decay': learning_decay,
        'random_state': 42,
        'n_jobs': -1,
    }
    model = LatentDirichletAllocation(**lda_settings)
    model.fit(X)
    return model

def vectorize_texts(texts):
    """Fit a unigram+bigram TF-IDF vectorizer on texts.

    Vocabulary limits come from the module-level max_df, min_df and
    max_features.

    Returns:
        A (matrix, vectorizer) tuple: the fitted-and-transformed document-term
        matrix and the fitted TfidfVectorizer.
    """
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=max_features,
        max_df=max_df,
        min_df=min_df,
    )
    matrix = tfidf.fit_transform(texts)
    return matrix, tfidf

def print_top_words(model, feature_names):
    """Print, for each topic, its heaviest terms in descending weight order.

    The number of terms per topic is the module-level n_top_words.
    """
    for number, weights in enumerate(model.components_, start=1):
        # Indices sorted by descending weight, cut to the first n_top_words.
        ranked = weights.argsort()[::-1][:n_top_words]
        print(f"Topic #{number}:")
        print(" ".join(feature_names[i] for i in ranked))
        print()

def get_dominant_topics(lda_model, X):
    """Return, per row of X, the index of the topic with the largest weight.

    The weights are whatever lda_model.transform(X) returns; the result is
    its argmax along axis 1.
    """
    return lda_model.transform(X).argmax(axis=1)

def count_articles_per_topic(dominant_topics):
    """Count how many entries of dominant_topics fall on each topic index.

    Returns:
        A list of length n_topics (module-level) whose i-th element is the
        number of occurrences of topic index i.
    """
    tallies = [0 for _ in range(n_topics)]
    for winner in dominant_topics:
        tallies[winner] = tallies[winner] + 1
    return tallies

def detect_language(text):
    """Return the language code detected for text, or 'unknown' on failure.

    Any ordinary exception raised by the detector (for example on empty or
    non-textual input) yields 'unknown'. KeyboardInterrupt and SystemExit are
    deliberately not swallowed, so a long detection loop can still be
    interrupted.
    """
    try:
        return detect(text)
    except Exception:
        return 'unknown'

def get_topic_distributions(lda_model, X):
    """Return lda_model.transform(X), i.e. the per-document topic weights."""
    return lda_model.transform(X)

def analyze_topic_distributions(topic_distributions):
    """Print each distribution with its dominant topic and tally the winners.

    Returns:
        A list of length n_topics (module-level) with the number of
        distributions whose argmax is each topic index.
    """
    tallies = [0 for _ in range(n_topics)]
    for row in topic_distributions:
        winner = row.argmax()
        tallies[winner] += 1
        # Topic numbers are printed 1-based.
        print(f"Distribución de temas: {row} -> Tema dominante: {winner+1}")
    return tallies

def calculate_coherence(lda_model, vectorizer, X, n_top_words=10):
    """Compute the notebook's coherence score for the topics of an LDA model.

    For every topic, the n_top_words heaviest terms are taken and, for each
    unordered pair of them, the product of the two terms' mean column values
    in X is added to the topic's score. The returned value is the mean of the
    per-topic scores, which is also printed.

    NOTE(review): this is a product of per-term averages, not a co-occurrence
    based measure (such as UMass or NPMI). Confirm that this is the metric
    that is actually wanted.

    Args:
        lda_model: Fitted model exposing components_.
        vectorizer: Fitted vectorizer exposing get_feature_names_out() and
            vocabulary_.
        X: Sparse document-term matrix (column slices must support
            .toarray()).
        n_top_words: Number of top terms considered per topic.

    Returns:
        The average score over all topics.
    """
    feature_names = vectorizer.get_feature_names_out()
    vocabulary = vectorizer.vocabulary_

    # Most representative words of each topic, heaviest first.
    topics = [
        [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for topic in lda_model.components_
    ]

    # The mean of a column depends only on the word, so compute it once per
    # word instead of densifying the column again for every pair.
    column_means = {}

    def mean_weight(word):
        if word not in column_means:
            idx = vocabulary.get(word, -1)
            column_means[word] = None if idx == -1 else np.mean(X[:, idx].toarray())
        return column_means[word]

    coherence_scores = []
    for topic_words in topics:
        coherence_score = 0
        for i, word1 in enumerate(topic_words):
            weight1 = mean_weight(word1)
            if weight1 is None:
                # Word unknown to the vectorizer: none of its pairs count.
                continue
            for word2 in topic_words[i + 1:]:
                weight2 = mean_weight(word2)
                if weight2 is not None:
                    coherence_score += weight1 * weight2
        coherence_scores.append(coherence_score)

    # Average the scores of all topics.
    average_coherence = np.mean(coherence_scores)
    print(f"Coherencia promedio de los temas: {average_coherence}")

    return average_coherence

def generate_title_with_gpt3(keywords, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model for a short title summarizing keywords.

    The API key is read from the OPENAI_API_KEY environment variable instead
    of being embedded in the notebook. Any failure (missing key, network or
    API error, empty answer) is reported on stdout and a simple fallback
    title built from the first three keywords is returned.

    Args:
        keywords: Sequence of keyword strings describing one topic.
        model: Name of the chat model to query.

    Returns:
        The generated title, or the fallback "Tema sobre ..." string.
    """
    # Build the user message for the conversation.
    prompt = f"Tomando en cuenta las siguientes palabras clave resúmelas en un título de pocas palabras que las aborden {', '.join(keywords)}"
    try:
        # Created inside the try block so a missing or invalid key falls back
        # to the keyword-based title instead of aborting the whole run.
        client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY"),
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Eres un asistente que genera títulos concisos para temas de investigación."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=60,      # Cap the length of the answer
            n=1,                # A single completion is requested
            temperature=0.7     # Sampling temperature
        )
        # Extract and return the generated title.
        title = response.choices[0].message.content.strip()
        return title
    except Exception as e:
        print(f"Error al generar título con GPT-3: {e}")
        # Fallback: simple title from the leading keywords.
        return f"Tema sobre {', '.join(keywords[:3])}"

def generate_topic_titles_with_gpt3(lda_model, feature_names, n_top_words=10):
    """Build one "Topic #k: <title>" string per topic of lda_model.

    For each row of lda_model.components_ the n_top_words heaviest terms are
    looked up in feature_names and handed to generate_title_with_gpt3.

    Returns:
        A list of title strings, in topic order (numbered from 1).
    """
    def _title_for(position, weights):
        # Indices of the heaviest terms, in descending weight order.
        strongest = weights.argsort()[:-n_top_words - 1:-1]
        words = [feature_names[i] for i in strongest]
        return f"Topic #{position + 1}: {generate_title_with_gpt3(words)}"

    return [
        _title_for(position, weights)
        for position, weights in enumerate(lda_model.components_)
    ]

def assign_topics_to_documents(lda_model, X, topic_titles):
    """Return, for each row of X, the title of its dominant topic.

    The dominant topic is the argmax (axis 1) of lda_model.transform(X); the
    title is looked up by that index in topic_titles.
    """
    winners = lda_model.transform(X).argmax(axis=1)
    return [topic_titles[winner] for winner in winners]


def _locate_group_rows(group_texts, all_texts):
    """Map each text of group_texts to its position in all_texts (in-order scan)."""
    positions = []
    cursor = 0
    for text in group_texts:
        while cursor < len(all_texts) and all_texts[cursor] != text:
            cursor += 1
        if cursor == len(all_texts):
            raise ValueError(
                "No se pudieron alinear los textos del grupo con las filas del dataset; "
                "proporcione 'row_indices' explícitamente."
            )
        positions.append(cursor)
        cursor += 1
    return positions


def process_group(texts, dataset_path, row_indices=None):
    """Run the full topic-modelling pipeline on one group of documents.

    The texts are preprocessed, vectorized and modelled with LDA; the top
    words, the per-document distributions, the per-topic counts and the
    coherence score are printed; a title is generated for every topic; and
    the dataset is written to 'dataset_with_topics.xlsx' with a 'topic'
    column.

    The group may be a filtered subset of the dataset rows (for example only
    the English abstracts). Topics are therefore written to the rows the
    texts came from, and the remaining rows get an empty 'topic' value,
    instead of assigning a shorter list to the whole column.

    Args:
        texts: The documents of the group.
        dataset_path: Path of the Excel file the texts were loaded from.
        row_indices: Optional positions (0-based row numbers) of texts inside
            the dataset, one per text. When omitted and the group does not
            cover every row, the positions are recovered by reloading the
            dataset with load_text_from_dataset and matching the texts in
            order.

    Returns:
        A (lda_model, coherence_score) tuple.

    Raises:
        ValueError: If the texts cannot be matched to dataset rows, or if
            row_indices does not contain exactly one position per text.
    """
    print(f"\nProcesando grupo con {len(texts)} documentos.")
    processed_texts = [preprocess_text(text) for text in texts]
    X, vectorizer = vectorize_texts(processed_texts)
    lda_model = train_lda(X)
    feature_names = vectorizer.get_feature_names_out()
    print_top_words(lda_model, feature_names)

    topic_distributions = get_topic_distributions(lda_model, X)
    topic_counts = analyze_topic_distributions(topic_distributions)

    for i in range(n_topics):
        print(f"Topic #{i+1} tiene {topic_counts[i]} artículos.")

    # Evaluate the coherence of the topics.
    coherence_score = calculate_coherence(lda_model, vectorizer, X, n_top_words)

    # Generate a title for every topic.
    topic_titles = generate_topic_titles_with_gpt3(lda_model, feature_names)

    # Title of the dominant topic of every document in the group.
    topics = assign_topics_to_documents(lda_model, X, topic_titles)

    # Load the original dataset.
    df = pd.read_excel(dataset_path)

    # Work out which dataset row each group document belongs to.
    if row_indices is None:
        if len(topics) == len(df):
            row_indices = range(len(df))
        else:
            row_indices = _locate_group_rows(texts, load_text_from_dataset(dataset_path))
    row_indices = list(row_indices)
    if len(row_indices) != len(topics):
        raise ValueError(
            f"'row_indices' tiene {len(row_indices)} posiciones pero el grupo tiene "
            f"{len(topics)} documentos."
        )

    # Rows outside the group keep an empty topic.
    topic_column = [None] * len(df)
    for position, title in zip(row_indices, topics):
        topic_column[position] = title
    df['topic'] = topic_column

    # Save the updated dataset.
    df.to_excel('dataset_with_topics.xlsx', index=False)

    print(f"Coherencia del modelo LDA: {coherence_score}")

    return lda_model, coherence_score


In [None]:
if __name__ == "__main__":
    # langdetect is non-deterministic by default; fixing the seed keeps the
    # language split (and therefore the modelled subset) stable across runs.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    dataset = 'dataset_con_documento.xlsx'  # Change this to your dataset path
    raw_texts = load_text_from_dataset(dataset)
    langs = [detect_language(text) for text in raw_texts]

    # Only the abstracts detected as English are modelled.
    texts_en = [text for text, lang in zip(raw_texts, langs) if lang == 'en']

    lda_en, coherence_score = process_group(texts_en, dataset)
    print(f"Coherencia del modelo LDA: {coherence_score}")
    print("Dataset con temas asignados guardado como 'dataset_with_topics.xlsx'.")


Procesando grupo con 110 documentos.
Topic #1:
use discuss human like mmp review mitochondrial high lipid remain biological acid transcription feature evs aim challenge include disease alone non outcome improve impact basal like basal base advance compare inflammatory diagnose extracellular knockout mammography understanding long combine well critical translational vivo molecule woman cadherin host discovery prostate interaction factor surface standard enable nucleic nucleic acid future delivery human blbc blbc cross determine increase long term matrix development term either death relate parameter part classical significantly field focus function due overall large wild wild type biology patient radiation characteristic strategy effective one dynamic efficacy utilize mrna reveal size lack approach cellular combination advantage order overview

Topic #2:
peptide oncolytic biosensor resistance material potential base gagnps electrochemical muc neural print strategy relevance oncolytic p