In [1]:
import pandas as pd
from datetime import datetime
import pytz
import os

In [2]:
"""import tensorflow as tf
print(tf.__version__)"""

'import tensorflow as tf\nprint(tf.__version__)'

In [3]:
# Topic phrases of interest, one per entry.
temas = [
    "active aging elderly",
    "recommender systems human learning",
    "ontology movement",
    "human movement language models",
    "tai chi",
    "tai chi ontology",
    "pathology ontology",
]

In [4]:
import glob

# Collect every CSV file in the current working directory. glob returns its
# matches in arbitrary (filesystem-dependent) order, so sort them to keep the
# listing and any later per-file processing order reproducible.
archivos_csv = sorted(glob.glob("*.csv"))

print("Archivos CSV encontrados:")
for archivo in archivos_csv:
    print(archivo)

Archivos CSV encontrados:
arxiv_active_aging_elderly.csv
arxiv_human_movement_language_models.csv
arxiv_recommender_systems_human_learning.csv
scopus_metadata_active_aging_elderly.csv
scopus_metadata_human_movement_language_models.csv
scopus_metadata_ontology_movement.csv
scopus_metadata_pathology_ontology.csv
scopus_metadata_recommender_systems_human_learning.csv
scopus_metadata_tai_chi.csv
scopus_metadata_tai_chi_ontology.csv


In [5]:
# relevance_score, simple
def relevance_score(title, tema):
    """Return 1 if the whole topic phrase occurs in the title, else 0.

    The check is a case-insensitive substring match of the complete ``tema``
    string; individual words of the topic are not matched on their own.

    Args:
        title: Title text. Non-string values (``None``, or the float ``NaN``
            pandas uses for missing cells) are treated as "no match".
        tema: Topic phrase to look for.

    Returns:
        int: 1 when ``tema`` is contained in ``title``, otherwise 0.
    """
    # Missing titles have no ``.lower()``; score them as irrelevant instead of
    # raising AttributeError in the middle of a DataFrame ``apply``.
    if not isinstance(title, str):
        return 0
    return int(tema.lower() in title.lower())

In [6]:
#TF-IDF based scoring + cosine similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_cosine_scoring(df, tema):
    """Score each title by TF-IDF cosine similarity to a topic phrase.

    The titles and the topic are vectorised together (English stop words
    removed), and each title vector is compared with the topic vector.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place: a
            ``tfidf_cosine_score`` column is added or overwritten.
        tema: Topic phrase to compare every title against.

    Returns:
        The same ``df`` object, with the ``tfidf_cosine_score`` column set.
    """
    # Nothing to score: add an empty float column and skip vectorisation.
    if len(df) == 0:
        df["tfidf_cosine_score"] = pd.Series(dtype="float64")
        return df

    # The vectoriser only accepts strings, so missing titles become "".
    titles = df["title"].fillna("").astype(str).to_list()
    corpus = titles + [tema]  # the topic is the last document in the corpus

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Cosine similarity between every title row and the topic row (the last one).
    similarities = cosine_similarity(tfidf_matrix[:-1], tfidf_matrix[-1])

    df["tfidf_cosine_score"] = similarities.flatten()
    return df

In [7]:
# Semantic scoring with sentence embeddings
from sentence_transformers import SentenceTransformer, util

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_sentence_model(model_name):
    """Load a SentenceTransformer once per model name and reuse it afterwards."""
    # Local import keeps this helper usable on its own.
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer(model_name)


def semantic_scoring_per_theme(df, tema, model_name="all-MiniLM-L6-v2"):
    """Score each title by embedding cosine similarity to a topic phrase.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place: a
            ``score_transformers`` column is added or overwritten.
        tema: Topic phrase to compare every title against.
        model_name: Name passed to ``SentenceTransformer``. The loaded model is
            cached, so repeated calls with the same name reuse it.

    Returns:
        The same ``df`` object, with the ``score_transformers`` column set.
    """
    # Nothing to score: add an empty float column and skip loading the model.
    if len(df) == 0:
        df["score_transformers"] = pd.Series(dtype="float64")
        return df

    from sentence_transformers import util

    model = _load_sentence_model(model_name)

    # The encoder is given strings only, so missing titles become "".
    titles = df["title"].fillna("").astype(str).to_list()
    title_embeddings = model.encode(titles, convert_to_tensor=True)
    tema_embedding = model.encode([tema], convert_to_tensor=True)

    # Similarity matrix with one row per title and a single topic column.
    cos_sim = util.cos_sim(title_embeddings, tema_embedding)

    df["score_transformers"] = cos_sim[:, 0].cpu().numpy()
    return df


2026-01-14 21:47:36.405028: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Load each metadata CSV, score its titles against the topic encoded in the
# file name, and write the scored rows plus summary statistics to Excel.
list_df = []  # not populated by this loop

# The Excel writer does not create missing folders, so make sure the output
# directory exists before the first workbook is written.
os.makedirs("scores", exist_ok=True)

for i in archivos_csv:
    df = pd.read_csv(i)
    if 'published' in df.columns:
        # Timestamps are UTC ("...Z"). Convert to local time before taking the
        # year so entries published around New Year get the local year.
        published_utc = pd.to_datetime(df["published"], format="%Y-%m-%dT%H:%M:%SZ", utc=True)
        df['year'] = published_utc.dt.tz_convert("Europe/Madrid").dt.year  # change the zone if needed
    print(i, len(df), min(df['year']))

    # The topic is the file name without extension and source prefix,
    # with underscores turned back into spaces.
    tema = i.replace(".csv", "").replace("arxiv_", "").replace("scopus_metadata_", "").replace("_", " ")
    print(tema)

    df["simple score"] = df["title"].apply(relevance_score, args=(tema,))
    df = tfidf_cosine_scoring(df, tema)
    df = semantic_scoring_per_theme(df, tema)

    # Descriptive statistics over every column
    stats = df.describe(include='all')

    # Output workbook name, derived from the CSV name
    output_file = os.path.join("scores", f"scored_{i.replace('.csv', '.xlsx')}")

    # Save to Excel with two sheets: the scored rows and their statistics
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        df.to_excel(writer, sheet_name="data", index=False)
        stats.to_excel(writer, sheet_name="describe")

arxiv_active_aging_elderly.csv 50 2021
active aging elderly
arxiv_human_movement_language_models.csv 555 2023
human movement language models
arxiv_recommender_systems_human_learning.csv 550 2024
recommender systems human learning
scopus_metadata_active_aging_elderly.csv 525 2025
active aging elderly
scopus_metadata_human_movement_language_models.csv 525 2023
human movement language models
scopus_metadata_ontology_movement.csv 500 2025
ontology movement
scopus_metadata_pathology_ontology.csv 500 2025
pathology ontology
scopus_metadata_recommender_systems_human_learning.csv 464 2022
recommender systems human learning
scopus_metadata_tai_chi.csv 525 2024
tai chi
scopus_metadata_tai_chi_ontology.csv 1 2022
tai chi ontology


In [None]:
# `df` is whatever the loop above assigned last, so this summarises only the
# final CSV that was processed, not all files.
df.describe()

Unnamed: 0,scopus_id,year,simple score,tfidf_cosine_score,score_transformers
count,1.0,1.0,1.0,1.0,1.0
mean,85140630000.0,2022.0,0.0,0.197853,0.376634
std,,,,,
min,85140630000.0,2022.0,0.0,0.197853,0.376634
25%,85140630000.0,2022.0,0.0,0.197853,0.376634
50%,85140630000.0,2022.0,0.0,0.197853,0.376634
75%,85140630000.0,2022.0,0.0,0.197853,0.376634
max,85140630000.0,2022.0,0.0,0.197853,0.376634
