In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from functools import reduce
import pathlib
import pickle
import glob
import tomotopy as tp
from scipy.sparse import issparse
from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
path_models = "/export/usuarios_ml4ds/lbartolome/Repos/my_repos/UserInLoopHTM/data/models_v3"
datasets = ["cordis", "cancer", "ai"]
n_words = 15

## Functions

In [3]:
"""
def compute_CLNPMI(parent_diff_words, child_diff_words, all_bow, vocab):
    npmi_list = list()

    for p_w in parent_diff_words:
        flag_n = all_bow[:, vocab[p_w]] > 0
        p_n = np.sum(flag_n) / len(all_bow)

        for c_w in child_diff_words:
            flag_l = all_bow[:, vocab[c_w]] > 0
            p_l = np.sum(flag_l)
            p_nl = np.sum(flag_n * flag_l)

            if p_nl == len(all_bow):
                npmi_score = 1
            else:
                p_l = p_l / len(all_bow)
                p_nl = p_nl / len(all_bow)
                p_nl += 1e-10
                npmi_score = np.log(p_nl / (p_l * p_n)) / -np.log(p_nl)

            npmi_list.append(npmi_score)

    return npmi_list
"""
def compute_CLNPMI(parent_diff_words, child_diff_words, all_bow, vocab):
    npmi_list = list()

    # Longitud de la matriz
    total_docs = all_bow.shape[0]

    for p_w in parent_diff_words:
        # Índice de la palabra "parent" en el vocabulario
        p_idx = vocab[p_w]

        # Filtramos documentos donde aparece la palabra p_w
        if issparse(all_bow):
            flag_n = all_bow[:, p_idx].toarray().flatten() > 0
        else:
            flag_n = all_bow[:, p_idx] > 0
        p_n = np.sum(flag_n) / total_docs

        for c_w in child_diff_words:
            # Índice de la palabra "child" en el vocabulario
            try:
                c_idx = vocab[c_w]
                # Filtramos documentos donde aparece la palabra c_w
                if issparse(all_bow):
                    flag_l = all_bow[:, c_idx].toarray().flatten() > 0
                else:
                    flag_l = all_bow[:, c_idx] > 0
                p_l = np.sum(flag_l)
    
                # Coincidencia entre documentos que contienen p_w y c_w
                p_nl = np.sum(flag_n & flag_l)
                if p_l == 0:
                    import pdb; pdb.set_trace()
    
                if p_nl == total_docs:
                    npmi_score = 1
                else:
                    p_l = p_l / total_docs
                    p_nl = p_nl / total_docs
                    p_nl += 1e-10
                    npmi_score = np.log(p_nl / (p_l * p_n)) / -np.log(p_nl)

                # if p_l = 0 implica que la palabra no aparece en ningún documento
                
                npmi_list.append(npmi_score)
    
            except Exception as e:
                print(e)
                print(f"word {c_w} not in vocab")

    return npmi_list


def compute_TD(texts):
    K = len(texts)
    T = len(texts[0].split())
    vectorizer = CountVectorizer()
    counter = vectorizer.fit_transform(texts).toarray()

    TF = counter.sum(axis=0)
    TD = (TF == 1).sum() / (K * T)

    return TD

def get_CLNPMI(PC_pair_groups, all_bow, vocab):
    CNPMI_list = list()
    for group in tqdm(PC_pair_groups):
        layer_CNPMI = []
    
       
        parent_words = set(group[0])
        child_words = set(group[1])

        inter = parent_words.intersection(child_words)
        parent_diff_words = list(parent_words.difference(inter))
        child_diff_words = list(child_words.difference(inter))
        
        npmi_list = compute_CLNPMI(parent_diff_words, child_diff_words, all_bow, vocab)

        # Handle repetitive word pair NPMI assignments if necessary
        num_repetition = (
            len(parent_words) - len(parent_diff_words)
        ) * (len(child_words) - len(child_diff_words))
        npmi_list.extend([-1] * num_repetition)

        layer_CNPMI.extend(npmi_list)
    
        CNPMI_list.append(np.mean(layer_CNPMI))  # Append the group's result


    return CNPMI_list

def compute_diff_topic_pair(topic_words_a, topic_words_b):
    word_counter = Counter()
    word_counter.update(topic_words_a)
    word_counter.update(topic_words_b)
    diff = (np.asarray(list(word_counter.values())) == 1).sum() / (len(topic_words_a) + len(topic_words_b))
    return diff


def get_topics_difference(topic_pair_groups):
    diff_list = list()
    for group in tqdm(topic_pair_groups):
        layer_diff = list()
        diff = compute_diff_topic_pair(group[0], group[1])
        layer_diff.append(diff)
        diff_list.append(np.mean(layer_diff))

    return diff_list

def get_Sibling_TD(sibling_groups):
    sibling_TD = list()
    for group in sibling_groups:
        layer_sibling_TD = list()
        for sibling_topics in group:
            TD = compute_TD(sibling_topics)
            layer_sibling_TD.append(TD)
        sibling_TD.append(np.mean(layer_sibling_TD))
    return sibling_TD

#### HPAM
def assign_subtopics_to_supertopics(model):
    """
    Asigna cada subtópico a un supertópico basado en la máxima probabilidad condicional.

    Args:
        model: Modelo HPAM entrenado.

    Returns:
        hierarchy: Diccionario donde las claves son supertópicos (nivel 1) y los valores son listas
                   de subtópicos (nivel 2) que cuelgan de ellos.
    """
    hierarchy = {k1: [] for k1 in range(model.k1)}  # Inicializa el diccionario para supertópicos

    for k2 in range(model.k2):  # Iterar sobre los subtópicos
        max_prob = 0
        assigned_super_topic = None
        
        for k1 in range(model.k1):  # Iterar sobre los supertópicos
            # Obtener la probabilidad del subtópico dado el supertópico
            prob = model.get_sub_topic_dist(k1)[k2]
            if prob > max_prob:  # Asignar al supertópico con mayor probabilidad
                max_prob = prob
                assigned_super_topic = k1
        
        if assigned_super_topic is not None:
            hierarchy[assigned_super_topic].append(k2)
    
    return hierarchy

In [4]:
def mallet_corpus_to_df(corpusFile: pathlib.Path):
    """Converts a Mallet corpus file (i.e., file required for the Mallet import command) to a pandas DataFrame

    Parameters
    ----------
    corpusFile: pathlib.Path
        Path to the Mallet corpus file

    Returns
    -------
    :   pandas.DataFrame
        DataFrame with the corpus
    """

    corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
        corpusFile, encoding="utf-8").readlines()]
    indexes = [line.rsplit(' 0 ')[0].strip() for line in open(
        corpusFile, encoding="utf-8").readlines()]
    corpus_dict = {
        'id': indexes,
        'text': corpus
    }
    return pd.DataFrame(corpus_dict)

In [5]:
def get_root_models(path_models:pathlib.Path):
    dfs = []
    for entry in path_models.iterdir():
        # check if it is a root model
        if entry.name.startswith("htm"):#root
            
            # Path to the root model
            path = entry

            # Thr and exp_tpc do not apply for the root model
            thr = -1
            exp_tpc = -1

            # Experiment iteration
            iter_ = -1
            
            # tr_topics
            tr_topics = int(path.name.split("_")[1])

            # Size of the topics
            alphas = np.load(path.joinpath('TMmodel/alphas.npy')).tolist()
            alphas = list(map(lambda x: x * 100, alphas))

            # Coherences (CV and NPMI)
            cohrs_cv = np.load(path.joinpath('TMmodel/c_v_ref_coherence.npy')).tolist()
            cohrs_npmi = np.load(path.joinpath('TMmodel/c_npmi_ref_coherence.npy')).tolist()

            # Topics' entropies
            entropies = np.load(path.joinpath('TMmodel/topic_entropy.npy')).tolist()

            # TD
            td = np.load(path.joinpath('TMmodel/td.npy'))

            # IRBO 
            rbo = np.load(path.joinpath('TMmodel/rbo.npy'))
            
            # tpc_Desc
            with path.joinpath('TMmodel/tpc_descriptions.txt').open('r', encoding='utf8') as fin:
                tpc_descriptions = [el.strip() for el in fin.readlines()]

            # Ids of the topics
            tpc_ids = np.arange(0,len(alphas),1)

            # Corpus size
            if path.joinpath('corpus.txt').is_file():
                corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
                    path.joinpath('corpus.txt'), encoding="utf-8").readlines()]
                size = len(corpus)
            elif path.joinpath('corpus.parquet').is_dir():
                dfc = pd.read_parquet(path.joinpath('corpus.parquet'))
                size = len(dfc)

            # Create dataframe for the root model
            root_tpc_df = pd.DataFrame(
                {'iter': [iter_] * len(alphas),
                 'path': [path] * len(alphas),
                 'cohrs_cv': cohrs_cv,
                 'cohrs_npmi': cohrs_npmi,
                 'entropies': entropies,
                 'td': [td] * len(alphas),
                 'rbo': [rbo] * len(alphas),
                 'alphas': alphas,
                 'tpc_ids': tpc_ids,
                 'thr': [thr] * len(alphas),
                 'exp_tpc': [exp_tpc] * len(alphas),
                 'tr_tpcs': [tr_topics] * len(alphas),
                 'tpc_descriptions': tpc_descriptions,
                 'father': [iter_] * len(alphas),
                'father_tpc_descriptions':[iter_] * len(alphas),
                })

            # Get root size
            if root_tpc_df.iloc[0].path.joinpath('corpus.txt').is_file():
                corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
                            root_tpc_df.iloc[0].path.joinpath('corpus.txt'), encoding="utf-8").readlines()]
                root_size = len(corpus)
            elif root_tpc_df.iloc[0].path.joinpath('corpus.parquet').is_dir() or root_tpc_df.iloc[0].path.joinpath('corpus.parquet').is_file():
                dfc = pd.read_parquet(root_tpc_df.iloc[0].path.joinpath('corpus.parquet'))
                root_size = len(dfc) 

            root_tpc_df["root_size"] = [root_size] * len(alphas)

            # Append to the list of dataframes to concatenate them
            dfs.append(root_tpc_df)
    df = pd.concat(dfs)
    df = df.sort_values(by=['iter'])
    return df


def get_submodules(df:pd.DataFrame):
    # Iter over each root model (according to its corresponding iteration, iter)
    concats = [df]
    not_finished = []
    paths_root = df.path.unique()
    print(paths_root)
    for path_root in paths_root:
        print(path_root)
        root_size = df[df.path == path_root].iloc[0].root_size
        
        for entry in path_root.iterdir():
            if entry.joinpath('TMmodel/topic_coherence.npy').is_file() and not entry.as_posix().endswith("old"):
                try:
                    if "submodel_htm-ws" in entry.as_posix():
                        thr_ = 0
                        size = 0
                    else:
                        thr_ = float(entry.as_posix().split("thr_")[1].split("_")[0])
                        
                        if entry.joinpath('corpus.txt').is_file():
                            corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
                                        entry.joinpath('corpus.txt'), encoding="utf-8").readlines()]
                            size = len(corpus)
                        elif entry.joinpath('corpus.parquet').is_dir():
                            dfc = pd.read_parquet(entry.joinpath('corpus.parquet'))
                            size = len(dfc)
                        size = size * 100 / root_size
                        
                    # get iter
                    iter_ = int(entry.name.split("_iter_")[1].split("_")[0])
                    
                    # get topic from which the submodel is generated
                    exp_tpc = int(entry.as_posix().split("from_topic_")[1].split("_")[0])   
                    aux = df[(df.path == path_root)]
                    father_tpc_desc = aux[aux.tpc_ids == exp_tpc].tpc_descriptions.values.tolist()[0]
    
                    # Size of the topics
                    alphas = np.load(entry.joinpath('TMmodel/alphas.npy')).tolist()
                    alphas = list(map(lambda x: x * 100, alphas))
    
                    # Alphas submodel is the mean of the cohr of its topics
                    alpha = np.mean(alphas)
    
                    # Coheerences (CV and NPMI)
                    cohrs_cv = np.load(entry.joinpath('TMmodel/c_v_ref_coherence.npy')).tolist()
                    cohrs_npmi = np.load(entry.joinpath('TMmodel/c_npmi_ref_coherence.npy'), allow_pickle=True).tolist()
                    if cohrs_npmi is None:
                        cohrs_npmi = [0]*len(cohrs_cv)
    
                    # cohr submodel is the mean of the cohr of its topics
                    cohr_cv = np.mean(cohrs_cv)
                    cohr_npmi = np.mean(cohrs_npmi)
    
                    # TD
                    td = np.load(entry.joinpath('TMmodel/td.npy'))
    
                    # IRBO 
                    rbo = np.load(entry.joinpath('TMmodel/rbo.npy'))
    
                    # Topics' entropies
                    entropy = np.mean(np.load(entry.joinpath('TMmodel/topic_entropy.npy')).tolist())
    
                    tr_tpcs = int(entry.as_posix().split("train_with_")[1].split("_")[0])
    
                    # tpc_Desc
                    with entry.joinpath('TMmodel/tpc_descriptions.txt').open('r', encoding='utf8') as fin:
                        tpc_descriptions = [el.strip() for el in fin.readlines()]
                    
                    root_tpc_df = pd.DataFrame(
                    {'iter': [iter_],
                     'path': [entry],
                     'cohrs_cv': [cohr_cv],
                     'cohrs_npmi': [cohr_npmi],
                     'entropies': [entropy],
                     'alphas': [alpha],
                     'td': [td],
                     'rbo': [rbo],
                     'tpc_ids': [exp_tpc],
                     'thr': [thr_],
                     'exp_tpc': [exp_tpc],
                     'size': [size],
                     'tr_tpcs': [tr_tpcs],
                     'tpc_descriptions': [tpc_descriptions],
                     'father': [path_root.as_posix()],
                     'father_tpc_descriptions':[father_tpc_desc] 
                    })
                                    
                    concats.append(root_tpc_df)
                except Exception as e:
                    not_finished.append(entry)
    
            else:
                not_finished.append(entry)
    df = pd.concat(concats)
    
    return df

def get_df_all_models(path_models:pathlib.Path):
    df_root = get_root_models(path_models)
    df_all = get_submodules(df_root)
    return df_root, df_all

## Baselines

### Hyperminer and Traco baselines (directly load from TopMost)

In [6]:
print("--- Calculating topmost.... ")
algorithms = ["hyperminer", "traco"]

results_df = []
for algo in algorithms:
    for dtset in datasets:
        search_key = f"{path_models}/{algo}/{dtset}/{dtset}_trained_model_iter_*.pkl"
        results_files = glob.glob(search_key)
        for i, file in enumerate(results_files):
            with open(file, "rb") as f:
                loaded_data = pickle.load(f)
                results = loaded_data["hierarchy_quality_results"]
                results["dataset"] = dtset
                results["algorithm"] = algo
                results["iter"] = i
                if isinstance(results, dict):
                    results = [results] 
                results_df.append(pd.DataFrame(results))
results_df = pd.concat(results_df, ignore_index=True)

--- Calculating topmost.... 


In [25]:
print(results_df)

         PCC       PCD  Sibling_TD      PnCD dataset   algorithm  iter
0  -0.219832  0.676000    0.730667  0.981481  cordis  hyperminer     0
1  -0.190614  0.690667    0.742667  0.985481  cordis  hyperminer     1
2  -0.163313  0.694667    0.712000  0.979852  cordis  hyperminer     2
3  -0.376846  0.560000    0.935000  0.994236  cancer  hyperminer     0
4  -0.421895  0.531667    0.930000  0.995069  cancer  hyperminer     1
5  -0.395936  0.548333    0.896667  0.993264  cancer  hyperminer     2
6  -0.450961  0.510000    0.890000  0.991875      ai  hyperminer     0
7  -0.416273  0.535000    0.903333  0.993333      ai  hyperminer     1
8  -0.541641  0.451667    0.861667  0.991389      ai  hyperminer     2
9  -0.280082  0.978667    0.998667  0.999556  cordis       traco     0
10 -0.274823  0.978667    0.998667  0.999852  cordis       traco     1
11 -0.289854  0.980000    0.998667  0.999556  cordis       traco     2
12 -0.256897  0.913333    0.998333  0.999861  cancer       traco     0
13 -0.

In [8]:
results_df.to_csv("traco_hyper.csv")

### hLDA and HDP baselines

In [17]:
print("--- Calculating tomo.... ")

results_df_tomo = []
for algo in ["hpam", "hlda"]: #["hdp", "hpam"]
    for dtset in datasets: 
        print(f"Extracting for {dtset}...")

        ### Load corpus information
        corpus_path_ = "/export/usuarios_ml4ds/lbartolome/Repos/my_repos/UserInLoopHTM/data/tomo_corpus_objects/XXXX.pkl"
        # Cancer, CORDIS, S2CS-AI

        if dtset == "cordis":
            corpus_path = corpus_path_.replace("XXXX", "CORDIS")
        elif dtset == "cancer":
            corpus_path = corpus_path_.replace("XXXX", "Cancer")
        else:
            corpus_path = corpus_path_.replace("XXXX", "S2CS-AI") 

        with open(corpus_path, 'rb') as f:
            corpus = pickle.load(f)

        if dtset == "cordis":
            # reduce by a factor of 10
            corpus = corpus[:len(corpus)//10]
        else:
            # reduce by a factor of 100
            corpus = corpus[:len(corpus)//100]

        print(f"CORPUS: {corpus[0]}")
        
        corpus_join = [" ".join(doc) for doc in corpus]
        vectorizer = CountVectorizer(tokenizer=lambda x: x.split())
        bow = vectorizer.fit_transform(corpus_join).toarray()
        vocab = vectorizer.vocabulary_

        print(f"Vocabulary and BoW calculated")

        ### Load model information
        search_key = f"{path_models}/{algo}/{dtset}/run*.bin"
        results_files = glob.glob(search_key)
        for iter_, file in enumerate(results_files):
            if algo == "hlda":
                mdl = tp.HLDAModel.load(file)
    
                # get topic words
                betas = np.array([mdl.get_topic_word_dist(el) for el in range(mdl.k)])
                tpc_descs = []
                for i in range(mdl.k):
                    words = [mdl.vocabs[idx2] for idx2 in np.argsort(betas[i])[::-1][0:n_words]]
                    tpc_descs.append((i, words))
    
                # get groups
                root_desc = tpc_descs[0][1]
                tpc_descs = [el[1] for el in tpc_descs if el[0] !=0]
    
                PC_pair_groups = [[root_desc, el] for el in tpc_descs]
    
                # sibling_groups: length == num_layers
                # each element in the list is a group of sibling topics at a layer.
                sibling_groups = [# conjunto de all sibling topics (one per layer)
                    [
                        [" ".join(el) for el in tpc_descs]
                    ]
                ]
            else:

                mdl = tp.HPAModel.load(file)
                
                supertopics_lst = []  # Lista para los tópicos de nivel 1
                subtopics_lst = []    # Lista para los tópicos de nivel 2
                
                # Extraer super tópicos (nivel 1)
                for k1 in range(mdl.k1):  # Iterar sobre los tópicos de nivel 1
                    words = [word for word, _ in mdl.get_topic_words(k1, top_n=n_words)]
                    supertopics_lst.append(words)
                
                # Extraer sub tópicos (nivel 2)
                for k2 in range(mdl.k1, mdl.k1 + mdl.k2):  # Iterar sobre los tópicos de nivel 2
                    words = [word for word, _ in mdl.get_topic_words(k2, top_n=n_words)]
                    subtopics_lst.append(words)
                    
                # Asignar subtópicos a supertópicos
                hierarchy = assign_subtopics_to_supertopics(mdl)
                
                # Imprimir la jerarquía
                #print("Jerarquía de supertópicos y subtópicos:")
                #for super_topic, sub_topics in hierarchy.items():
                #    print(f"Super tópico {super_topic}: Sub tópicos {sub_topics}")
                
                # PC_pair_groups: pares de supertópicos y sus subtópicos
                PC_pair_groups = []
                for super_topic, sub_topic_ids in hierarchy.items():
                    for sub_topic in sub_topic_ids:
                        PC_pair_groups.append([supertopics_lst[super_topic], subtopics_lst[sub_topic]])
                
                # sibling_topics: combinaciones de palabras de supertópicos y subtópicos
                sibling_topics = [
                    [[" ".join(supertopics_lst[a]), " ".join(supertopics_lst[b])] for a, b in combinations(range(len(supertopics_lst)), 2)],  # Nivel 1
                    [[" ".join(subtopics_lst[a]), " ".join(subtopics_lst[b])] for a, b in combinations(range(len(subtopics_lst)), 2)]        # Nivel 2
                ]
                
                # P_noC_pair_groups: supertópicos con subtópicos que no son directamente suyos
                P_noC_pair_groups = []
                for super_topic, sub_topic_ids in hierarchy.items():
                    non_child_subtopics = [
                        sub_topic for sub_topic in range(len(subtopics_lst))
                        if sub_topic not in sub_topic_ids
                    ]
                    for sub_topic in non_child_subtopics:
                        P_noC_pair_groups.append([supertopics_lst[super_topic], subtopics_lst[sub_topic]])
                                

            results = defaultdict()
            results["dataset"] = dtset
            results["algorithm"] = algo
            results["iter"] = iter_
            results["PCC"] = np.mean(get_CLNPMI(PC_pair_groups, bow, vocab)) # Parent and Child topic Coherence (PCC)
            #print(f"PCC : {results['PCC']}")
            results["PCD"] = np.mean(get_topics_difference(PC_pair_groups)) # Parent and Child topic Diversity (PCD)
            #print(f"PCD : {results['PCD']}")
            results["Sibling_TD"] = np.mean(get_Sibling_TD(sibling_topics)) # Sibling Topic Diversity (SD)
            #print(f"Sibling_TD : {results['Sibling_TD']}")
            results["PnCD"] = np.mean(get_topics_difference(P_noC_pair_groups)) if algo == "hpam" else 0
            #print(f"PnCD : {results['PnCD']}")

            results_df_tomo.append(pd.DataFrame([results]))
            
results_df_tomo = pd.concat(results_df_tomo, ignore_index=True)
print(results_df_tomo)

--- Calculating tomo.... 
Extracting for cordis...
CORPUS: ['methods', 'processes', 'embedded_systems', 'embed', 'critical', 'variety', 'couple', 'constraint', 'enabler', 'interoperable', 'absence', 'recognize', 'limit', 'factor', 'term', 'com', 'enabler', 'significant', 'conclusive', 'particular', 'formalization', 'viewpoint', 'multi_criterion', 'component', 'space_exploration', 'comprise', 'multi_criterion', 'addition', 'intend', 'deploy', 'customizable', 'possible', 'available', 'significant', 'step', 'term', 'help', 'standardization', 'rely', 'point_view', 'rely', 'maturity', 'input', 'trl', 'maturity', 'trl', 'aspect', 'aspect']
Vocabulary and BoW calculated











00%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 65889.47it/s]

Extracting for cancer...
CORPUS: ['ribonucleic_acid', 'meet', 'train', 'jena', 'participant', 'join', 'young', 'scientist', 'meeting', 'german_society', 'biological', 'entitle', 'ribonucleic_acid', 'excellent', 'speaker', 'world', 'graduate_student', 'young', 'leader', 'enjoy', 'meeting', 'familiar', 'atmosphere', 'exchange', 'inspire', 'new', 'vibrant', 'scientific', 'discussion', 'fascinating', 'exciting', 'non_coding', 'ribonucleic_acid', 'microrna', 'pirna', 'long', 'non_coding', 'ribonucleic_acid', 'diabetes', 'neurodegenerative']
Vocabulary and BoW calculated











00%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3800/3800 [00:00<00:00, 53309.10it/s]

Extracting for ai...
CORPUS: ['remark', 'speech_recognition', 'tea', 'sources', 'sensor', 'signal', 'study', 'capacity', 'deep_learning', 'distinguish', 'tea', 'source', 'aroma', 'aroma', 'tea', 'source', 'contain', 'sensor', 'response', 'measure', 'gas', 'sense', 'mass', 'sensitive', 'chemical', 'sensor', 'evaluate', 'speech_recognition', 'deep_learning', 'aroma', 'speech_recognition', 'experiment', 'frequency', 'analysis', 'continuous', 'wavelet_transform', 'morlet', 'mother_wavelet', 'extraction', 'feature', 'sensor', 'signal', 'deep_learning', 'achieve', 'speech_recognition', 'tea', 'source', 'gas', 'indoor', 'air', 'speech_recognition', 'deep_learning', 'obtain', 'pattern', 'speech_recognition', 'naive_bayes', 'random_forest', 'experimental', 'demonstrate', 'effectiveness', 'deep_learning']
Vocabulary and BoW calculated











00%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3800/3800 [00:00<00:00, 67266.91it/s]

Extracting for cordis...
CORPUS: ['methods', 'processes', 'embedded_systems', 'embed', 'critical', 'variety', 'couple', 'constraint', 'enabler', 'interoperable', 'absence', 'recognize', 'limit', 'factor', 'term', 'com', 'enabler', 'significant', 'conclusive', 'particular', 'formalization', 'viewpoint', 'multi_criterion', 'component', 'space_exploration', 'comprise', 'multi_criterion', 'addition', 'intend', 'deploy', 'customizable', 'possible', 'available', 'significant', 'step', 'term', 'help', 'standardization', 'rely', 'point_view', 'rely', 'maturity', 'input', 'trl', 'maturity', 'trl', 'aspect', 'aspect']
Vocabulary and BoW calculated








00%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:00<00:00, 58434.58it/s]

Extracting for cancer...
CORPUS: ['ribonucleic_acid', 'meet', 'train', 'jena', 'participant', 'join', 'young', 'scientist', 'meeting', 'german_society', 'biological', 'entitle', 'ribonucleic_acid', 'excellent', 'speaker', 'world', 'graduate_student', 'young', 'leader', 'enjoy', 'meeting', 'familiar', 'atmosphere', 'exchange', 'inspire', 'new', 'vibrant', 'scientific', 'discussion', 'fascinating', 'exciting', 'non_coding', 'ribonucleic_acid', 'microrna', 'pirna', 'long', 'non_coding', 'ribonucleic_acid', 'diabetes', 'neurodegenerative']
Vocabulary and BoW calculated








00%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47/47 [00:00<00:00, 55234.60it/s]

Extracting for ai...
CORPUS: ['remark', 'speech_recognition', 'tea', 'sources', 'sensor', 'signal', 'study', 'capacity', 'deep_learning', 'distinguish', 'tea', 'source', 'aroma', 'aroma', 'tea', 'source', 'contain', 'sensor', 'response', 'measure', 'gas', 'sense', 'mass', 'sensitive', 'chemical', 'sensor', 'evaluate', 'speech_recognition', 'deep_learning', 'aroma', 'speech_recognition', 'experiment', 'frequency', 'analysis', 'continuous', 'wavelet_transform', 'morlet', 'mother_wavelet', 'extraction', 'feature', 'sensor', 'signal', 'deep_learning', 'achieve', 'speech_recognition', 'tea', 'source', 'gas', 'indoor', 'air', 'speech_recognition', 'deep_learning', 'obtain', 'pattern', 'speech_recognition', 'naive_bayes', 'random_forest', 'experimental', 'demonstrate', 'effectiveness', 'deep_learning']
Vocabulary and BoW calculated








00%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 28428.55it/s]

   dataset algorithm  iter       PCC       PCD  Sibling_TD      PnCD
0   cordis      hpam     0 -0.092341  0.964444    0.955414  0.965778
1   cordis      hpam     1 -0.051832  0.941111    0.926507  0.964889
2   cordis      hpam     2 -0.087348  0.963333    0.920876  0.962667
3   cancer      hpam     0 -0.246210  0.995667    0.977453  0.994772
4   cancer      hpam     1 -0.273212  0.998000    0.980479  0.996632
5   cancer      hpam     2 -0.271764  0.996667    0.981429  0.996614
6       ai      hpam     0 -0.160386  0.991000    0.968773  0.991263
7       ai      hpam     1 -0.164539  0.989333    0.957675  0.991263
8       ai      hpam     2 -0.181021  0.992667    0.982173  0.992877
9   cordis      hlda     0 -0.072132  0.946667    0.982173  0.000000
10  cordis      hlda     1 -0.061312  0.946667    0.982173  0.000000
11  cordis      hlda     2 -0.066540  0.958730    0.982173  0.000000
12  cancer      hlda     0 -0.206344  0.957447    0.982173  0.000000
13  cancer      hlda     1 -0.1835

In [26]:
results_df_tomo #.to_csv("tomo.csv")

Unnamed: 0,dataset,algorithm,iter,PCC,PCD,Sibling_TD,PnCD
0,cordis,hpam,0,-0.092341,0.964444,0.955414,0.965778
1,cordis,hpam,1,-0.051832,0.941111,0.926507,0.964889
2,cordis,hpam,2,-0.087348,0.963333,0.920876,0.962667
3,cancer,hpam,0,-0.24621,0.995667,0.977453,0.994772
4,cancer,hpam,1,-0.273212,0.998,0.980479,0.996632
5,cancer,hpam,2,-0.271764,0.996667,0.981429,0.996614
6,ai,hpam,0,-0.160386,0.991,0.968773,0.991263
7,ai,hpam,1,-0.164539,0.989333,0.957675,0.991263
8,ai,hpam,2,-0.181021,0.992667,0.982173,0.992877
9,cordis,hlda,0,-0.072132,0.946667,0.982173,0.0


### HTM-WS / HTM-DS

In [6]:
print("--- Calculating ours.... ")
algorithms = ["ws", "ds"]
PATH_MODELS = "/export/usuarios_ml4ds/lbartolome/Datasets/XXXX/htm_variability_models"
PATH_MODELS_PREPROC = "/export/usuarios_ml4ds/lbartolome/Datasets/XXXX/models_preproc/iter_0"
PATH_CORPUS = "/export/usuarios_ml4ds/lbartolome/Datasets/XXXX/models_preproc/iter_0/corpus.txt"

results_df_ours = []

for dtset in ["ai"]:

    print(f"Executing for dtset: {dtset}")

    ##### GET CORPUS, VOCAB AND BOW #####
    if dtset == "cordis":
        path_corpus = PATH_CORPUS.replace("XXXX", "CORDIS")
        path_models = pathlib.Path(PATH_MODELS.replace("XXXX", "CORDIS"))
        model_path_preproc = PATH_MODELS_PREPROC.replace("XXXX", "CORDIS")
        nr_topics = "6"
    elif dtset == "cancer":
        path_corpus = PATH_CORPUS.replace("XXXX", "Cancer")
        path_models = pathlib.Path(PATH_MODELS.replace("XXXX", "Cancer"))
        model_path_preproc = PATH_MODELS_PREPROC.replace("XXXX", "Cancer")
        nr_topics = "20"
    else:
        path_corpus = PATH_CORPUS.replace("XXXX", "S2CS-AI") 
        path_models = pathlib.Path(PATH_MODELS.replace("XXXX", "S2CS-AI"))
        model_path_preproc = PATH_MODELS_PREPROC.replace("XXXX", "S2CS-AI")
        nr_topics = "20"
        
    df = mallet_corpus_to_df(path_corpus)
    
    vocab_w2id = {}
    with (pathlib.Path(model_path_preproc)/'vocabulary.txt').open('r', encoding='utf8') as fin:
        for i, line in enumerate(fin):
            wd = line.strip()
            vocab_w2id[wd] = i
    
    vectorizer = CountVectorizer(vocabulary=vocab_w2id.keys(), tokenizer=lambda x: x.split())
    bow = vectorizer.fit_transform(df.text.values.tolist())
    bow = bow#.toarray()

    ##### LOAD MODELS #####
    df_root, df_all = get_df_all_models(path_models)
    father = [el.as_posix() for el in list(df_root.path.unique()) if nr_topics in el.as_posix()][0]
    print(f"This is the father: {father}")

    for algo in algorithms:
        for iter_ in [0,1,2]:
            PC_pair_groups = []
            P_noC_pair_groups = []
            sibling_groups = []
            
            filtered_df = df_all[
                (df_all.father == father) &
                (df_all.tr_tpcs == 10) &
                (df_all.iter == iter_) & # Cambia iter por iter_ si este es el nombre correcto
                (df_all.thr.isin([0, 0.6]))  # Descomentar si se requiere
            ]
    
            # append the descriptions of topics at the first level
            sibling_groups.append([" ".join(el.split(", ")) for el in filtered_df.iloc[0].tpc_descriptions])
        
            second_level_siblings = []
            for _, submodel in filtered_df.iterrows():  # Iterate over filtered submodels
                if algo in submodel.path.as_posix():
                    for submodel_topic in submodel.tpc_descriptions:
                        # Add Parent-Child (PC) pair
                        pair_PC = [submodel.father_tpc_descriptions.split(", "), submodel_topic.split(", ")]
                        PC_pair_groups.append(pair_PC)
                    
                    second_level_siblings += [" ".join(el.split(", ")) for el in submodel.tpc_descriptions]
                    # Add Parent non-Child (P_noC) pair
                    for _, submodel_j in filtered_df.iterrows():
                        if submodel.exp_tpc != submodel_j.exp_tpc:  # Ensure submodel_j is different from submodel
                            if algo in submodel_j.path.as_posix():
                                for submodel_topic in submodel.tpc_descriptions:
                                    pair_P_noC = [submodel_j.father_tpc_descriptions.split(", "), submodel_topic.split(", ")]
                                    P_noC_pair_groups.append(pair_P_noC)
                                    
            sibling_groups.append(second_level_siblings)
            sibling_groups = [sibling_groups]
            
            results = defaultdict()
            results["dataset"] = dtset
            results["algorithm"] = algo
            results["iter"] = iter_
            results["PCC"] = np.mean(get_CLNPMI(PC_pair_groups, bow, vocab_w2id))
            results["PCD"] = np.mean(get_topics_difference(PC_pair_groups))
            results["Sibling_TD"] = np.mean(get_Sibling_TD(sibling_groups))
            results["PnCD"] =np.mean(get_topics_difference(P_noC_pair_groups))
    
            results_df_ours.append(pd.DataFrame([results]))
            
results_df_ours = pd.concat(results_df_ours, ignore_index=True)

--- Calculating ours.... 
Executing for dtset: ai




[PosixPath('/export/usuarios_ml4ds/lbartolome/Datasets/S2CS-AI/htm_variability_models/htm_20_tpcs_20230929')]
/export/usuarios_ml4ds/lbartolome/Datasets/S2CS-AI/htm_variability_models/htm_20_tpcs_20230929
This is the father: /export/usuarios_ml4ds/lbartolome/Datasets/S2CS-AI/htm_variability_models/htm_20_tpcs_20230929




















00%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15200/15200 [00:00<00:00, 63919.93it/s]

In [9]:
results_df_ours.to_csv("ours_ai_from_copy.csv")

In [8]:
results_df_ours

Unnamed: 0,dataset,algorithm,iter,PCC,PCD,Sibling_TD,PnCD
0,ai,ws,0,0.004027,0.805667,0.670333,0.989702
1,ai,ws,1,0.014571,0.811333,0.614833,0.990053
2,ai,ws,2,0.013211,0.810667,0.676333,0.989544
3,ai,ds,0,0.000232,0.806498,0.438608,0.98066
4,ai,ds,1,0.003045,0.807619,0.424603,0.98266
5,ai,ds,2,0.00619,0.81,0.491917,0.98357
