In [1]:
import numpy as np
import pandas as pd
from functools import reduce
import pathlib
import matplotlib.pyplot as plt
import gzip
import scipy.sparse as sparse
import random

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['xtick.labelsize'] = 24
plt.rcParams['ytick.labelsize'] = 24
plt.rcParams['font.size'] = 22
plt.rcParams['axes.titlesize'] = 24
plt.rcParams['axes.labelsize'] = 24
plt.rcParams['legend.fontsize'] = 24
plt.rcParams['lines.markersize'] = 13
plt.style.use('seaborn-white')
plt.rcParams['lines.linewidth'] = 4

In [2]:
pd.set_option('display.max_colwidth', None)

In [76]:
path_model = \
    pathlib.Path("/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_htm_ctm")

In [77]:
thetas = sparse.load_npz(path_model.joinpath("root_model_0_20230102").joinpath('TMmodel/thetas.npz'))

In [27]:
# ndocs x ntopics: (45837, 10)
thetas_dense = thetas.todense()
thetas_dense.shape

(45837, 10)

In [80]:
smaller_than_value = thetas_dense[thetas_dense < 0.003]
smaller_than_value

matrix([], shape=(1, 0), dtype=float64)

In [6]:
def sum_up_to(vector, max_sum):
    x = np.array(list(map(np.int_, vector*max_sum))).ravel()
    pos_idx = list(np.where(x != 0)[0])
    while np.sum(x) != max_sum:
        idx = random.choice(pos_idx)
        x[idx] += 1
    return x

In [7]:
def get_str_rpr(vector, max_sum):
    vector = sum_up_to(vector, max_sum)
    rpr = ""
    for idx,val in enumerate(vector):
        rpr += "t" + str(idx) + "|" + str(val) + " "
    rpr = rpr.rstrip() 
    return rpr

In [72]:
path_corpus = path_model.joinpath("root_model_0_20230102").joinpath("modelFiles/corpus.txt")
zs_file = path_model.joinpath("root_model_0_20230102").joinpath("w_assign.txt")

In [73]:
# Original corpus
path_corpus_parquet = path_model.joinpath("root_model_0_20230102").joinpath("corpus.parquet")
corpus = pd.read_parquet(path_corpus_parquet)
len(corpus)

61117

In [74]:
# Training corpus
corpus_txt = [line.rsplit(' 0 ')[1].strip() for line in open(
    path_corpus, encoding="utf-8").readlines()]
print(len(corpus_txt))
doc_tpc_rpr = [get_str_rpr(thetas_dense[row,:], 1000) for row in range(len(thetas_dense))]
zs = [line.split("\n")[0] for line in open(zs_file, encoding="utf-8").readlines()]

45837


In [100]:
# 2 level subcorpus from training corpus
submodel_path = path_model.joinpath("root_model_0_20230102").joinpath("submodel_htm-ws_from_topic_0_train_with_10_20230102")
path_subcorpus = submodel_path.joinpath("corpus.parquet")
subcorpus = pd.read_parquet(path_subcorpus)
subcorpus_txt = subcorpus.bow_text.tolist()
len(subcorpus)

45837

In [101]:
df = pd.DataFrame(list(zip(corpus_txt,doc_tpc_rpr, zs, subcorpus_txt)),
               columns =['lemmas', 'doc-tpc', "zs", "sublemmas"])
df

Unnamed: 0,lemmas,doc-tpc,zs,sublemmas
0,discovery pleasure change italy long west reproduction narrative historiography argument cease tie marriage second_half form sexuality oriented experimentation fulfilment female experience start serve woman integrate theory unprecedented change industrialise western_europe post explore transformation italy reconstruction shift public morality change west country sexuality country post nature valuable reference experience british representation opportunity expand professional competence term skill contact positive career,t0|47 t1|29 t2|38 t3|43 t4|18 t5|45 t6|43 t7|49 t8|37 t9|651,6 6 2 6 6 6 6 6 6 6 6 6 4 6 6 6 6 3 6 6 7 6 4 6 6 9 6 5 6 6 6 7 6 6 6 6 6 6 6 6 8 8 3 6 2 6 9 8 6 6 9 6 6 6 3 3 6 8 6,
1,spectroscopic computing investigation chromium binding pure mineral aquifer soil parallel objective transfer expand expertise maria host_institution national technical university_athens objective frequent environment mobile polluted groundwater surface complexation modeling rely recent_year spectroscopic theory investigation bind metal contaminant mineral surface limited utilize fourier_transform infrared_spectroscopy extended x_ray absorption fine spectroscopy density_functional theory calculation combination sorption complexation surface γ alumina condition ph surface loading competitive anion generate data future construct data_drive surface complexation objective pursue add life coordinate chromium river remediation technologies measures guide risk_assessment remediation greece micro x_ray fluorescence absorption edge diffraction bulk determine bind aquifer refine transport treatment build expertise participate lab sanitary engineering laboratory laboratory metallurgy spectroscopic_technique add element environment portfolio collaboration intend academic_career,t0|47 t1|72 t2|55 t3|49 t4|37 t5|81 t6|158 t7|95 t8|35 t9|371,6 4 9 9 1 9 7 9 7 2 2 2 2 1 2 9 1 9 2 1 9 0 2 9 2 4 7 9 1 2 1 2 8 2 4 4 9 7 1 1 6 8 6 9 4 2 1 4 7 2 5 2 2 2 6 0 7 9 2 4 6 9 4 1 9 8 7 1 4 1 7 0 1 0 0 6 0 2 4 6 2 9 2 8 3 1 1 9 1 9 2 2 2 4 5 6 1 2 3 1 7 1 3 1 9 2 1 1 4 0 5 7,environment alumina add coordinate chromium remediation portfolio
2,enterprise_europe targeting innovation implement tailor innovation management service enhance germany sme potential service adapt sme market requirement order innovative product service market key management sme_beneficiary eic_pilot identify match sme owner coach investor order gap financial requirement region southwest germany serve enterprise_europe consortium,t0|24 t1|692 t2|59 t3|35 t4|42 t5|19 t6|41 t7|26 t8|30 t9|32,0 5 2 8 1 0 0 8 0 6 8 0 0 6 0 0 0 0 4 1 0 9 6 6 1 1 0 6 9 9 4 0 9 1 0 8 0 7 6 0 8 0,enterprise_europe innovation management enhance potential service sme market requirement order service identify investor financial region serve consortium
3,network observatories research_infrastructures volcanology construct integrated harmonize support exploit build national pan_european include e_infrastructure volcano harmonization include link stakeholders connection isolate locate situ overcome_fragmentation level include fragmentation main theme building volcano atmosphere interaction surface volcanic risk management networking_activity theme include collaboration network civil_protection agency network atmosphere gas aerosol observation observation subsurface initiation multidisciplinary observation volcano laboratory test_bed joint include production service initialize volcanic_ash transport dispersal eruption integrate modeling pre eruption data complete trans_national facilitate virtual modeling respond eruption offer integrate open wide simplified efficient key multidisciplinary locate lead improve driver good_practice observatory open pathway exploit volcanic geothermal_energy,t0|50 t1|63 t2|57 t3|408 t4|24 t5|79 t6|63 t7|59 t8|147 t9|50,0 0 4 0 5 2 2 0 0 0 9 0 5 7 7 0 0 2 6 5 2 0 4 4 4 0 9 5 4 8 2 4 8 3 0 9 0 0 2 4 8 0 7 0 7 7 9 5 4 4 4 0 0 6 4 4 4 4 2 0 8 3 2 7 9 4 8 4 9 8 4 9 7 2 0 2 0 0 8 4 8 4 7 7 4 4 4 0 0 3 0 8 2 7 2 0 4,network observatories volcanology support exploit build pan_european harmonization include locate include volcanic management networking_activity network agency initiation multidisciplinary production virtual respond eruption lead improve good_practice volcanic
4,dynamical large_number partial_differential physics infinite_dimensional hamiltonian dynamical class schrödinger_equation wave hydrodynamic numerous derive pose fundamental_question inspire entire field investigation main invariant space hamiltonian start stationary finite_dimensional dynamical main field link fact interesting outer parameter resonant derivative non_linearity fundamental open field main_goal semi linear quasi linear normal form integrable integrable hydrodynamic search water wave periodic nonlinear_pde resonant minimal restriction non_linearity naples idea innovative combine kam method normal form theory combinatorial method,t0|31 t1|799 t2|40 t3|17 t4|19 t5|18 t6|12 t7|18 t8|18 t9|28,7 3 7 7 7 8 7 7 5 0 7 3 7 7 7 2 4 7 7 6 7 2 5 7 7 7 7 7 7 7 4 7 7 2 7 7 1 2 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 7 7 7 7 7 7 7 1 1 7 7 7 6 7 0 7,wave combinatorial
...,...,...,...,...
7006,rational_design synthesis experimental statistics promising_alternative combine microporous zeolite metal_organic high porosity diversity transition_metal center tailor linker reside nanoporous mof network corner unit transition_metal linker unit molecule functionalise manifold structural analogy zeolite offer interesting promising feature porous_material possibility tailor specific zeolite cavity window exceptional chemistry stability organic_solvent water aqueous alkaline mof discovery promising porous_material specific adsorption happen trial_error rational_design molecule simulation outstanding prediction performance select optimal objective identify optimal simulation adsorption performance synthesis_characterisation pre select computing experimental performance industry simulation experiment target gas_separation mixture gas purification storage capture novelty lie produce fundamental engineering relevance industry,t0|23 t1|61 t2|25 t3|36 t4|18 t5|234 t6|97 t7|50 t8|78 t9|378,1 9 1 8 7 1 1 1 4 5 1 5 1 4 7 9 9 1 1 1 1 5 1 5 7 8 1 9 0 5 1 1 1 8 5 1 1 1 5 5 1 1 1 4 1 1 8 8 1 1 2 9 6 5 1 9 7 1 8 1 1 5 5 1 1 1 5 1 1 1 5 7 1 1 8 1 7 1 8 1 3 5 7 5 5 9 7 1 1 1 2 1 1 9,structural
7007,deliver promise increase rate pose stringent_requirement integrate operator telecom_operator expect roll small cell require urban space backhaul availability network sharing unlock commercial deployment open_access neutral play_key deployment network urban scenario dense small cell require parallel trend pave_way heterogeneous distributed cloud paradigms differ today establish cloud edge_computing cloud architecture push edge network close produce act data posit set player poise trend build edge deployment telecommunication focus smart_city cell power edge network player resource sharing end end virtualization push cloud extreme edge deploy distributed cloud radio platform municipality owner act neutral main build deploy open platform extend centralized cloud extreme edge network demonstration city_barcelona bristol impact varied range actor telecom number vertical utilize city,t0|41 t1|56 t2|39 t3|81 t4|52 t5|89 t6|62 t7|40 t8|450 t9|90,4 4 4 4 4 3 4 4 4 4 9 4 4 5 4 4 4 4 4 4 4 4 4 1 3 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 8 4 4 4 4 4 4 3 4 4 6 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 3 4 4 7 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7 4 4 4,
7008,semantic collaboration management e_learning tourism prepare start commercial_exploitation semantic collaboration software target market management enhance management e_learning tourism open_source platform semantic collaboration universität active adopt sme exploitation prospective target market,t0|30 t1|52 t2|43 t3|171 t4|93 t5|69 t6|102 t7|101 t8|284 t9|55,9 6 5 4 1 6 6 6 3 6 4 0 1 6 2 8 6 6 6 8 9 0 6 5 2 7 8 6 7 6,target collaboration
7009,probe sodium channels painful neuropathic_pain frequent feature peripheral_neuropathy impact patient quality_life increase healthcare individual neuropathy pain prediction susceptible similar risk exposure inability identify high risk individual drug responder patient lack drug act target site strong evidence pathogenicity explain disappointing effect treatment propane issue solid genetic employ innovative technological platform main_objective resolve genetic architecture painful neuropathy stratification high risk neuropathic_pain patient biomarker deepen_understanding underlie druggable_target identify molecule tailor drug responder patient determine effect pre_clinical setting field state_art opening window validate individualized pain medicine innovative impact patient care market target sequencing genetic express nociceptive pathway identification target unbiased exome_sequencing transcriptome assay offer close optimal balance focused achievable goal pain unbiased capture deliver target unique patient population strong preliminary_finding departure point propane expect translate population pain patient term diagnosis_treatment,t0|87 t1|170 t2|121 t3|36 t4|100 t5|133 t6|79 t7|60 t8|35 t9|179,0 8 2 8 5 8 8 8 4 4 8 8 1 2 8 8 8 8 5 8 8 8 8 9 8 8 8 6 8 8 8 5 8 3 8 9 8 8 8 3 8 8 8 6 8 3 8 8 8 8 8 3 0 8 6 8 5 8 9 8 9 3 8 6 8 8 8 8 8 8 5 9 8 8 8 3 8 8 8 8 8 8 8 2 8 8 8 5 3 8 3 8 8 8 8 8 8 8 8 3 8 8 8 3 8 8 8 4 9 9 8 2 8 4 0 8 8 8 8 4 3 8 8 8 8 8,probe architecture strong


In [98]:
keywords = [line[0:-1] for line in open(
    path_model.joinpath("root_model_0_20230102").joinpath('TMmodel/tpc_descriptions.txt'),
    encoding="utf-8").readlines()]
labels = ["Innovation Policy","Nanotechnology", "Climate Change and Ecology", "Molecular Biology", "Information Technology",
          "Sustainable Energy","Miscellaneous", "Physics and Astronomy", "Healthcare and Medicine", "Manufacturing and Industry"]
df_keywords = pd.DataFrame({"Keywords": keywords,
                            "Labels": labels})
df_keywords

Unnamed: 0,Keywords,Labels
0,"innovation, policy, stakeholders, support, country, cooperation, partner, initiative, organisation, implementation, international, national, promote, action, foster",Innovation Policy
1,"synthesis, catalysis, optical, magnetic, light, nanoscale, reaction, electron, photonic, graphene, spin, nanostructure, chemistry, photon, nanoparticle",Nanotechnology
2,"change, specie, climate, climate_change, evolution, population, biodiversity, forest, environment, pattern, brain, ocean, influence, adaptation, ecological",Climate Change and Ecology
3,"cell, protein, regulation, expression, pathway, molecular_mechanism, signal, mouse, receptor, gene_expression, rna, regulator, chromatin, mediate, identify",Molecular Biology
4,"data, software, robot, communication, platform, information, service, cloud, network, security, internet, artificial_intelligence, management, real_time, distributed",Information Technology
5,"power, aircraft, component, performance, battery, engine, fuel, high, manufacture, electric, reduction, grid, wind, efficiency, concept",Sustainable Energy
6,"ims, mb, pose_threat, validated, pos, ais, waterborne, licensed, spare, socio_economical, ast, greater, slicing, microsatellite, obsessive_compulsive",Miscellaneous
7,"mathematic, theory, numerical, observation, geometry, planet, galaxy, star, physics, deep, method, universe, precision, observational, algebra",Physics and Astronomy
8,"patient, treatment, diagnosis, healthcare, drug, therapy, biomarker, risk, care, intervention, vaccine, efficacy, clinical_trial, disease, outcome",Healthcare and Medicine
9,"product, production, waste, market, food, water, industry, produce, company, bio, quality, agriculture, grow, plastic, packaging",Manufacturing and Industry


In [105]:
keywords = [line[0:-1] for line in open(
    path_model.joinpath("root_model_0_20230102").joinpath("submodel_htm-ws_from_topic_2_train_with_10_20230102").joinpath('TMmodel/tpc_descriptions.txt'),
    encoding="utf-8").readlines()]
labels_0 = ["Sustainability", "Agricultural Economics", "Circular Economy", "Biotechnology", "Synthesis",
          "Collaboration", "Public Policy", "Cultural History", "Robotics", "Business Innovation"]
labels_2 = ["Society and Politics", "Climate Science", "Cognitive Science", "Evolutionary Biology", "Data Analysis",
            "Cellular and Molecular Biology", "Biomedical Research", "Cutting-Edge Technologies",
            "Materials Science", "Academic Research"]
df_keywords = pd.DataFrame({"Keywords": keywords,
                            "Labels": labels_2
                           })
df_keywords

Unnamed: 0,Keywords,Labels
0,"country, politic, migration, economy, inequality, survey, city, culture, policy, international, urban, law, gender, literature, media",Society and Politics
1,"climate, change, climate_change, future, ecosystem, water, impact, prediction, effect, past, land, ocean, long_term, modeling, data",Climate Science
2,"brain, information, human, neuron, perception, cognitive, visual, representation, control, language, action, specific, computing, functional, network",Cognitive Science
3,"population, specie, evolution, genetic, plant, individual, trait, genome, environment, ecological, effect, pattern, biology, human, diversity",Evolutionary Biology
4,"disseminate, merge, tailor, related, principal, participatory, predictor, involvement, supervision, studies, ubiquitous, stem, duration, capable, competitiveness",Data Analysis
5,"receptor, stem_cell, mechanical, therapy, subset, mass_spectrometry, molecular_biology, metabolite, high_throughput, microscopy, unveil, phage, immunity, membrane, dna",Cellular and Molecular Biology
6,"initiate, precursor, proteomic, disruption, persistence, characterization, microscopic, assay, diagnosis, replication, kinetic, regulator, strain, robustness, symptom",Biomedical Research
7,"intense, impossible, stone, unable, narrow, huge, cutting_edge, socio, synergy, dangerous, persistent, impose, wp, intra, imperative",Cutting-Edge Technologies
8,"solid, integrity, pioneering, equip, appearance, eurasia, intensive, retrieve, react, persist, intense, socio, extraordinary, lateral, migrate",Materials Science
9,"manifest, branch, dynamics, grant, dominate, predictor, persist, inherent, random, bear, sparse, phd, epidemic, acknowledge, latera",Academic Research


In [108]:
keywords = [line[0:-1] for line in open(
    path_model.joinpath("root_model_0_20230102").joinpath("submodel_htm-ds_thr_0.1_from_topic_2_train_with_10_20230102").joinpath('TMmodel/tpc_descriptions.txt'),
    encoding="utf-8").readlines()]
labels_0 = ["Sustainability", "Agricultural Economics", "Circular Economy", "Biotechnology", "Synthesis",
          "Collaboration", "Public Policy", "Cultural History", "Robotics", "Business Innovation"]
labels_2 = ["Society and Politics", "Climate Science", "Cognitive Science", "Evolutionary Biology", "Data Analysis",
            "Cellular and Molecular Biology", "Biomedical Research", "Cutting-Edge Technologies",
            "Materials Science", "Academic Research"]
labels_2_01 = ["Neuroscience and Brain Science","Miscellaneous or Unclear","Evolution and Biodiversity",
               "Genetics and Molecular Biology", "Sustainability and Climate Change",
               "Climate Science and Atmospheric Chemistry", "Agriculture and Crop Science",
               "Artificial Intelligence and Robotics", "Economics and Business", "Politics and Migration"]
df_keywords = pd.DataFrame({"Keywords": keywords,
                            "Labels": labels_2_01
                           })
df_keywords

Unnamed: 0,Keywords,Labels
0,"brain, neuron, neural, memory, neuroscience, circuit, plasticity, functional, image, synaptic, behavioral, cortical, magnetic_resonance, mouse, network",Neuroscience and Brain Science
1,"apparel, best_practice, mild, person_month, hemoglobin, hydrogen_bond, histone_mark, micrometre, braid, increased, parasitic_disease, muscle_weakness, mri_scanner, activator, grind",Miscellaneous or Unclear
2,"specie, population, evolution, diversity, ecological, ecology, trait, biodiversity, adaptation, variation, selection, genetic, adaptive, pathogen, genome",Evolution and Biodiversity
3,"biology, genome, protein, tissue, cancer, cell, sequence, molecule, organism, dna, bacteria, expression, molecular_mechanism, gene_expression, regulation",Genetics and Molecular Biology
4,"management, integrate, monitoring, service, sustainability, stakeholders, scenario, support, uncertainty, satellite, include, assess, climate_change, forecast, vulnerability",Sustainability and Climate Change
5,"atmosphere, climate, earth, surface, ocean, carbon, temperature, quantify, future, sea, sediment, flux, concentration, chemistry, aerosol",Climate Science and Atmospheric Chemistry
6,"healthcare, disease, crop, food, exposure, plant, microbiome, effect, factor, breeding, pathogen, agriculture, increase, root, stress",Agriculture and Crop Science
7,"robot, language, learning, artificial, speech, linguistic, cognitive, music, task, human, object, interactive, perception, artificial_intelligence, autonomous",Artificial Intelligence and Robotics
8,"financial, economy, shock, firm, policymaker, market, business, investment, trade, company, consumer, productivity, policy, industry, evidence",Economics and Business
9,"migration, politic, transnational, actor, place, national, state, space, seek, power, regime, china, conflict, integration, mobil",Politics and Migration


In [35]:
submodel_path = path_model.joinpath("root_model_0_20230102").joinpath("submodel_htm-ws_from_topic_6_train_with_10_20230102")

45837

In [15]:
thetas = sparse.load_npz(submodel_path.joinpath('TMmodel/thetas.npz'))
# ndocs x ntopics: (34377, 10)
thetas_dense = thetas.todense()

In [40]:
path_subcorpus = submodel_path.joinpath("modelFiles/corpus.txt")
subcorpus_txt = [line.rsplit(' 0 ')[1].strip() for line in open(
    path_subcorpus, encoding="utf-8").readlines()]
print(len(subcorpus_txt))
doc_tpc_rpr = [get_str_rpr(thetas_dense[row,:], 1000) for row in range(len(thetas_dense))]

34377


In [42]:
df2 = pd.DataFrame(list(zip(subcorpus_txt, doc_tpc_rpr)),
               columns =['lemmas', 'doc-tpc'])
df2

Unnamed: 0,lemmas,doc-tpc
0,spectra procedure,t0|32 t1|28 t2|24 t3|44 t4|22 t5|22 t6|751 t7|39 t8|24 t9|14
1,blood_flow cardiologist open,t0|85 t1|168 t2|174 t3|52 t4|64 t5|52 t6|61 t7|110 t8|34 t9|200
2,,t0|296 t1|49 t2|52 t3|43 t4|110 t5|34 t6|209 t7|62 t8|63 t9|82
3,connection understanding family collect serve pedagogical relation potential sociology school visual,t0|265 t1|37 t2|157 t3|47 t4|137 t5|43 t6|70 t7|129 t8|62 t9|53
4,flexible sensor,t0|9 t1|62 t2|32 t3|47 t4|19 t5|11 t6|52 t7|740 t8|20 t9|8
...,...,...
34372,,t0|46 t1|57 t2|29 t3|38 t4|66 t5|462 t6|48 t7|70 t8|45 t9|139
34373,cell direct,t0|185 t1|28 t2|360 t3|27 t4|80 t5|86 t6|99 t7|37 t8|56 t9|42
34374,fuel exploit important_implication industry,t0|45 t1|25 t2|20 t3|36 t4|43 t5|72 t6|27 t7|27 t8|54 t9|651
34375,neuron habit automatization intentional,t0|27 t1|16 t2|25 t3|17 t4|26 t5|12 t6|11 t7|10 t8|31 t9|825


In [43]:
submodel_path = path_model.joinpath("root_model_0_20230102").joinpath("submodel_htm-ds_thr_0.1_from_topic_6_train_with_8_20230102")

In [45]:
thetas = sparse.load_npz(submodel_path.joinpath('TMmodel/thetas.npz'))
# ndocs x ntopics: (34377, 10)
thetas_dense = thetas.todense()
thetas_dense.shape

(7011, 10)

In [46]:
path_subcorpus = submodel_path.joinpath("modelFiles/corpus.txt")
subcorpus_txt = [line.rsplit(' 0 ')[1].strip() for line in open(
    path_subcorpus, encoding="utf-8").readlines()]
print(len(subcorpus_txt))
doc_tpc_rpr = [get_str_rpr(thetas_dense[row,:], 1000) for row in range(len(thetas_dense))]

7011


In [47]:
df2 = pd.DataFrame(list(zip(subcorpus_txt, doc_tpc_rpr)),
               columns =['lemmas', 'doc-tpc'])
df2

Unnamed: 0,lemmas,doc-tpc
0,responsible integrate blockchain offer conflict mineral ensure breach human_right risk origin country mineral gold tin tungsten copper component electronic today apple nokia rely product usage valuable ore pose crucial ethical_legal risk company involve extraction country congo funding violence company csr transform today costly service commodity processor stakeholders capitalize ethical free decentralized company create data incentivise stakeholders responsible extraction follow entire chain team extensive_experience scale business serve mineral supply_chain,t0|48 t1|30 t2|38 t3|42 t4|19 t5|44 t6|43 t7|49 t8|37 t9|650
1,secure efficient embed future_internet security past rely connectivity ubiquity today establish security engineering embed inorder achieve_goal set resource efficient security building_block specific security engineering support integrate building_block target follow driver security engineering increase security practical scenario relevant efficient embed,t0|47 t1|72 t2|56 t3|48 t4|36 t5|80 t6|159 t7|96 t8|35 t9|371
2,epilepsy network genetic synapsis circuit pave_way drug strategy epilepsy network genetic synapsis circuitry pave_way drug strategy promote collaborative multidisciplinary translational epilepsy enhance effective knowledge_transfer exchange good mobile molecule imm lead partner university_amsterdam university_rome epilepsy center lund_university reputed neuroscientist impact synaptic brain circuitry dysregulation innovative therapy refractory form epilepsy increase scientific_technological innovation epilepsy network imm interchange idea partner sustain network deadline promote joint grant joint phd_student train_young promote internationalisation increase awareness epilepsy caregiver patient promote joint target dissemination promote meeting orient thematic hand workshop_summer school short_term site visit imm transfer partner add_value spread university_lisbon portuguese interaction mind brain college university_lisbon national neuroscientific patient_caregiver organization,t0|24 t1|691 t2|60 t3|36 t4|41 t5|19 t6|42 t7|26 t8|29 t9|32
3,evolution explanation cooperation microbe humans cooperation pose evolution theory exploit selfish individual evolutionary_biologist theory overview cooperation contrast theory potential unsuccessful theory understanding observation cooperative nature present interdisciplinary prediction lead explanation cooperation theory explicit testable_prediction specific exploit offer experiment bacteria vertebrate experiment human addition specific hypothesis evolution theory link explanation cooperation taxon level biology organization,t0|49 t1|64 t2|57 t3|409 t4|24 t5|78 t6|62 t7|58 t8|148 t9|51
4,visibility macedonia organize rm fourth public young_people manifestation public contact citizen wide_range creation national wealth popularization field public intention reach young general_public opportunity promote career sustainability improvement quality everyday live economy society integration european_union increase success term number participate people public_opinion survey justify preparation improve appreciation public involve people engage young_people consortium present visit organization involve consortium extend partner survey people present great_interest researcher_night regular manifestation city good position interact national level main_objective researcher_night pan_european consist close public direct exchange meeting interactive present researcher_ordinary people,t0|29 t1|799 t2|40 t3|18 t4|20 t5|17 t6|13 t7|17 t8|19 t9|28
...,...,...
7006,evaluation test identity management deploy grow number range identification platform biometric passport control border check reliability remain wide standard evaluation attack strength beat fill_gap build online open platform evaluation biometric protocol vulnerability standardization evaluation impact reliability biometric lead meaningful increase performance transfer company easy interoperable authority decision_maker informed progress biometric impact standard,t0|23 t1|63 t2|23 t3|34 t4|18 t5|235 t6|97 t7|50 t8|78 t9|379
7007,responsible intelligent systems formal automate risk check intelligent computing checking intelligent environment normative law input_output decision risk goal central correspond proposal suitable formal logical representation formalism responsibility action interaction joint action reason evaluation grade responsibility risk relative normative perform computing check responsibility intelligent interact human agent answer_question logical specification language collective responsibility probability grade responsibility relative normative answer_question suitable translation related optimize success hinge combine insight philosophy legal theory,t0|41 t1|55 t2|38 t3|82 t4|52 t5|89 t6|62 t7|41 t8|451 t9|89
7008,bidirectional artificial upper_limb bidirectional artificial upper_limb proposal evaluation select amputee neuro control upper_limb prosthesis control feel amputee natural neuron interface stable selective connection nervous_system goal combine recording motor signal govern action hand arm motion control mechanical prosthesis sensory_feedback tactile sensor neuromorphic stimulus proposal language link central_nervous peripheral_nerve signal order govern motion reach goal variety explore brain nerve assemble integrate include electromagnetic brain nerve signal motion change brain blood_flow metabolism,t0|31 t1|52 t2|43 t3|172 t4|92 t5|68 t6|102 t7|102 t8|284 t9|54
7009,innovative strategy clean urban transport historic city renaissance integrated package mobile measurement historic city preservation enhancement historic city sustainability economy visitor resident business renaissance historic tourism city vanguard sustainability renaissance city strong reliance marry environment sustainability mobile economy partner historic city layout valuable heritage preserve enhance partner unique local_regional include economy integration great add_value renaissance good_practice resonance relevance range historic cities,t0|87 t1|169 t2|121 t3|35 t4|100 t5|134 t6|79 t7|60 t8|36 t9|179


In [71]:
dfs = []
for entry in path_model.joinpath("root_model_0_20230102").iterdir():
    if "submodel_htm-ds" in entry.as_posix() and "train_with_10" in entry.as_posix():
        exp_tpc = int(entry.as_posix().split("from_topic_")[1].split("_")[0])
        thr = float(entry.as_posix().split("thr_")[1].split("_")[0])
        try:
            thetas = sparse.load_npz(entry.joinpath('TMmodel/thetas.npz'))
        except:
            thetas = np.array([0,0])
        dfs.append([exp_tpc,thr,thetas.shape[0]])
df_ds = pd.DataFrame(dfs, columns=["tpc","thr","docs"]).sort_values(by=["tpc","thr"])
df_ds[df_ds.tpc==0]

Unnamed: 0,tpc,thr,docs
69,0,0.1,8247
37,0,0.2,4830
3,0,0.3,3569
59,0,0.4,2649
72,0,0.5,1782
35,0,0.6,990
1,0,0.7,384
61,0,0.8,2
74,0,0.9,2


In [None]:
# Bastantes menos documentos (train and val)
# Muchos docs apenas palabras --> sería conveniente poner un mínimo de palabras para conservar doc

In [None]:
# Una vez entrenada, enfatizar valores pequeños, meter exp / umbral -> asignar docs prácticamente a un top
# Pensalizar en la loss con la norma L1
# Aplicar dropout (e.g., quedarme con los tres valores más grandes)
# Matar las mezlcas de producto que tienen peso más bajo 3, obligar a reconstruir el doc a partir de las 3 mayores

# Umbral para WS -> 25 palabras aprox.