# Find most relevant GEMET topics and export topics lists for all languages
* topics in `topic_list_v3.txt` were selected from Jiri's model based on keywords and crucial needs from Partners (Regional library spreadsheet)
* topics from GEMET (English) are compared (word movers distance) with these topics from Partners to see which are most relevant (the idea is to filter the topics, or sort by the relevance / because there are > 5000 topics in GEMET and topic assignment is slow and the extracted topics not always relevant to the text)
* english and corresponding topicf from other languages are exported to text files


In [2]:
import spacy
from tqdm import tqdm
from pprint import pprint
import time


import numpy as np
import math

import os

import pytextrank

import pandas as pd


In [3]:
from gensim.models import Word2Vec, KeyedVectors
slim_model_name = '/home/dzon/kajo/word2vec-slim/w2v-jiri-slim.bin'
start = time.time()
model = KeyedVectors.load_word2vec_format(slim_model_name, binary=True)
print('Finished loading model %.4f s' % ((time.time()-start)))
start = time.time()
#normalize vectors - needed for better performance of WMD
model.init_sims(replace=True)
print('Finished normalizing vectors %.4f s' % ((time.time()-start)))


Finished loading model 3.5719 s
Finished normalizing vectors 0.0985 s


In [4]:
#get topics from file
with open('/home/dzon/kajo/semantic-explorer/data/topic_list_v3.txt') as fp:
    topics_file = fp.readlines()
topics = []
for topic in topics_file:
    topic = topic.strip().lower()
    if topic not in topics:
        topics.append(topic)

In [78]:
# read gemet topics - previously preprocessed into one spreadsheet with all languages
df = pd.read_csv('/home/dzon/kajo/topics/gemet_topics_lang.csv', sep = ";")
df.head()

Unnamed: 0.1,Unnamed: 0,id,link,ar,az,bg,ca,cs,da,de,...,pl,pt,ro,ru,sk,sl,sv,tr,uk,zh-CN
0,0,100,http://www.eionet.europa.eu/gemet/concept/100,هيئة إدارية,inzibati qurum,Административен орган,òrgans administratius,orgán správní,administrativt organ,Verwaltungsbehörde,...,organ administracji,corpos administrativos,organism administrativ,административный орган,správny úrad,upravni organ,myndighet,"idari kurum, kuruluş",адміністративний орган,行政体
1,1,10002,http://www.eionet.europa.eu/gemet/concept/10002,المحاسبة,mühasibat uçotu,Счетоводство,comptabilitat,účetnictví,regnskab,Buchführung,...,księgowość,contabilidade,contabilitate,бухгалтерский учет,účtovná evidencia,računovodstvo,bokföring,muhasebe,бухгалтерський облік,会计学
2,2,10003,http://www.eionet.europa.eu/gemet/concept/10003,حياةالحيوان,heyvanların həyatı,Животински свят,vida animal,život zvířat,dyreliv,Tierleben,...,życie zwierząt,vida animal,viață a animalelor,жизнь животных,,življenje živali,,hayvan yaşamı,життя тварин,动物生命
3,3,10008,http://www.eionet.europa.eu/gemet/concept/10008,منتج استهلاكي,istehlak məhsulu,Потребителски продукт,producte de consum,výrobek spotřební,forbrugsvare,Konsumprodukt,...,produkt konsumpcyjny,produtos de consumo,produs al consumatorului,потребительский продукт,spotrebný výrobok,potrošniški izdelek,konsumentvara,tüketim ürünleri,споживчий продукт,消费品
4,4,1001,http://www.eionet.europa.eu/gemet/concept/1001,جسر,körpü,Мост,pont,most,bro,Brücke,...,most,pontes,pod,мост,most,most,bro,köprü,міст,桥


In [76]:
# export columns by language
dict = df.to_dict('list')
sk_topics = dict['sk']
en_topics = dict['en']
print(len(sk_topics), len(en_topics))
if math.isnan(sk_topics[2]):
    print("isnan")
sk_topics[2] #.strip().lower()

5565 5565
isnan


nan

In [57]:
# process en_topics / lemmatize
import spacy
nlp = spacy.load("en_core_web_sm")
lemma_lines = []
for i in range(len(en_topics)):
    tmp = en_topics[i].strip().lower()
    # replace by re substitution - probably 2 lines
    tmp = tmp.replace("("," ")
    tmp = tmp.replace(")"," ")
    tmp = tmp.replace("/"," ")
    tmp = tmp.replace("-"," ")
    tmp = tmp.replace(","," ")
    tmp = tmp.replace(";"," ")
    tmp = tmp.replace("  "," ")
    tmp = tmp.replace("  "," ")
    if tmp == "":
        continue
    doc = nlp(tmp)
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    tmp = "_".join(lemmas)
    print(i,tmp)
    lemma_lines.append(tmp)
en_topics_lemma = lemma_lines    

0 administrative_body
1 accounting
2 animal_life
3 consumer_product
4 bridge
5 environmental_administration_institution
6 health_effect_noise
7 human_body
8 human_science
9 information_transfer
10 juridical_act
11 meteorological_research
12 natural_area_protection
13 natural_risk_prevention
14 physical_chemistry
15 physical_measurement_pollution
16 plant_life
17 plant_production
18 pollution_type
19 pollution_prevention
20 risk_management
21 safety_system
22 seismic_engineering
23 social_science
24 surface_water_management
25 wildlife_protection
26 historic_centre
27 promotion_trade_industry
28 masonry
29 bromine
30 sand_flat
31 animal_specie
32 plant_specie
33 occupation
34 folk_tradition
35 law_branch
36 judicial_system
37 chemical_measurement_pollution
38 measure_instrument
39 pollutant_evolution
40 urban_noise
41 cleanliness_hygiene
42 industrial_environment_general
43 natural_risk
44 natural_risk_analysis
45 major_risk
46 rescue_system
47 crisis_management
48 tax_system
49 concept

423 public_benefit_inquiry
424 public_inquiry
425 delegated_management
426 state_control
427 easement
428 crime
429 police_power
430 criminal_law_procedure
431 judgement_sentence
432 conflict
433 litigation
434 administrative_court_administration
435 court_justice_european_community
436 justice
437 trial
438 carbohydrate
439 court
440 carbon
441 lease
442 certification
443 homologation
444 carbonate
445 notification
446 pre_emption
447 prescription
448 repression
449 devolution
450 pollutant_flow
451 ocean_air_interface
452 pollutant_migration
453 incidental_pollution
454 bacteriological_pollution
455 carbon_cycle
456 diffuse_pollution
457 domestic_pollution
458 mineral_pollution
459 organic_pollution
460 photochemical_pollution
461 land_base_marine_pollution
462 toxic_pollution
463 river_disposal
464 underground_disposal
465 urban_pollutant
466 prevention_measure
467 protective_measure
468 strong_acidity
469 biomarker
470 biological_contamination
471 chemical_contamination
472 biologi

829 stock_trade
830 sterilisation_biological
831 environmental_economic_firm
832 yield_agricultural
833 environmental_problem_solve
834 fodder_plant
835 industrial_plant_organism
836 chemical_pollution
837 textile_plant
838 tropical_plant
839 agricultural_real_estate
840 agricultural_holding
841 type_tenure
842 geophysical_environment
843 petrochemical
844 speciality_chemical
845 protein_product
846 process_foodstuff
847 aeration
848 chemical_process
849 convenience_food
850 mining_product
851 root_crop
852 cultivation_system
853 crop_production
854 fishery_structure
855 fishing_ground
856 coal_industry
857 energy_industry
858 communication_industry
859 information_technology_industry
860 vacuum_industry
861 preparation_market
862 precision_engineering
863 material_technology
864 chemical_property
865 military_equipment
866 audiovisual_equipment
867 machinery
868 mechanical_equipment
869 pressure_equipment
870 thermal_equipment
871 industrial_manufacturing
872 size_business
873 busines

1207 electrical_good_industry
1208 service_provide_company
1209 restoration_water
1210 local_heat_supply
1211 forward_agent
1212 waste_avoidance
1213 natural_independence_law
1214 offence_environment
1215 provincial_regional_law_d
1216 provincial_regional_authority_d
1217 international_transaction
1218 road_setting
1219 air_quality_monitoring
1220 citizen_initiative
1221 area_stress
1222 storage_process
1223 mountain_refuge
1224 space_research
1225 red_list
1226 coagulation
1227 coal
1228 coal_base_energy
1229 insulate_material
1230 decantation
1231 wood_resource
1232 fishing_fleet
1233 coal_fire_power_plant
1234 emission_air
1235 air_quality_impact
1236 climate_change_mitigation
1237 climate_change_adaptation
1238 alternative_fuel
1239 material_flow
1240 non_mineral_waste
1241 waste_prevention
1242 waste
1243 ground_biomass
1244 ground_biomass_growth
1245 ground_non_tree_biomass
1246 ground_tree_biomass
1247 adaptation_strategy
1248 adaptive_capacity
1249 agricultural_bioenergy_produc

1549 waste_water_treatment_plant
1550 water_efficiency
1551 cockroach
1552 water_framework_directive
1553 water_policy
1554 water_scarcity
1555 water_stress
1556 water_use
1557 wilderness
1558 world_biocapacity
1559 world_health_organization
1560 foresight
1561 forward_look_study
1562 future_essay
1563 forecasting
1564 household_waste
1565 indoor_smoke
1566 infrastructure_spatial_information_europe
1567 landfill_waste
1568 landfill_waste_flow
1569 organic_waste_water
1570 policy_effectiveness
1571 temperature_change
1572 flue_gas_desulphurisation
1573 chemical_oxygen_demand
1574 code_practice
1575 code
1576 coelenterate
1577 cogeneration
1578 co_incineration
1579 coke
1580 cold
1581 cold_zone_ecosystem
1582 coliform_bacterium
1583 colloid
1584 colloidal_state
1585 colonisation
1586 aerosol
1587 colourimetry
1588 colour
1589 combination_effect
1590 combined_cycle_power_station
1591 combine_waste_water
1592 combustibility
1593 combustion_engine
1594 combustion_gas
1595 combustion_residue

1971 dune
1972 duration_sunshine
1973 dust
1974 dust_immission
1975 dust_removal
1976 registration_obligation
1977 dwelling
1978 dye
1979 agroforestry
1980 dyke
1981 dyke_reinforcement
1983 access_sea
1984 agroindustry
1985 earth_crust
1986 earthquake
1987 earth_science
1988 earth_sun_relationship
1989 earthworm
1990 earwig
1991 agrometeorology
1992 east_africa
1993 eastern_asia
1994 eastern_europe
1995 east_west_relation
1996 east_west_trade
1997 ec_council_ministers
1998 ec_directive
1999 ec_directive_biocide
2000 ec_directive_packaging
2001 ec_directive_waste_disposal
2002 ec_directive_water_protection
2003 ec_ecolabel
2004 echinoderm
2005 aids
2006 ecodevelopment
2007 ecolabelle
2008 ecological_abundance
2009 ecological_adaptation
2010 ecological_assessment
2011 ecological_balance
2012 ecological_bookkeeping
2013 air
2014 ecological_factor
2015 ecological_niche
2016 ecological_parameter
2017 ecologically_sensitive_area
2018 ecological_stocktaking
2019 ecologist_movement
2020 ecolog

2310 environmental_statistic
2311 environmental_stock_exchange
2312 environmental_subsidy
2313 environmental_target
2314 alarm
2315 environmental_teaching
2316 environmental_technology
2317 environmental_terminology
2318 environmental_training
2319 environmental_economic_valuation
2320 environmental_vandalism
2321 environmental_warfare
2322 environment_friendly
2323 environment
2324 enzyme
2325 epidemic
2326 epidemiology
2327 equine
2328 equipment
2329 equivalent_dose
2330 ergonomic
2331 erosion
2332 erosion_control
2333 estuarine_biology
2334 estuarine_conservation_area
2335 estuarine_ecosystem
2336 estuarine_oceanography
2337 estuary
2338 etch
2339 etch_substance
2340 ether
2341 alcohol
2342 ethic
2343 ethnology
2344 ethology
2345 eu_council
2346 euratom
2347 europe
2348 european_commission
2349 european_court_justice
2350 european_environment_agency
2351 european_environmental_council
2352 accident_source
2353 european_nature_reserve
2354 european_parliament
2355 european_union
2356

2740 halogenated_biphenyl
2741 halogenated_hydrocarbon
2742 halogenated_phenol
2743 halogenated_pollutant
2744 halogenated_terphenyl
2745 handicraft_business
2746 handicraft
2747 harbour
2748 hardness
2749 hard_dispose_waste
2750 waste_water_pollution
2751 harmonisation_law
2752 harvest
2753 hazard
2754 hazard_pollutant
2755 hazardous_substance
2756 hazardous_substance_legislation
2757 hazardous_waste_dump
2758 hazardous_waste
2759 hazardous_working_material
2760 haze
2761 headland_farm
2762 health
2763 health_care
2764 health_environment_relationship
2765 amphibian
2766 health_facility
2767 health_hazard
2768 health_legislation
2769 health_protection
2770 health_regulation
2771 amusement_park
2772 health_relate_biotechnology
2773 health_service
2774 hear_impairment
2775 hear_procedure
2776 hear_protection
2777 hear_sense
2778 heat_physics
2779 heat_power_station
2780 heater
2781 heathland
2782 heating
2783 heating_plant
2784 heat_pump
2785 heat_storage
2786 heat_supply
2787 anaerobic_

3118 laboratory
3119 laboratory_experiment
3120 laboratory_research
3121 laboratory_technique
3122 laboratory_waste
3123 labour
3124 antagonism
3125 labour_law
3126 labour_market
3127 labour_relation
3128 lacquer
3129 lagoon
3130 lake_basin
3131 lake_pollution
3132 lake
3133 lamp
3134 land
3135 antagonistic_effect_toxic_substance
3136 land_access
3137 land_allotment
3138 land_property_register
3139 land_carry_capacity
3140 land_clearing
3141 antarctica
3142 land_conservation
3143 land_consolidation
3144 land_cover
3145 land_development
3146 land_ecology
3147 landfill
3148 landfill_covering
3149 landfill_degasification
3150 landfill_gas
3151 landfill_leachate
3152 land_form
3153 landform
3154 antarctic_ecosystem
3155 land_mammal
3156 land_occupation
3157 land_planning
3158 land_pollution
3159 antarctic_ocean
3160 land_reclamation
3161 land_register
3162 land_restoration
3163 landscape
3164 landscape_mine
3165 antarctic_region
3166 landscape_alteration
3167 landscape_architecture
3168 la

3548 arctic_ocean
3549 multinational_firm
3550 multiple_use_management_area
3551 municipal_cleansing
3552 municipal_cleansing_service
3553 arctic_region
3554 municipal_environmental_policy
3555 municipality
3556 municipal_law
3557 municipal_level
3558 municipal_waste
3559 municipal_water_distribution_system
3560 distribution_area
3561 municipal_water_management
3562 muscular_system
3563 museum
3564 mushroom
3565 music
3566 mussel_farming
3567 mustelid
3568 mutagenicity
3569 mutagenicity_testing
3570 mutagen
3571 mutant
3572 mutated_micro_organism_release
3573 mutation
3574 mycology
3575 mycorrhiza
3576 area_potential_pollution
3577 national_conservation_programme
3578 national_economy
3579 national_environmental_accounting
3580 cultural_good
3581 nationalisation
3582 national_legislation
3583 national_park
3584 national_planning
3585 national_reserve
3586 natural_area
3587 natural_disaster
3588 natural_drainage_system
3589 natural_environment
3590 acoustic_insulation
3591 natural_ferti

3969 pipeline
3970 pipe
3971 plain
3972 plan
3973 plane_source
3974 plankton
3975 plan_urban_development
3976 planning
3977 atmospheric_model
3978 planning_law
3979 planning_measure
3980 planning_permission
3981 planning_programming_budgeting_system
3982 plant_breeding
3983 plant_community
3984 plant_component
3985 plant_disease
3986 plant_equipment
3987 plant_genetic
3988 plant_health_care
3989 atmospheric_monitoring
3990 plant_protection_product
3991 plantigrade
3992 planting
3993 plant_physiology
3994 plant_protection
3995 testing_plant_protection_product
3996 atmospheric_ozone
3997 plant_biology
3998 plant_textile_fibre
3999 plant_trade
4000 plasma_technology
4001 plastic
4002 atmospheric_particulate
4003 plastic_waste
4004 platinum
4005 playground
4006 atmospheric_physics
4007 plutonium
4008 poaching
4009 point_source
4010 poison
4011 poisoning
4012 atmospheric_pollution
4013 polar_ecosystem
4014 polar_region
4015 polder
4016 police
4017 police_law
4018 atmospheric_precipitation
4

4388 residential_area
4389 residential_building
4390 residential_area_traffic_calming
4391 residual_water
4392 residual_risk
4393 residual_waste
4394 residue_analysis
4395 banking
4396 residue_recycling
4397 resin
4398 resistance_biological
4399 resolution_act
4400 resorption
4401 resource
4402 resource_appraisal
4403 resource_conservation
4404 resource_exploitation
4405 resource_reserve
4406 resource_management
4407 respiration
4408 respiratory_air
4409 respiratory_disease
4410 sewage_spread_prohibition
4411 respiratory_protection_apparatus
4412 respiratory_system
4413 respiratory_tract
4414 responsibility
4415 impact_reversal
4416 rest_form
4417 restoration
4418 restoration_measure
4419 barium
4420 restriction_production
4421 restriction_use
4422 retail_trade
4423 retard_basin
4424 retrofit
4425 return_nature
4426 reusable_container
4427 activate_carbon
4428 reuse_material
4429 revegetation
4430 reverse_osmosis
4431 rice
4432 petition_right
4433 right_property
4434 right_compensation

4809 special_authorisation
4810 specialisation_biological
4811 special_law
4812 bioaccumulation
4813 special_waste
4814 specie
4815 conservation_specie
4816 species_conservation_programme
4817 specie_impoverishment
4818 specie_reintroduction
4819 spectroscopy
4820 speed
4821 speed_limit
4822 spider
4823 abandon_vehicle
4824 bioaccumulative_pollutant
4825 spillage
4826 spoil_dump
4827 poriferan
4828 sport
4829 sport_facility
4830 spray
4831 bio_availability
4832 spray_asbestos
4833 spring_hydrology_land
4834 spring_water
4835 spurt
4836 square
4837 squatter_settlement
4838 stabilisation_lagoon
4839 stable
4840 stack
4841 standard
4842 standardisation
4843 standard_build_industry
4844 staple_food
4845 starch
4846 state
4847 state_art
4848 datum_state_environment
4849 report_state_environment
4850 biochemical_method
4851 state_waste
4852 state_matter
4853 station
4854 statistical_analysis
4855 statistical_datum
4856 statistic
4857 waste_statistic
4858 status_development
4859 prescription_

5245 vehicle
5246 vehicle_exhaust_gas
5247 vehicle_inspection
5248 vehicle_manufacturing_industry
5249 biosphere_reserve
5250 ventilation
5251 vermin
5252 vertebrate
5253 biosynthesis
5254 veterinary_medicine
5255 viaduct
5256 vibration
5257 video
5258 village
5259 vinasse
5260 virology
5261 virus
5262 viscosity
5263 biotechnology
5264 vitamin
5265 viticulture
5266 biotic_factor
5267 vocabulary
5268 vocational_training
5269 volatile_organic_compound
5270 volatility
5271 volcanic_area
5272 volcanic_eruption
5273 adhesive
5274 volcanism
5275 volcano
5276 waste_income
5277 biotope
5278 voluntary_work
5279 vulcanisation
5280 vulnerable_species_iucn
5281 wadden_sea
5282 biotope_network
5283 wage_system
5284 wall
5285 war
5286 biotope_protection
5287 warm_blooded_animal
5290 wastage
5291 waste
5292 waste_air
5293 waste_air_purification_gas
5294 waste_analysis
5295 waste_assimilation_capacity
5296 waste_balance
5297 waste_bin
5298 type_waste
5299 waste_charge
5300 waste_classification
5301 wa

In [30]:
en_topics_lemma = lemma_lines
en_topics_lemma[5528]
range(5529)[-1]

5528

In [93]:
# explore topic similarities
new_topics_set = set()
topic_sims = {}
for i in range(len(topics)):
    topics_sims = []
    print(i, topics[i])
    for j in range(5529):
        topic1 = topics[i]
        topic2 = en_topics_lemma[j]
        wmd = model.wmdistance(topic1.split("_"), topic2.split("_")) 
        topics_sims.append((topic2,wmd))
        #t_key = "{};{}".format(topic1, topic2)
        #topic_sims[t_key] = wmd  
    sorted_topic_sims = sorted(topics_sims,key=lambda kv: kv[1],reverse=False)   
    print(sorted_topic_sims[:10])
    #for (topic2, sim) in sorted_topic_sims:
    #    if sim<1.05:
    #        new_topics_set.add(topic2)
    #    else:
    #        break

0 demographic_ageing
[('demographic_evolution', 0.5651418864553571), ('demographic_development', 0.5942636280530691), ('demography', 0.9354307585099338), ('lifestyle', 1.0469696578809022), ('social_dynamics', 1.05991894551903), ('population_ecological', 1.0705016202278137), ('population_dynamic', 1.075036575906396), ('population_growth', 1.0774826825478077), ('social_cohesion', 1.0782823915674091), ('active_population', 1.0791653442633748)]
1 demographic_changes
[('demographic_evolution', 0.5028112935488224), ('demographic_development', 0.5859596126720309), ('climatic_change', 0.8537222306868434), ('behavioural_change', 0.8556175117635726), ('physiological_change', 0.8736559065999985), ('technological_change', 0.8802341930730938), ('environmental_change', 0.8959931860097647), ('climate_change_adaptation', 0.9721548059098613), ('climate_change_impact', 0.9824234486916478), ('temperature_change', 0.9920554519500733)]
2 ageing_populations
[('population_dynamic', 0.9656780099158288), ('pop

[('underprivileged_people', 0.5669658184051514), ('young', 0.9744276513626576), ('elderly_person', 0.988328977651), ('disabled_person', 1.0226888780498504), ('adult_education', 1.0975663941190243), ('adult', 1.1009478153381347), ('age', 1.1030134153395295), ('youth', 1.1102039466791154), ('woman', 1.1231409022158385), ('displaced_person', 1.1240477412164211)]
21 older_men
[('young', 1.027404404452622), ('elderly_person', 1.0566089288496971), ('underprivileged_people', 1.072712245041132), ('woman', 1.0780399214160443), ('age', 1.0830686413898467), ('disabled_person', 1.0909688292485475), ('adult', 1.100284648483038), ('woman_status', 1.1026354416930675), ('adult_education', 1.103043147326231), ('youth', 1.1423254492431878)]
22 elderly_women
[('elderly_person', 0.5798571999793053), ('underprivileged_people', 0.9345577915204166), ('disabled_person', 0.9689936356129647), ('woman', 0.9941094939184189), ('young', 1.0165106871024967), ('youth', 1.064160676161349), ('woman_status', 1.080714752

[('wage_system', 0.928711833779633), ('high_education', 1.044142104219675), ('upper_house', 1.0521081040322184), ('low_house', 1.0670895522705317), ('low_flow', 1.0847850312895775), ('high_mountain', 1.0895345118191242), ('low_cost_housing', 1.0924989697027416), ('cost_increase', 1.1001841260430814), ('employment_structure', 1.1002326490049361), ('labour_relation', 1.1003789727168083)]
41 higher_wages
[('wage_system', 0.9417216111398935), ('high_education', 1.0087710851433278), ('high_mountain', 1.054163492742777), ('low_house', 1.078469023480296), ('employment_structure', 1.083202907276988), ('labour_market', 1.084556368882656), ('employment_level_effect', 1.0849404137830436), ('level_education', 1.0869377785634995), ('employment_environment', 1.0878307261726856), ('labour_relation', 1.0933152317656278)]
42 higher_salaries
[('high_education', 1.0139689037454127), ('wage_system', 1.0755757011684774), ('high_mountain', 1.0818393646130562), ('low_house', 1.086255048709452), ('level_educa

[('foreign_policy', 0.7863791640399717), ('foreign_trade', 0.8187277217096006), ('foreign_economic_relation', 0.8236814294330831), ('indigenous_knowledge', 1.0760614031564264), ('teaching', 1.0944052738969305), ('vocational_training', 1.1091970000727245), ('speech', 1.110302400576721), ('initial_training', 1.1160923359877104), ('vocabulary', 1.1261653353867942), ('training', 1.1272226427042085)]
60 professional_skills
[('professional_society', 0.5981422513903379), ('vocational_training', 1.0037844558116198), ('training', 1.0188277780407071), ('environmental_training', 1.062763826153338), ('initial_training', 1.0651394321883918), ('administrative_competence', 1.079637441133976), ('teaching', 1.0811529267821312), ('medical_science', 1.0812850003364682), ('school_teaching', 1.0856673125380278), ('indigenous_knowledge', 1.08999246497339)]
61 technical_skills
[('technical_information', 0.5739817779824137), ('technical_instruction', 0.5760706257061958), ('maintenance_technical', 0.6134478449

[('social_service', 0.4113308566807508), ('social_system', 0.5570790649715662), ('social_participation', 0.575143792956829), ('social_representation', 0.5759313641371727), ('social_protection', 0.5764254653602838), ('social_policy', 0.580567067920208), ('ecosystem_services', 0.5868065756733417), ('social_security', 0.5869829068790674), ('social_framework', 0.5886365571264028), ('social_cost', 0.5892678374075889)]
79 social_care
[('social_welfare', 0.5430709984481334), ('social_medicine', 0.5530389560585022), ('health_care', 0.5587711227938532), ('social_service', 0.5819820731480718), ('social_psychology', 0.599771803638339), ('social_behaviour', 0.6037256121635437), ('social_relief', 0.6071046375002861), ('social_condition', 0.6079724573716521), ('social_problem', 0.6109470794796944), ('social_equity', 0.6111123110076785)]
80 welfare_services
[('ecosystem_services', 0.5925366339249015), ('social_welfare', 0.623078465461731), ('theory_welfare_state', 0.8187180635501202), ('health_servic

[('technological_development', 0.5475748181343079), ('technological_change', 0.5556533924454451), ('technological_process', 0.5918451785758138), ('technological_accident', 0.6488670110702515), ('technological_hazard', 0.6669448614120483), ('innovation', 0.9091352863455117), ('technology', 0.9759747297733128), ('scientific_research', 0.9857606223365665), ('new_technology', 1.0089628360560536), ('scientific_ecology', 1.015698104133904)]
97 technological_progress
[('technological_change', 0.5495884345122576), ('technological_development', 0.5601060450685025), ('technological_process', 0.5637345649981499), ('progress_line', 0.6408568024635315), ('technological_accident', 0.6554660797119141), ('technological_hazard', 0.6686695218086243), ('scientific_technical_information', 1.0145597958112385), ('scientific_policy', 1.0147298313133122), ('scientific_committee', 1.0148038220381141), ('scientific_research', 1.017069333106637)]
98 technological_changes
[('technological_change', 0.3566699030566

[('novel_food', 1.0171252480436563), ('new_technology', 1.0414332843965293), ('alternative_technology', 1.0421040393390657), ('approach', 1.0496113445073367), ('sustainable_use', 1.054556982139349), ('environment_friendly', 1.0563531510375141), ('new_material', 1.0580998550055027), ('alternative_material', 1.0587701498813629), ('sustainable_consumption', 1.0604586060705183), ('collaborative_consumption', 1.0660685719410776)]
115 new_solutions
[('new_technology', 0.5624218860393763), ('new_material', 0.5790883392069339), ('new_installation', 0.6192068457603455), ('new_town', 0.6408774852752686), ('new_community', 0.6530278921127319), ('novel_food', 1.0343230573537947), ('alternative_technology', 1.1055019216746091), ('mathematical_method', 1.1088418547564147), ('biochemical_method', 1.1105088319131733), ('teaching_method', 1.1112548683929444)]
116 new_approaches
[('new_technology', 0.5619236432065964), ('new_material', 0.6117537021636963), ('new_community', 0.6346823573112488), ('new_in

[('climate_change_adaptation', 0.3603309840155273), ('climate_change_mitigation', 0.37618154884266924), ('climate_change_impact', 0.38708662325993376), ('man_climate_change', 0.4209816871170201), ('climatic_change', 0.4400683474617004), ('environmental_change', 0.4944146500854492), ('climate', 0.5294262766838074), ('behavioural_change', 0.5480689552707672), ('global_climate', 0.5573751926422119), ('technological_change', 0.5623146735993624)]
134 global_warming
[('global_warming', 0.0), ('global_climate', 0.4780808871177435), ('global_megatrend', 0.5203040838241577), ('global_model', 0.6187983751296997), ('global_convention', 0.6209043316904306), ('global_aspect', 0.6655065417289734), ('global_temperature_increase', 0.7602513397519791), ('global_environment_facility', 0.8231949300114173), ('global_mean_temperature_increase', 0.8947443521590233), ('climate', 0.9790083295387029)]
135 climatic_changes
[('climatic_change', 0.35666981017929317), ('climatic_alteration', 0.5336723160942793), (

[('economic_situation', 0.7810037826808497), ('environmental_economic_valuation', 0.8096032684813443), ('economic_growth', 0.8196569587867727), ('economic_development', 0.8197883763408101), ('economic_analysis', 0.82179896673849), ('economic_structure', 0.8229793852886952), ('economic_concentration', 0.825018318182469), ('economic_planning', 0.8258300242029089), ('economic_system', 0.8259799015580089), ('economic_plan', 0.8261628585601378)]
153 small_farms
[('small_scale_furnace', 0.8116338035374009), ('small_scale_inducer', 0.8250472622044808), ('small_power_station', 0.8252089811629008), ('small_medium_sized_industry', 0.8602366231405735), ('small_island_political_geography', 0.934575239823997), ('organic_farming', 0.9931635781136156), ('dairy_farm', 0.9972416419448853), ('mixed_farming', 1.0139358433285952), ('farm_price', 1.0159433900909425), ('fish_farming', 1.0159488489425184)]
154 family_farms
[('family', 0.5942357182502747), ('family_planning', 0.6329235434532166), ('family_law

[('large_combustion_plant', 0.8347182950454245), ('company_structure', 0.9533909173672199), ('electricity_company', 0.9889710750515461), ('small_medium_sized_industry', 0.9983828791752457), ('power_company', 0.998838386158228), ('gas_company', 0.9998240713688135), ('company_policy', 1.004479431687951), ('multinational_firm', 1.0141107103395461), ('size_business', 1.0418711686567663), ('small_scale_furnace', 1.0464616792509736)]
172 larger_firms
[('multinational_firm', 1.0149836574344635), ('size_business', 1.0277768140309453), ('close_firm', 1.037311461675644), ('firm', 1.0684168403377534), ('company_structure', 1.071258314877212), ('small_medium_sized_industry', 1.0736982704845668), ('electricity_company', 1.0793537041820882), ('power_company', 1.0966275289092062), ('small_scale_furnace', 1.09967465690817), ('company_policy', 1.1069928823122381)]
173 large_enterprises
[('large_combustion_plant', 0.8457428598839333), ('small_medium_sized_industry', 0.9849897199342549), ('size_business'

[('soil_quality', 0.5470829648015499), ('environmental_quality', 0.5596662429406047), ('landscape_management', 0.5609369134803415), ('landscape', 0.5762786269187927), ('landscape_utilisation', 0.5762786269187927), ('landscape_conservation', 0.5763309597969055), ('landscape_protection', 0.5818205249261856), ('landscape_component', 0.5867162276860476), ('quality_life', 0.5905134677886963), ('landscape_consumption', 0.5970097721099854)]
191 landscape_dynamics
[('landscape_ecology', 0.49237766765266655), ('landscape', 0.5479965806007385), ('landscape_utilisation', 0.5479965806007385), ('landscape_management', 0.5594359708816409), ('landscape_planning', 0.573382556438446), ('landscape_architecture', 0.5802281498908997), ('social_dynamics', 0.5878992244809866), ('urban_landscape', 0.5993729829788208), ('landscape_component', 0.6011754958108664), ('landscape_conservation', 0.6044299602508545)]
192 landscape_maintenance
[('maintenance_environment', 0.5163994381370545), ('landscape_alteration',

[('water_resource', 0.4115767784520984), ('water_management', 0.5407596400920153), ('water_scarcity', 0.5450992570185661), ('water_consumption', 0.5614906753480435), ('water_consumption', 0.5614906753480435), ('water_conservation', 0.5624796784161329), ('water_transportation', 0.5667500491663814), ('water_demand', 0.5767502805026173), ('water_saving', 0.5771414007678628), ('water_policy', 0.5778124581500889)]
211 social_supports
[('social_participation', 0.5944922655103207), ('social_development', 0.5946358399652243), ('social_structure', 0.5994030200961828), ('social_process', 0.6039713559599519), ('social_cohesion', 0.6085530328128934), ('social_policy', 0.6157679668143988), ('social_protection', 0.6199481434217692), ('social_framework', 0.6226576347941757), ('social_service', 0.6230438881949782), ('social_cost', 0.6243182379221917)]
212 credit_financing
[('credit_assistance', 0.5100148746408224), ('credit_policy', 0.5445308796200753), ('credit', 0.5916622281074524), ('financing', 0.

[('land_use', 0.5534197077100277), ('land_access', 0.5700094083631039), ('land_transportation', 0.571286234227717), ('land_value', 0.58960012413311), ('land_form', 0.6021522835371493), ('land_cover', 0.6026756278728247), ('land_disposal', 0.605202199575305), ('land_development', 0.6068885967775584), ('land_clearing', 0.6071408787879348), ('land', 0.6084277033805847)]
230 established_community
[('new_community', 0.6029437184333801), ('community_law', 0.6067215628453493), ('community_participation', 0.6067336201667786), ('plant_community', 0.6142798066139221), ('community_act', 0.6153002701663971), ('community_ruling', 0.617154061794281), ('european_community', 0.622243344783783), ('community_facility', 0.6256797909736633), ('community_finance', 0.6268004310225248), ('ecological_community', 0.6310429573059082)]
231 rural_ireland
[('rural_area', 0.5922867811657787), ('rural_law', 0.5970358090258836), ('rural_development', 0.6173962950706482), ('rural_environment', 0.621128499507904), ('ru

[('quality_life', 0.0), ('environmental_quality', 0.5420952010211348), ('quality_assurance', 0.5625740885734558), ('life_cycle', 0.575532853603363), ('soil_quality', 0.5929691127483845), ('quality_standard', 0.5935359001159668), ('freshwater_quality', 0.5995273719063997), ('life_science', 0.6055395007133484), ('school_life', 0.6056033372879028), ('animal_life', 0.6139315586072207)]
249 circular_economy
[('circular_economy', 0.0), ('economy', 0.5119001269340515), ('green_economy', 0.5498705622491836), ('carbon_economy', 0.5613549865651131), ('linear_economy', 0.5779880750647783), ('dual_economy', 0.5902918089270591), ('market_economy', 0.6324763298034668), ('yield_economy', 0.6422559983333945), ('national_economy', 0.6469451785087585), ('circular_mail', 0.6708628429886698)]
250 green_economy
[('green_economy', 0.0), ('circular_economy', 0.5498705622491836), ('carbon_economy', 0.5691451607294082), ('green_revolution', 0.5724263700045347), ('urban_green', 0.5776232668476105), ('economy', 

[('financial_assistance', 0.7252545109040378), ('financial_contribution', 0.7398326536275994), ('financial_aid', 0.7611853190534537), ('financial_management', 0.7670960760408595), ('economic_support', 0.7785631736666443), ('financial_compensation', 0.7835868493321508), ('decision_make_support', 0.8000894893905557), ('financial_instrument', 0.803694585578504), ('decision_support_system', 0.8066167267636107), ('financial_market', 0.8102768976662613)]
268 rural_development_measures
[('rural_development_policy', 0.3922799525365784), ('rural_development', 0.40609158499507175), ('plan_urban_development', 0.660654986616423), ('urban_development_law', 0.6763495749578322), ('urban_development_document', 0.6835323356766041), ('urban_planning_development', 0.6911891866881522), ('urban_development', 0.6957847905496453), ('rural_management_planning', 0.7402775282820616), ('regional_development', 0.7536327208138461), ('coastal_development', 0.7536626446032068)]
269 water_management
[('water_manageme

[('labour_market', 0.3295230445027351), ('market_economy', 0.5742748932909966), ('money_market', 0.5844195741033554), ('energy_market', 0.6039543216574191), ('market_gardening', 0.6098668991041183), ('market_price', 0.6116795442628861), ('market', 0.6175237894058228), ('market_research', 0.6241764172103405), ('financial_market', 0.6284664869308472), ('environment_market', 0.6309281587600708)]
287 agricultural_sector
[('economic_sector', 0.5489254237221479), ('agricultural_policy', 0.5502016577403546), ('agricultural_production', 0.5591438670252562), ('agricultural_landscape', 0.5673224777590037), ('agricultural_structure', 0.5695092879054546), ('agricultural_management', 0.5792486483848095), ('agricultural_equipment', 0.5849890242643953), ('agricultural_legislation', 0.585408096718192), ('agricultural_biotechnology', 0.5866767848511935), ('agricultural_storage', 0.5882992671296001)]
288 population_change
[('composition_population', 0.5633103350687028), ('population_structure', 0.566004

[('industrial_development', 0.0), ('technological_development', 0.49961765837353467), ('industrial_building', 0.5294109229664207), ('industrial_planning', 0.534182439545393), ('urban_development', 0.5401508838325739), ('environmental_development', 0.5418023411417604), ('regional_development', 0.5440858801109791), ('economic_development', 0.555851582575798), ('social_development', 0.5565615374343992), ('industrial_policy', 0.5566990245056749)]
306 good_education
[('high_education', 0.53053352419281), ('cultural_good', 0.5668532452454567), ('education_policy', 0.5835916470404863), ('environmental_education', 0.5897364745355844), ('general_education', 0.5935831159186363), ('good_service', 0.6015310364166497), ('level_education', 0.6035049623283147), ('good_management', 0.6084533148096799), ('consumer_good', 0.6172644249862432), ('primary_education', 0.6185324564259052)]
307 new_jobs
[('new_town', 0.6226572409980893), ('new_technology', 0.6251225471496582), ('new_installation', 0.630391180

[('region', 0.8241804710453184), ('economic_region', 0.8306381563068641), ('arctic_region', 0.8379137761231769), ('antarctic_region', 0.8457926949915611), ('biogeographical_region', 0.8497039171022153), ('tax_capital', 0.8508438228516771), ('free_movement_capital', 0.8595481813933498), ('polar_region', 0.8653054066307638), ('natural_capital_account', 0.8697834320035117), ('natural_capital', 0.8709213935387012)]
325 brussels_region
[('arctic_region', 0.6348558876806497), ('antarctic_region', 0.636036974132657), ('economic_region', 0.6385371123754978), ('region', 0.6446978449821472), ('biogeographical_region', 0.6572789549827576), ('polar_region', 0.6703232600419522), ('regional_convention', 1.063078745003581), ('regional_regulation', 1.0937047466444372), ('caribbean_area', 1.107434190745175), ('continent', 1.1217154105668068)]
326 regional_landscape
[('regional_structure', 0.5239680349418521), ('urban_landscape', 0.5388189861046672), ('landscape', 0.5590024590492249), ('landscape_utilis

[('land_degradation', 0.0), ('land_restoration', 0.551649592512846), ('soil_degradation', 0.5575824532166124), ('land_pollution', 0.5719360826662183), ('waterlogged_land', 0.5762822742413879), ('ecosystem_degradation', 0.5776636528948546), ('degradation', 0.5817100405693054), ('land', 0.5817100405693054), ('land_reclamation', 0.582960844039917), ('environmental_degradation', 0.586439861991167)]
345 soil_degradation
[('soil_degradation', 0.0), ('degradation', 0.49801698327064514), ('soil', 0.49801698327064514), ('soil_contamination', 0.5004821910030842), ('soil_leaching', 0.506308913230896), ('soil_damage', 0.5154146143664121), ('soil_compaction', 0.518385648727417), ('soil_erosion', 0.5249780416488647), ('freshwater_degradation', 0.5278134160132408), ('soil_decontamination', 0.530745891838789)]
346 soil_erosion
[('soil_erosion', 0.0), ('soil_salination', 0.46386854522562027), ('soil_subsidence', 0.46491990479081874), ('soil_compaction', 0.46660406637370583), ('erosion', 0.4945920407772

[('automobile_industry', 0.4916942275744677), ('hotel_industry', 0.5488162576243282), ('private_car', 0.5880725396931171), ('clothing_industry', 0.591954115884304), ('energy_industry', 0.5934893754950166), ('petrochemical_industry', 0.596476789495349), ('cement_industry', 0.5977539855247737), ('print_industry', 0.5992801079547405), ('furniture_industry', 0.5995879240503907), ('forest_industry', 0.6000948572273254)]
365 automobile_industry
[('automobile_industry', 0.0), ('petrochemical_industry', 0.5260274176226855), ('clothing_industry', 0.5315477230944037), ('steel_industry', 0.54614046872437), ('furniture_industry', 0.560764633578062), ('hotel_industry', 0.5615539945982099), ('industry', 0.561700701713562), ('extractive_industry', 0.5648185171108245), ('electrotechnical_industry', 0.569826281054616), ('pharmaceutical_industry', 0.5705749496426582)]
366 motor_industry
[('motor_vehicle_industry', 0.37405429115616207), ('electrical_industry', 0.5312045938866734), ('automobile_industry',

[('large_combustion_plant', 0.8434859089464278), ('inner_city', 1.0147893537731767), ('high_mountain', 1.0348389753668308), ('urban_structure', 1.0409626688818931), ('city', 1.0479801020544173), ('city_centre', 1.048375005579829), ('small_power_station', 1.0540208079827993), ('urban_population', 1.059661971616745), ('small_medium_sized_industry', 1.0640183790358901), ('urban_facility', 1.070065265477717)]
384 urban_centers
[('urban_facility', 0.5785848112106323), ('urban_area', 0.5967427322859764), ('urban_management', 0.6013938788599967), ('urban_ecosystem', 0.6027440904126167), ('urban_development', 0.6050521122031212), ('urban_landscape', 0.6055228684129714), ('urban_structure', 0.6210815531162023), ('urban_planning', 0.625034490974903), ('urban_traffic', 0.62752366065979), ('urban_mobility', 0.6332855820655823)]
385 settlement_structure
[('institutional_structure', 0.5872074342684745), ('human_settlement', 0.6031702093020678), ('urban_structure', 0.6147821580961942), ('regional_str

[('secondary_education', 0.4755304402351379), ('education', 0.5167872309684753), ('adult_education', 0.5523231591841579), ('primary_education', 0.5543967394616605), ('level_education', 0.571074254945755), ('high_education', 0.5856811586675048), ('general_education', 0.5981558561325073), ('tertiary_sector', 0.6165536642074585), ('environmental_education', 0.6262391209602356), ('education_policy', 0.6287046074867249)]
402 school_education
[('education', 0.4770478308200836), ('school', 0.4770478308200836), ('school_teaching', 0.4778488278388977), ('secondary_education', 0.5126750049233436), ('adult_education', 0.5642715692520142), ('level_education', 0.5718711945177317), ('general_education', 0.5809167165595293), ('education_policy', 0.5842352509498596), ('primary_education', 0.5931854636228084), ('school_life', 0.6062048077583313)]
403 upper_secondary_education
[('secondary_education', 0.3664090543772696), ('primary_education', 0.6601987386890255), ('high_education', 0.7494046024459555),

[('organic_farming', 0.5294704436116814), ('biological_production', 0.5394819886515737), ('manure_production', 0.5440437603589297), ('animal_production', 0.5478509444144368), ('agricultural_production', 0.5531180879983306), ('crop_production', 0.5586764216423035), ('plant_production', 0.5640025734901428), ('bioenergy_production', 0.5643775030710697), ('organic_certification', 0.5754208649568557), ('energy_production', 0.5757956003785729)]


print(len(new_topics_set))
for topic in sorted(new_topics_set):
    print(topic)

In [58]:
gemet_topic_sims = [10]*len(en_topics_lemma)
print(len(en_topics_lemma), len(gemet_topic_sims))
for i in range(50):
   print(gemet_topic_sims[i],en_topics_lemma[i],en_topics[i])

5565 5565
10 administrative_body administrative body
10 accounting accounting
10 animal_life animal life
10 consumer_product consumer product
10 bridge bridge
10 environmental_administration_institution environmental administration institution
10 health_effect_noise health effect of noise
10 human_body human body
10 human_science human science
10 information_transfer information transfer
10 juridical_act juridical act
10 meteorological_research meteorological research
10 natural_area_protection natural areas protection
10 natural_risk_prevention natural risks prevention
10 physical_chemistry physical chemistry
10 physical_measurement_pollution physical measurement of pollution
10 plant_life plant life
10 plant_production plant production
10 pollution_type pollution type
10 pollution_prevention pollution prevention
10 risk_management risk management
10 safety_system safety system
10 seismic_engineering seismic engineering
10 social_science social science
10 surface_water_management surf

In [59]:
# compare all topics with GEMET topics, save the smallest WMD similarities for GEMET topics
for i in tqdm(range(len(topics))):
    #print(i, topics[i])
    for j in range(5529):
        topic1 = topics[i]
        topic2 = en_topics_lemma[j]
        wmd = model.wmdistance(topic1.split("_"), topic2.split("_")) 
        if gemet_topic_sims[j]>wmd:
            gemet_topic_sims[j] = wmd
         

100%|██████████| 422/422 [05:05<00:00,  1.38it/s]


In [62]:
gemet_topic_sims[1]

1.1377433419564962

In [92]:
topic_sim_tuples = []
for i in range(5529):
    topic_sim_tuples.append((i, en_topics[i],en_topics_lemma[i],gemet_topic_sims[i]))
# sort the GEMET topics by smallest WMD similarity 
sorted_topic_sim_tuples = sorted(topic_sim_tuples, key=lambda tup: tup[3], reverse=False)
print(len(topic_sim_tuples), len(sorted_topic_sim_tuples))

# print the topics and similarities (find where is the threshold WMD>1)
for i,(topic_id, topic,topic_lemma,sim) in enumerate(sorted_topic_sim_tuples):
    if sim>1:
        break
    print(i, topic_id, topic,topic_lemma,sim)
print(i)  # 2381 = 2380 was the last item with wmd<1

# export topic lists for all needed languages - sorted by topic relevance
# languages missing: iw - hebrew, mk - macedonian 
lang_list = ['en' , 'sk', 'cs', 'fi', 'nl', 'el', 'it', 'lv', 'pl', 'es']
for lang in lang_list:
    filename = '/home/dzon/kajo/topics/topics_' + lang + '.lst'
    topics_list = dict[lang]
    with open(filename, 'wt', encoding='utf8') as f:
        for j in range(5529): #range(i)
            (topic_id,_,_,_) = sorted_topic_sim_tuples[j]
            topic = topics_list[topic_id]
            if not isinstance(topic, str): # and math.isnan(topic):
                continue
            f.write(topic + "\n")

5529 5529
0 298 local development local_development 0.0
1 327 rural development rural_development 0.0
2 637 water resources management water_resource_management 0.0
3 953 level of education level_education 0.0
4 954 general education general_education 0.0
5 984 labour force labour_force 0.0
6 995 bank (land) bank_land 0.0
7 1301 circular economy circular_economy 0.0
8 1373 environmental regulation environmental_regulation 0.0
9 1405 green economy green_economy 0.0
10 1429 land degradation land_degradation 0.0
11 1494 resource efficiency resource_efficiency 0.0
12 1554 water scarcity water_scarcity 0.0
13 1606 Community law community_law 0.0
14 1745 cultural heritage cultural_heritage 0.0
15 1796 agricultural land agricultural_land 0.0
16 1807 agricultural landscape agricultural_landscape 0.0
17 1878 agricultural production agricultural_production 0.0
18 2025 economic development economic_development 0.0
19 2515 food chain food_chain 0.0
20 2521 food industry food_industry 0.0
21 2527 f

854 3609 natural science natural_science 0.5970689891397952
855 3192 law (individual) law_individual 0.597119223886013
856 5188 urban ecosystem urban_ecosystem 0.5971468091011047
857 4870 stock management stock_management 0.5972434878349304
858 948 aid policy aid_policy 0.5973218461188078
859 1859 development aid development_aid 0.5973221369411945
860 279 financial fund financial_fund 0.5974229574203491
861 20 risk management risk_management 0.5974620276875496
862 486 carbon monoxide carbon_monoxide 0.5976757407188416
863 4691 intervention on land intervention_land 0.5978225344464183
864 3078 animal for slaughter animal_slaughter 0.5979376435279846
865 4653 slaughtering of animals slaughter_animal 0.5979376435279846
866 4096 population distribution population_distribution 0.5979382863578797
867 2910 animal disease animal_disease 0.5979424715042114
868 3156 land occupation land_occupation 0.5980064272880554
869 5416 water (geographic) water_geographic 0.5980765796291828
870 3125 labour 

1691 835 industrial plant (organism) industrial_plant_organism 0.7431775843877484
1692 2958 industrial plant (building) industrial_plant_building 0.743285541177326
1693 1947 agriculture and cattle industry agriculture_cattle_industry 0.7436562485038525
1694 245 supply and demand supply_demand 0.744171272332575
1695 909 public international law public_international_law 0.7450334610849338
1696 1569 organic waste water organic_waste_water 0.7457320802104163
1697 5376 water distribution system water_distribution_system 0.7458324703088234
1698 5136 tropical forest ecosystem tropical_forest_ecosystem 0.746204305263055
1699 3032 intensive animal husbandry intensive_animal_husbandry 0.7462745038289822
1700 240 product life cycle product_life_cycle 0.7477020827402876
1701 964 health care profession health_care_profession 0.7478510460971474
1702 2233 environmental impact assessment environmental_impact_assessment 0.7481783361601185
1703 3687 non-renewable energy resource non_renewable_energy_res