# Find most relevant GEMET topics and export topics lists for all languages
* topics in `topic_list_v3.txt` were selected from Jiri's model based on keywords and crucial needs from Partners (Regional library spreadsheet)
* topics from GEMET (English) are compared (word movers distance) with these topics from Partners to see which are most relevant (the idea is to filter the topics, or sort by the relevance / because there are > 5000 topics in GEMET and topic assignment is slow and the extracted topics not always relevant to the text)
* english and corresponding topicf from other languages are exported to text files


In [1]:
import spacy
from tqdm import tqdm
from pprint import pprint
import time


import numpy as np
import math

import os

import pytextrank

import pandas as pd

import re


In [2]:
from gensim.models import Word2Vec, KeyedVectors
slim_model_name = '/home/dzon/kajo/word2vec-slim/w2v-jiri-slim.bin'
start = time.time()
model = KeyedVectors.load_word2vec_format(slim_model_name, binary=True)
print('Finished loading model %.4f s' % ((time.time()-start)))
start = time.time()
#normalize vectors - needed for better performance of WMD
model.init_sims(replace=True)
print('Finished normalizing vectors %.4f s' % ((time.time()-start)))


Finished loading model 3.3576 s
Finished normalizing vectors 0.1974 s


In [3]:
#get topics from file
with open('/home/dzon/kajo/semantic-explorer/data/topic_list_v3.txt') as fp:
    topics_file = fp.readlines()
topics = []
for topic in topics_file:
    topic = topic.strip().lower()
    if topic not in topics:
        topics.append(topic)

In [4]:
# read gemet topics - previously preprocessed into one spreadsheet with all languages
df = pd.read_csv('/home/dzon/kajo/topics/gemet_topics_lang.csv', sep = ";")
df.head()

Unnamed: 0.1,Unnamed: 0,id,link,ar,az,bg,ca,cs,da,de,...,pl,pt,ro,ru,sk,sl,sv,tr,uk,zh-CN
0,0,100,http://www.eionet.europa.eu/gemet/concept/100,هيئة إدارية,inzibati qurum,Административен орган,òrgans administratius,orgán správní,administrativt organ,Verwaltungsbehörde,...,organ administracji,corpos administrativos,organism administrativ,административный орган,správny úrad,upravni organ,myndighet,"idari kurum, kuruluş",адміністративний орган,行政体
1,1,10002,http://www.eionet.europa.eu/gemet/concept/10002,المحاسبة,mühasibat uçotu,Счетоводство,comptabilitat,účetnictví,regnskab,Buchführung,...,księgowość,contabilidade,contabilitate,бухгалтерский учет,účtovná evidencia,računovodstvo,bokföring,muhasebe,бухгалтерський облік,会计学
2,2,10003,http://www.eionet.europa.eu/gemet/concept/10003,حياةالحيوان,heyvanların həyatı,Животински свят,vida animal,život zvířat,dyreliv,Tierleben,...,życie zwierząt,vida animal,viață a animalelor,жизнь животных,,življenje živali,,hayvan yaşamı,життя тварин,动物生命
3,3,10008,http://www.eionet.europa.eu/gemet/concept/10008,منتج استهلاكي,istehlak məhsulu,Потребителски продукт,producte de consum,výrobek spotřební,forbrugsvare,Konsumprodukt,...,produkt konsumpcyjny,produtos de consumo,produs al consumatorului,потребительский продукт,spotrebný výrobok,potrošniški izdelek,konsumentvara,tüketim ürünleri,споживчий продукт,消费品
4,4,1001,http://www.eionet.europa.eu/gemet/concept/1001,جسر,körpü,Мост,pont,most,bro,Brücke,...,most,pontes,pod,мост,most,most,bro,köprü,міст,桥


In [14]:
# export columns by language
df_dict = df.to_dict('list')
sk_topics = df_dict['sk']
# better to take Us english for our model
#en_topics = dict['en']
en_topics = df_dict['en-US']
print(len(sk_topics), len(en_topics))
if math.isnan(sk_topics[2]):
    print("isnan")
sk_topics[2] #.strip().lower()

5565 5565
isnan


nan

In [23]:
# process en_topics / lemmatize
import spacy
nlp = spacy.load("en_core_web_sm")
lemma_lines = []
for i in range(len(en_topics)):
    if not isinstance(en_topics[i], str): # check if string (not missing value)
        lemma_lines.append("")    
        continue
    
    # replace by re substitution - probably 2 lines
    #tmp = tmp.replace("("," ")
    #tmp = tmp.replace(")"," ")
    #tmp = tmp.replace("/"," ")
    #tmp = tmp.replace("-"," ")
    #tmp = tmp.replace(","," ")
    #tmp = tmp.replace(";"," ")
    #tmp = tmp.replace("  "," ")
    #tmp = tmp.replace("  "," ")
    tmp = en_topics[i].strip().lower()
    tmp = re.sub('[\(\)\;\,\-\_/]',' ',tmp)
    tmp = re.sub('\ +',' ',tmp)
    
    if tmp == "":
        lemma_lines.append("")
        continue
    doc = nlp(tmp)
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    tmp = "_".join(lemmas)
    print(i,tmp)
    lemma_lines.append(tmp)
en_topics_lemma = lemma_lines   
len(en_topics_lemma)

0 administrative_body
1 accounting
2 animal_life
3 consumer_product
4 bridge
5 environmental_administration_institution
6 health_effect_noise
7 human_body
8 human_science
9 information_transfer
10 juridical_act
11 meteorological_research
12 natural_area_protection
13 natural_risk_prevention
14 physical_chemistry
15 physical_measurement_pollution
16 plant_life
17 plant_production
18 pollution_type
19 pollution_prevention
20 risk_management
21 safety_system
22 seismic_engineering
23 social_science
24 surface_water_management
25 wildlife_protection
26 historic_center
27 promotion_trade_industry
28 masonry
29 bromine
30 sand_flat
31 animal_specie
32 plant_specie
33 occupation
34 folk_tradition
35 law_branch
36 judicial_system
37 chemical_measurement_pollution
38 measure_instrument
39 pollutant_evolution
40 urban_noise
41 cleanliness_hygiene
42 industrial_environment_general
43 natural_risk
44 natural_risk_analysis
45 major_risk
46 rescue_system
47 crisis_management
48 tax_system
49 concept

438 carbohydrate
439 court
440 carbon
441 lease
442 certification
443 homologation
444 carbonate
445 notification
446 pre_emption
447 prescription
448 repression
449 devolution
450 pollutant_flow
451 ocean_air_interface
452 pollutant_migration
453 incidental_pollution
454 bacteriological_pollution
455 carbon_cycle
456 diffuse_pollution
457 domestic_pollution
458 mineral_pollution
459 organic_pollution
460 photochemical_pollution
461 land_base_marine_pollution
462 toxic_pollution
463 river_disposal
464 underground_disposal
465 urban_pollutant
466 prevention_measure
467 protective_measure
468 strong_acidity
469 biomarker
470 biological_contamination
471 chemical_contamination
472 biological_effect_pollution
473 carbon_dioxide
474 irreversibility_phenomenon
475 quality_objective
476 solid_particle
477 purify_power
478 carbon_dioxide_tax
479 sensor
480 instrumentation
481 metrology
482 observation_satellite
483 atrazine
484 organic_nitrogen
485 halogenated_compound
486 carbon_monoxide
487 

849 convenience_food
850 mining_product
851 root_crop
852 cultivation_system
853 crop_production
854 fishery_structure
855 fishing_ground
856 coal_industry
857 energy_industry
858 communication_industry
859 information_technology_industry
860 vacuum_industry
861 preparation_market
862 precision_engineering
863 material_technology
864 chemical_property
865 military_equipment
866 audiovisual_equipment
867 machinery
868 mechanical_equipment
869 pressure_equipment
870 thermal_equipment
871 industrial_manufacturing
872 size_business
873 business_activity
874 branch_activity
875 administrative_occupation
876 building_service
877 mode_transportation
878 destination_transport
879 degradation_environment
880 community_budget
881 economic_support
882 accounting_system
883 chemical_reaction
884 economic_forecasting
885 customs_tariff
886 commercial_transaction
887 pay_policy
888 european_monetary_system
889 money_market
890 exchange_policy
891 credit_policy
892 public_debt
893 tax_consumption
894

1233 coal_fire_power_plant
1252 coal_gasification
1263 coal_liquefaction
1274 coal_mining
1295 coal_refining
1306 aerodynamic_noise
1317 coal_technology
1328 coast
1339 coastal_area
1360 coastal_development
1371 coastal_ecosystem
1382 coastal_environment
1403 coastal_erosion
1414 coastal_fishing
1445 coastal_pollution
1476 coastal_water
1487 coastal_zone_planning
1498 coastguard
1519 coat
1540 cobalt
1551 cockroach
1573 chemical_oxygen_demand
1574 code_practice
1575 code
1576 coelenterate
1577 cogeneration
1578 co_incineration
1579 coke
1580 cold
1581 cold_zone_ecosystem
1582 coliform_bacterium
1583 colloid
1584 colloidal_state
1585 colonization
1586 aerosol
1587 colorimetry
1588 color
1589 combination_effect
1590 combined_cycle_power_station
1591 combine_waste_water
1592 combustibility
1593 combustion_engine
1594 combustion_gas
1595 combustion_residue
1596 commercialization
1597 commercial_law
1598 commercial_noise
1599 commercial_traffic
1600 commercial_vehicle
1601 trade_waste
1602 

1994 eastern_europe
1995 east_west_relation
1996 east_west_trade
1997 ec_council_ministers
1998 ec_directive
1999 ec_directive_biocide
2000 ec_directive_packaging
2001 ec_directive_waste_disposal
2002 ec_directive_water_protection
2003 ec_ecolabel
2004 echinoderm
2005 aids
2006 ecodevelopment
2007 ecolabele
2008 ecological_abundance
2009 ecological_adaptation
2010 ecological_assessment
2011 ecological_balance
2012 ecological_bookkeeping
2013 air
2014 ecological_factor
2015 ecological_niche
2016 ecological_parameter
2017 ecologically_sensitive_area
2018 ecological_stocktaking
2019 ecologist_movement
2020 ecology
2021 trophic_ecology
2022 economic_activity
2023 economical_ecological_efficiency
2024 economic_analysis
2025 economic_development
2026 economic_growth
2027 economic_instrument
2028 economic_management_instrument
2029 economic_planning
2030 economic_policy
2031 accident
2032 air_conditioning
2033 economics
2034 economic_situation
2035 economic_system
2036 economic_theory
2037 ec

2347 europe
2348 european_commission
2349 european_court_justice
2350 european_environment_agency
2351 european_environmental_council
2352 accident_source
2353 european_nature_reserve
2354 european_parliament
2355 european_union
2356 eutrophication
2357 alga
2358 evaluation
2359 evaluation_criterion
2360 evaluation_method
2361 evaluation_technology
2362 evaporation
2363 evapotranspiration
2364 evolution
2365 exact_science
2366 excavate_hole
2367 excavation_process
2368 algal_bloom
2369 excavation_heap
2370 excavation_site
2371 excessive_height_chimney_stack
2372 executive_order
2373 exhaust_device
2374 exhaust_gas
2375 exist_chemical
2376 exotic_specie
2377 expenditure
2378 algicide
2379 experiment
2380 expert_system
2381 exploration
2382 explosion
2383 explosive
2384 export
2385 export_license
2386 export_hazardous_waste
2387 exposure
2388 expropriation
2389 extensive_cattle_farm
2390 externality
2391 alicyclic_compound
2392 extinction_ecological
2393 extinct_species_iucn
2394 extract

2790 heavy_metal
2791 hedge
2792 herbicide
2793 herbivore
2794 heterocyclic_compound
2795 anaerobic_process
2796 high_education
2797 highland_ecosystem
2798 high_mountain
2799 high_protein_food
2800 high_rise_building
2801 high_speed_railway
2802 high_speed_train
2803 high_tide_water
2804 high_voltage_line
2805 highway
2806 hill
2807 analysis
2808 historical_evolution
2809 historical_monument
2810 historical_research
2811 historical_site
2812 analysis_program
2813 history
2814 holiday_camp
2815 vacation
2816 analytical_chemistry
2817 horse
2818 horticulture
2819 hospital
2820 hospital_waste
2821 hotel_industry
2822 hot_water
2823 analytical_equipment
2824 household
2825 household_chemical
2826 household_good
2827 analytical_method
2828 housing
2829 housing_density
2830 housing_finance
2831 housing_improvement
2832 housing_legislation
2833 housing_need
2834 housing_program
2835 housing_quality_standard
2836 swans_geese_duck
2837 installation_restoration
2838 human_biology
2839 human_dis

3193 law_amendment
3194 anthropologic_reserve
3195 law_enforcement
3196 neighborhood_law
3197 law_science
3198 leaching
3199 lead
3200 lead_compound
3201 lead_contamination
3202 lead_petrol_law
3203 lead_level_blood
3204 antibiotic
3205 leaf
3206 leakage
3207 leather
3208 leather_industry
3209 legal_basis
3210 antibody
3211 legally_protect_right
3212 legal_regulation
3213 legal_remedy
3214 legal_text
3215 legislation
3216 anticipation_danger
3217 legislation_pollution
3218 water_resource_legislation
3219 legislative_authority
3220 legislative_competence
3221 legislative_information
3222 legislature
3223 leisure_activity
3224 leisure_time
3225 lepidopteran
3226 antifoule_agent
3227 leukemia
3228 levy
3229 lexicon
3230 liability
3231 liability_marine_accident
3232 liability_nuclear_damage
3233 liability_legislation
3234 library
3235 permit
3236 permit_procedure
3237 permit_obligation
3238 lichen
3239 acidity_degree
3240 life_cycle
3241 life_science
3242 lifestyle
3243 light
3244 lighting

3640 nitrite
3641 nitro_compound
3642 nitrogen
3643 nitrogen_cycle
3644 nitrogen_dioxide
3645 nitrogen_fixation
3646 nitrogen_monoxide
3647 nitrogen_oxide
3648 nitrosamine
3649 aromatic_hydrocarbon
3650 noel
3651 noise
3652 noise_abatement
3653 noise_analysis
3654 noise_barrier
3655 noise_control
3656 noise_disturbance
3657 noise_emission
3658 noise_emission_levy
3659 noise_free_technology
3660 aromatic_substance
3661 noise_immission
3662 noise_legislation
3663 noise_level
3664 noise_measurement
3665 noise_monitoring
3666 noise_pollutant
3667 noise_pollution
3668 noise_protection
3669 noise_reduction
3670 noise_type
3671 arrangement_deposit_packaging
3672 nomad
3673 nomenclature
3674 non_biodegradable_pollutant
3675 non_build_area
3676 non_conventional_energy
3677 non_ferrous_metal_industry
3678 arsenic
3679 non_ferrous_metal
3680 non_governmental_organization
3681 non_ionize_radiation
3682 non_metallic_mineral
3683 non_metal
3684 non_pollute_energy_source
3685 non_pollute_fuel
3686 ar

4079 pollution_risk
4080 pollution_sink
4081 source_pollution
4082 polybrominate_biphenyl
4083 polychlorinate_biphenyl
4084 polychlordibenzo_p_dioxin
4085 polychlorinate_dibenzofuran
4086 polychlorinate_terphenyl
4087 polycyclic_aromatic_hydrocarbon
4088 polycyclic_hydrocarbon
4089 polyethylene_terephtalate
4090 polymerization
4091 polymer
4092 polyvinyl_chloride
4093 pond
4094 tailings_pond
4095 pool
4096 population_distribution
4097 population_dynamic
4098 population_ecological
4099 population_ecology
4100 population_growth
4101 population_movement
4102 population_structure
4103 population_trend
4104 post_treatment
4105 attribution
4106 potash
4107 rock_salt_mining
4108 poultry
4109 poultry_farming
4110 poverty
4111 power_company
4112 power_heat_relation
4113 power_station
4114 precipitation_chemical
4115 precipitation_enhancement
4116 predator
4117 prefabricated_building
4118 audiovisual_medium
4119 preliminary_proceeding
4120 premium
4121 preservation_evidence
4122 preservative
412

4474 rubber_processing
4475 rubber_processing_industry
4476 rubber_waste
4477 basidiomycete
4478 runoff
4479 rural_area
4480 rural_environment
4481 rural_population
4482 rural_settlement
4483 agritourism
4484 rural_water_supply
4485 batch_process
4486 safety
4487 safety_analysis
4488 safety_measure
4489 safety_rule
4490 safety_standard
4491 safety_standard_building
4492 safety_study
4493 salamander
4494 salination
4495 salmonella
4496 salt_content
4497 salt_load
4498 salt_marsh
4499 salt_meadow
4500 bathing_water
4501 salt_plug
4502 salt
4503 salt_water
4504 salvage
4505 sampling
4506 sampling_technique
4507 sanction
4508 sand
4509 sand_dune_fixation
4510 sand_dune
4511 sand_extraction
4512 sand_pit
4513 sanitary_fitting
4514 sanitary_landfill
4515 sanitation
4516 sanitation_plan
4517 saprobic_index
4518 battery
4519 saprobe
4520 satellite
4521 save
4522 sawdust
4523 battery_disposal
4524 schistosomiasis
4525 school
4526 school_teaching
4527 science
4528 scientific_technical_informatio

4917 sulfuric_acid
4918 sulfur_oxide
4919 biodegradability
4920 supervision_building_work
4921 supervision_installation
4922 supervisory_body
4923 supply_trade
4924 surface_active_agent
4925 surface_runoff
4926 surface_tension
4927 surface_treatment
4928 surface_water
4929 biodegradable_pollutant
4930 surgical_waste
4931 surplus
4932 surveillance
4933 survey
4934 sustainable_development
4935 sustainable_development_indicator
4936 sustainable_use
4937 marsh
4938 sweetener
4939 symbiosis
4940 biodegradation
4941 synecology
4942 synergism
4943 synergistic_effect_toxic_substance
4944 synthetic_detergent
4945 synthetic_fiber_industry
4946 biodiversity
4947 synthetic_material
4948 synthetic_material_industry
4949 synthetic_textile_fiber
4950 system_analysis
4951 system_comparison
4952 system_theory
4953 take_evidence
4954 tanker_truck
4955 tanker_ship
4956 tank_farm
4957 tannin
4958 tar
4959 target_group
4960 adaptation_period
4961 bioethic
4962 tariff
4963 tar_production
4964 tar_sand
4965 

5361 bird_prey
5362 waste_water_sludge
5363 waste_water_statistic
5364 waste_water_treatment
5365 waste_water_treatment_plant
5366 water_analysis
5367 biological_water_balance
5368 bird_specie
5369 water_body
5370 water
5371 water_collection
5372 water_conservation
5373 water_consumption
5374 watercourse
5375 water_demand
5376 water_distribution_system
5377 water_endangering
5378 water_erosion
5379 water_extraction
5380 birth_control
5381 waterfall
5382 water_flea
5383 hydrologic_flow
5384 water_agricultural_use
5385 water_consumption
5386 water_industrial_use
5387 waterfowl
5388 water_hardness
5389 water_hyacinth
5390 water_level
5391 waterlogged_land
5392 water_management
5393 water_monitoring
5394 water_pollutant
5395 water_pollution
5396 water_pollution_prevention
5397 water_protection
5398 water_protection_directive
5399 water_protection_legislation
5400 water_pump
5401 bitumen
5402 water_purification
5403 water_purification_plant
5404 water_quality
5405 water_quality_directive
54

5565

In [25]:
en_topics_lemma = lemma_lines
# in case of 'en' 5528, 'en-US' = 5207
last_normal_topic = len(en_topics_lemma)-37
print(en_topics_lemma[last_normal_topic])
range(last_normal_topic+1)[-1]

brick


5528

In [26]:
# explore topic similarities
new_topics_set = set()
topic_sims = {}
for i in range(len(topics)):
    topics_sims = []
    print(i, topics[i])
    for j in range(last_normal_topic+1):
        topic1 = topics[i]
        topic2 = en_topics_lemma[j]
        wmd = model.wmdistance(topic1.split("_"), topic2.split("_")) 
        topics_sims.append((topic2,wmd))
        #t_key = "{};{}".format(topic1, topic2)
        #topic_sims[t_key] = wmd  
    sorted_topic_sims = sorted(topics_sims,key=lambda kv: kv[1],reverse=False)   
    print(sorted_topic_sims[:10])
    #for (topic2, sim) in sorted_topic_sims:
    #    if sim<1.05:
    #        new_topics_set.add(topic2)
    #    else:
    #        break

0 demographic_ageing
[('demographic_evolution', 0.5651418864553571), ('demographic_development', 0.5942636280530691), ('demography', 0.9354307585099338), ('lifestyle', 1.0469696578809022), ('social_dynamics', 1.05991894551903), ('population_ecological', 1.0705016202278137), ('population_dynamic', 1.075036575906396), ('population_growth', 1.0774826825478077), ('active_population', 1.0791653442633748), ('social_behavior', 1.0795945825647115)]
1 demographic_changes
[('demographic_evolution', 0.5028112935488224), ('demographic_development', 0.5859596126720309), ('climatic_change', 0.8537222306868434), ('technological_change', 0.8802341930730938), ('environmental_change', 0.8959931860097647), ('demography', 0.9971868199794889), ('ecological_adaptation', 1.0238147886081337), ('genetic_modification', 1.0244619534984827), ('genetic_variation', 1.0254593916579486), ('climatic_alteration', 1.030725416010976)]
2 ageing_populations
[('population_dynamic', 0.9656780099158288), ('population_growth',

[('underprivileged_people', 0.5669658184051514), ('young', 0.9744276513626576), ('elderly_person', 0.988328977651), ('disabled_person', 1.0226888780498504), ('adult_education', 1.0975663941190243), ('adult', 1.1009478153381347), ('age', 1.1030134153395295), ('youth', 1.1102039466791154), ('woman', 1.1231409022158385), ('displaced_person', 1.1240477412164211)]
21 older_men
[('young', 1.027404404452622), ('elderly_person', 1.0566089288496971), ('underprivileged_people', 1.072712245041132), ('woman', 1.0780399214160443), ('age', 1.0830686413898467), ('disabled_person', 1.0909688292485475), ('adult', 1.100284648483038), ('woman_status', 1.1026354416930675), ('adult_education', 1.103043147326231), ('youth', 1.1423254492431878)]
22 elderly_women
[('elderly_person', 0.5798571999793053), ('underprivileged_people', 0.9345577915204166), ('disabled_person', 0.9689936356129647), ('woman', 0.9941094939184189), ('young', 1.0165106871024967), ('youth', 1.064160676161349), ('woman_status', 1.080714752

[('wage_system', 0.928711833779633), ('high_education', 1.044142104219675), ('upper_house', 1.0521081040322184), ('low_house', 1.0670895522705317), ('low_flow', 1.0847850312895775), ('high_mountain', 1.0895345118191242), ('low_cost_housing', 1.0924989697027416), ('cost_increase', 1.1001841260430814), ('employment_structure', 1.1002326490049361), ('employment_level_effect', 1.1055783336104261)]
41 higher_wages
[('wage_system', 0.9417216111398935), ('high_education', 1.0087710851433278), ('high_mountain', 1.054163492742777), ('low_house', 1.078469023480296), ('employment_structure', 1.083202907276988), ('employment_level_effect', 1.0849404137830436), ('level_education', 1.0869377785634995), ('employment_environment', 1.0878307261726856), ('low_flow', 1.0961645024993418), ('employment', 1.1014113638015985)]
42 higher_salaries
[('high_education', 1.0139689037454127), ('wage_system', 1.0755757011684774), ('high_mountain', 1.0818393646130562), ('low_house', 1.086255048709452), ('level_educat

[('foreign_policy', 0.7863791640399717), ('foreign_trade', 0.8187277217096006), ('foreign_economic_relation', 0.8236814294330831), ('indigenous_knowledge', 1.0760614031564264), ('teaching', 1.0944052738969305), ('vocational_training', 1.1091970000727245), ('speech', 1.110302400576721), ('initial_training', 1.1160923359877104), ('vocabulary', 1.1261653353867942), ('training', 1.1272226427042085)]
60 professional_skills
[('professional_society', 0.5981422513903379), ('vocational_training', 1.0037844558116198), ('training', 1.0188277780407071), ('environmental_training', 1.062763826153338), ('initial_training', 1.0651394321883918), ('administrative_competence', 1.079637441133976), ('teaching', 1.0811529267821312), ('medical_science', 1.0812850003364682), ('school_teaching', 1.0856673125380278), ('indigenous_knowledge', 1.08999246497339)]
61 technical_skills
[('technical_information', 0.5739817779824137), ('technical_instruction', 0.5760706257061958), ('maintenance_technical', 0.6134478449

[('social_service', 0.4113308566807508), ('social_system', 0.5570790649715662), ('social_participation', 0.575143792956829), ('social_representation', 0.5759313641371727), ('social_protection', 0.5764254653602838), ('social_policy', 0.580567067920208), ('social_security', 0.5869829068790674), ('social_framework', 0.5886365571264028), ('social_cost', 0.5892678374075889), ('social_medicine', 0.5950453123054504)]
79 social_care
[('social_welfare', 0.5430709984481334), ('social_medicine', 0.5530389560585022), ('health_care', 0.5587711227938532), ('social_service', 0.5819820731480718), ('social_psychology', 0.599771803638339), ('social_relief', 0.6071046375002861), ('social_condition', 0.6079724573716521), ('social_problem', 0.6109470794796944), ('social_movement', 0.6120474050421119), ('social_security', 0.614646315574646)]
80 welfare_services
[('social_welfare', 0.623078465461731), ('theory_welfare_state', 0.8187180635501202), ('health_service', 0.853229342289567), ('social_service', 0.92

[('technological_development', 0.5475748181343079), ('technological_change', 0.5556533924454451), ('technological_process', 0.5918451785758138), ('technological_accident', 0.6488670110702515), ('technological_hazard', 0.6669448614120483), ('innovation', 0.9091352863455117), ('technology', 0.9759747297733128), ('scientific_research', 0.9857606223365665), ('new_technology', 1.0089628360560536), ('scientific_ecology', 1.015698104133904)]
97 technological_progress
[('technological_change', 0.5495884345122576), ('technological_development', 0.5601060450685025), ('technological_process', 0.5637345649981499), ('progress_line', 0.6408568024635315), ('technological_accident', 0.6554660797119141), ('technological_hazard', 0.6686695218086243), ('scientific_cooperation', 1.0083242544311286), ('scientific_technical_information', 1.0145597958112385), ('scientific_policy', 1.0147298313133122), ('scientific_committee', 1.0148038220381141)]
98 technological_changes
[('technological_change', 0.356669903

[('novel_food', 1.0171252480436563), ('new_technology', 1.0414332843965293), ('alternative_technology', 1.0421040393390657), ('approach', 1.0496113445073367), ('sustainable_use', 1.054556982139349), ('environment_friendly', 1.0563531510375141), ('new_material', 1.0580998550055027), ('alternative_material', 1.0587701498813629), ('technology', 1.078372933872223), ('environmentally_friendly_product', 1.0830702043695695)]
115 new_solutions
[('new_technology', 0.5624218860393763), ('new_material', 0.5790883392069339), ('new_installation', 0.6192068457603455), ('new_town', 0.6408774852752686), ('new_community', 0.6530278921127319), ('novel_food', 1.0343230573537947), ('alternative_technology', 1.1055019216746091), ('mathematical_method', 1.1088418547564147), ('biochemical_method', 1.1105088319131733), ('teaching_method', 1.1112548683929444)]
116 new_approaches
[('new_technology', 0.5619236432065964), ('new_material', 0.6117537021636963), ('new_community', 0.6346823573112488), ('new_installat

[('man_climate_change', 0.4209816871170201), ('climatic_change', 0.4400683474617004), ('environmental_change', 0.4944146500854492), ('climate', 0.5294262766838074), ('technological_change', 0.5623146735993624), ('climate_resource', 0.5723705291748047), ('climate_type', 0.593866939296782), ('tropical_climate', 0.6128417253494263), ('desert_climate', 0.6180621981620789), ('mediterranean_climate', 0.6270692944526672)]
134 global_warming
[('global_warming', 0.0), ('global_model', 0.6187983751296997), ('global_convention', 0.6209043316904306), ('global_aspect', 0.6655065417289734), ('global_environment_facility', 0.8231949300114173), ('climate', 0.9790083295387029), ('continental_climate', 0.9797388146594167), ('climate_resource', 1.0207360221989155), ('permafrost_ecosystem', 1.0358977407753467), ('temperate_climate', 1.0382540885778666)]
135 climatic_changes
[('climatic_change', 0.35666981017929317), ('climatic_alteration', 0.5336723160942793), ('climatic_effect', 0.5552777801668644), ('cl

[('economic_situation', 0.7810037826808497), ('environmental_economic_valuation', 0.8096032684813443), ('economic_growth', 0.8196569587867727), ('economic_development', 0.8197883763408101), ('economic_analysis', 0.82179896673849), ('economic_structure', 0.8229793852886952), ('economic_concentration', 0.825018318182469), ('economic_planning', 0.8258300242029089), ('economic_system', 0.8259799015580089), ('economic_plan', 0.8261628585601378)]
153 small_farms
[('small_scale_furnace', 0.8116338035374009), ('small_scale_inducer', 0.8250472622044808), ('small_power_station', 0.8252089811629008), ('small_medium_sized_industry', 0.8602366231405735), ('small_island_political_geography', 0.934575239823997), ('organic_farming', 0.9931635781136156), ('dairy_farm', 0.9972416419448853), ('mixed_farming', 1.0139358433285952), ('farm_price', 1.0159433900909425), ('fish_farming', 1.0159488489425184)]
154 family_farms
[('family', 0.5942357182502747), ('family_planning', 0.6329235434532166), ('family_law

[('large_combustion_plant', 0.8347182950454245), ('company_structure', 0.9533909173672199), ('electricity_company', 0.9889710750515461), ('small_medium_sized_industry', 0.9983828791752457), ('power_company', 0.998838386158228), ('gas_company', 0.9998240713688135), ('company_policy', 1.004479431687951), ('multinational_firm', 1.0141107103395461), ('size_business', 1.0418711686567663), ('small_scale_furnace', 1.0464616792509736)]
172 larger_firms
[('multinational_firm', 1.0149836574344635), ('size_business', 1.0277768140309453), ('close_firm', 1.037311461675644), ('firm', 1.0684168403377534), ('company_structure', 1.071258314877212), ('small_medium_sized_industry', 1.0736982704845668), ('electricity_company', 1.0793537041820882), ('power_company', 1.0966275289092062), ('small_scale_furnace', 1.09967465690817), ('company_policy', 1.1069928823122381)]
173 large_enterprises
[('large_combustion_plant', 0.8457428598839333), ('small_medium_sized_industry', 0.9849897199342549), ('size_business'

[('soil_quality', 0.5470829648015499), ('environmental_quality', 0.5596662429406047), ('landscape_management', 0.5609369134803415), ('landscape', 0.5762786269187927), ('landscape_conservation', 0.5763309597969055), ('landscape_protection', 0.5818205249261856), ('landscape_utilization', 0.5865119623974562), ('landscape_component', 0.5867162276860476), ('quality_life', 0.5905134677886963), ('landscape_consumption', 0.5970097721099854)]
191 landscape_dynamics
[('landscape_ecology', 0.49237766765266655), ('landscape', 0.5479965806007385), ('landscape_management', 0.5594359708816409), ('landscape_planning', 0.573382556438446), ('landscape_architecture', 0.5802281498908997), ('social_dynamics', 0.5878992244809866), ('urban_landscape', 0.5993729829788208), ('landscape_component', 0.6011754958108664), ('landscape_utilization', 0.6044285483038426), ('landscape_conservation', 0.6044299602508545)]
192 landscape_maintenance
[('maintenance_environment', 0.5163994381370545), ('landscape_alteration',

[('water_resource', 0.4115767784520984), ('water_management', 0.5407596400920153), ('water_utilization', 0.5454708949625492), ('water_consumption', 0.5614906753480435), ('water_consumption', 0.5614906753480435), ('water_conservation', 0.5624796784161329), ('water_transportation', 0.5667500491663814), ('water_demand', 0.5767502805026173), ('water_saving', 0.5771414007678628), ('raw_water', 0.5779290512546897)]
211 social_supports
[('social_participation', 0.5944922655103207), ('social_development', 0.5946358399652243), ('social_structure', 0.5994030200961828), ('social_process', 0.6039713559599519), ('social_policy', 0.6157679668143988), ('social_protection', 0.6199481434217692), ('social_framework', 0.6226576347941757), ('social_service', 0.6230438881949782), ('social_cost', 0.6243182379221917), ('social_security', 0.6273968623763919)]
212 credit_financing
[('credit_assistance', 0.5100148746408224), ('credit_policy', 0.5445308796200753), ('credit', 0.5916622281074524), ('financing', 0.

[('land_use', 0.5534197077100277), ('land_access', 0.5700094083631039), ('land_transportation', 0.571286234227717), ('land_value', 0.58960012413311), ('land_form', 0.6021522835371493), ('land_cover', 0.6026756278728247), ('land_disposal', 0.605202199575305), ('land_development', 0.6068885967775584), ('land_clearing', 0.6071408787879348), ('land', 0.6084277033805847)]
230 established_community
[('new_community', 0.6029437184333801), ('community_law', 0.6067215628453493), ('community_participation', 0.6067336201667786), ('plant_community', 0.6142798066139221), ('community_act', 0.6153002701663971), ('community_ruling', 0.617154061794281), ('european_community', 0.622243344783783), ('community_facility', 0.6256797909736633), ('community_finance', 0.6268004310225248), ('ecological_community', 0.6310429573059082)]
231 rural_ireland
[('rural_area', 0.5922867811657787), ('rural_law', 0.5970358090258836), ('rural_development', 0.6173962950706482), ('rural_environment', 0.621128499507904), ('ru

[('economy', 0.5119001269340515), ('dual_economy', 0.5902918089270591), ('market_economy', 0.6324763298034668), ('yield_economy', 0.6422559983333945), ('national_economy', 0.6469451785087585), ('circular_mail', 0.6708628429886698), ('environmental_economy_issue', 0.8150912537278711), ('society', 1.047874958642304), ('culture_society', 1.0716633734394907), ('industrial_society', 1.078303166835308)]
250 green_economy
[('green_revolution', 0.5724263700045347), ('urban_green', 0.5776232668476105), ('economy', 0.5923357009887695), ('dual_economy', 0.5960349874675274), ('green_building', 0.5967165621269345), ('yield_economy', 0.6011863193553687), ('green_space', 0.6201781034469604), ('market_economy', 0.623924195766449), ('green_vegetable', 0.63149493932724), ('green_manure', 0.6373409628868103)]
251 eu_regulations
[('eu_council', 0.5856350038595796), ('ec_regulation', 0.9843606994309425), ('ec_directive', 1.0154702629016639), ('ec_policy', 1.0553730340805054), ('ec_treaty', 1.05945633686602

[('rural_development', 0.40609158499507175), ('plan_urban_development', 0.660654986616423), ('urban_development_law', 0.6763495749578322), ('urban_development_document', 0.6835323356766041), ('urban_planning_development', 0.6911891866881522), ('urban_development', 0.6957847905496453), ('rural_management_planning', 0.7402775282820616), ('regional_development', 0.7536327208138461), ('coastal_development', 0.7536626446032068), ('rural_environment', 0.7667491303837702)]
269 water_management
[('water_management', 0.0), ('water_resource_management', 0.37995938713631067), ('surface_water_management', 0.3848603579263376), ('water_quality_management', 0.3966168965693479), ('municipal_water_management', 0.40232626678713096), ('water_quantity_management', 0.41036322133040004), ('water_monitoring', 0.4764947895002961), ('water_utilization', 0.5301401739108562), ('water_resource', 0.5398346746462583), ('energy_management', 0.5491542320255041)]
270 water_resource_management
[('water_resource_managem

[('economic_sector', 0.5489254237221479), ('agricultural_policy', 0.5502016577403546), ('agricultural_production', 0.5591438670252562), ('agricultural_landscape', 0.5673224777590037), ('agricultural_structure', 0.5695092879054546), ('agricultural_management', 0.5792486483848095), ('agricultural_equipment', 0.5849890242643953), ('agricultural_legislation', 0.585408096718192), ('agricultural_biotechnology', 0.5866767848511935), ('agricultural_storage', 0.5882992671296001)]
288 population_change
[('composition_population', 0.5633103350687028), ('population_structure', 0.5660041617254019), ('population_growth', 0.5661616296719313), ('population_dynamic', 0.5803406184604168), ('population_trend', 0.5816953426988125), ('population_movement', 0.5834608836903572), ('population_distribution', 0.5979382863578797), ('population_ecology', 0.5989587036463022), ('climatic_change', 0.6037252333928347), ('human_population', 0.6102614029194117)]
289 local_food_production
[('food_production_agriculture'

[('high_education', 0.53053352419281), ('cultural_good', 0.5668532452454567), ('education_policy', 0.5835916470404863), ('environmental_education', 0.5897364745355844), ('general_education', 0.5935831159186363), ('good_service', 0.6015310364166497), ('level_education', 0.6035049623283147), ('good_management', 0.6084533148096799), ('consumer_good', 0.6172644249862432), ('primary_education', 0.6185324564259052)]
307 new_jobs
[('new_town', 0.6226572409980893), ('new_technology', 0.6251225471496582), ('new_installation', 0.6303911805152893), ('new_community', 0.643151581287384), ('new_material', 0.6488609313964844), ('employment_structure', 1.0201353941321374), ('employment_environment', 1.0208016480249167), ('employment', 1.0396430848556757), ('novel_food', 1.0623624955462216), ('termination_employment', 1.0726470823975205)]
308 additional_jobs
[('additional_packaging', 0.6580209732055664), ('employment_structure', 1.0167861290154456), ('termination_employment', 1.0427977432471514), ('emp

[('arctic_region', 0.6348558876806497), ('antarctic_region', 0.636036974132657), ('economic_region', 0.6385371123754978), ('region', 0.6446978449821472), ('biogeographical_region', 0.6572789549827576), ('polar_region', 0.6703232600419522), ('regional_convention', 1.063078745003581), ('regional_regulation', 1.0937047466444372), ('caribbean_area', 1.107434190745175), ('continent', 1.1217154105668068)]
326 regional_landscape
[('regional_structure', 0.5239680349418521), ('urban_landscape', 0.5388189861046672), ('landscape', 0.5590024590492249), ('regional_development', 0.56338072432822), ('landscape_management', 0.5648419581676125), ('regional_planning', 0.567135837412715), ('landscape_planning', 0.5702213048934937), ('agricultural_landscape', 0.5951353443759084), ('landscape_component', 0.6082831214580536), ('landscape_ecology', 0.6108423471450806)]
327 land_use_changes
[('land_use_classification', 0.40728944986618115), ('land_use', 0.4170625819523191), ('land_use_planning', 0.41717480402

[('soil_degradation', 0.0), ('degradation', 0.49801698327064514), ('soil', 0.49801698327064514), ('soil_leaching', 0.506308913230896), ('soil_damage', 0.5154146143664121), ('soil_compaction', 0.518385648727417), ('soil_erosion', 0.5249780416488647), ('freshwater_degradation', 0.5278134160132408), ('soil_decontamination', 0.530745891838789), ('ecosystem_degradation', 0.5313813868242502)]
346 soil_erosion
[('soil_erosion', 0.0), ('soil_salination', 0.46386854522562027), ('soil_subsidence', 0.46491990479081874), ('soil_compaction', 0.46660406637370583), ('erosion', 0.4945920407772064), ('soil', 0.4945920407772064), ('soil_leaching', 0.5068541765213013), ('soil_pollution', 0.510433892264843), ('soil_degradation', 0.5249780416488647), ('water_erosion', 0.5287915956485272)]
347 water_erosion
[('water_erosion', 0.0), ('water_salination', 0.46386845675230026), ('soil_water', 0.4945922180509567), ('water_pollution', 0.5104342341205478), ('drainage_water', 0.5235757184010744), ('soil_erosion', 0

[('automobile_industry', 0.0), ('petrochemical_industry', 0.5260274176226855), ('clothing_industry', 0.5315477230944037), ('steel_industry', 0.54614046872437), ('furniture_industry', 0.560764633578062), ('hotel_industry', 0.5615539945982099), ('industry', 0.561700701713562), ('extractive_industry', 0.5648185171108245), ('electrotechnical_industry', 0.569826281054616), ('pharmaceutical_industry', 0.5705749496426582)]
366 motor_industry
[('motor_vehicle_industry', 0.37405429115616207), ('electrical_industry', 0.5312045938866734), ('automobile_industry', 0.5312408162103295), ('craft_industry', 0.5651402198369503), ('mechanical_industry', 0.565703624426961), ('steel_industry', 0.5702501398984193), ('energy_industry', 0.5783311292629838), ('mineral_industry', 0.5784226287978887), ('refrigeration_industry', 0.5928035079525709), ('textile_industry', 0.593102394701302)]
367 historical_heritage
[('historical_monument', 0.4725358258378506), ('cultural_heritage', 0.48042788211107257), ('biologica

[('urban_facility', 0.5785848112106323), ('urban_area', 0.5967427322859764), ('urban_management', 0.6013938788599967), ('urban_ecosystem', 0.6027440904126167), ('urban_development', 0.6050521122031212), ('urban_landscape', 0.6055228684129714), ('urban_structure', 0.6210815531162023), ('urban_planning', 0.625034490974903), ('urban_traffic', 0.62752366065979), ('urban_ecology', 0.6341652274131775)]
385 settlement_structure
[('institutional_structure', 0.5872074342684745), ('human_settlement', 0.6031702093020678), ('urban_structure', 0.6147821580961942), ('regional_structure', 0.6194179058074951), ('agricultural_structure', 0.6219119429588318), ('social_structure', 0.6224808692932129), ('employment_structure', 0.6286013004487753), ('settlement_concentration', 0.6297576248635054), ('transitional_settlement', 0.6314802169799805), ('company_structure', 0.6321804523468018)]
386 cultural_landscape
[('cultural_heritage', 0.48243452039033174), ('landscape', 0.5182603001594543), ('cultural_develo

[('education', 0.4770478308200836), ('school', 0.4770478308200836), ('school_teaching', 0.4778488278388977), ('secondary_education', 0.5126750049233436), ('adult_education', 0.5642715692520142), ('level_education', 0.5718711945177317), ('general_education', 0.5809167165595293), ('education_policy', 0.5842352509498596), ('primary_education', 0.5931854636228084), ('school_life', 0.6062048077583313)]
403 upper_secondary_education
[('secondary_education', 0.3664090543772696), ('primary_education', 0.6601987386890255), ('high_education', 0.7494046024459555), ('adult_education', 0.750608064211884), ('level_education', 0.7593047259976663), ('education', 0.7686618679392883), ('general_education', 0.772345557205284), ('secondary_sector', 0.7911212720581652), ('upper_house', 0.7990514456424268), ('environmental_education', 0.8020297315122673)]
404 secondary_education
[('secondary_education', 0.0), ('primary_education', 0.4109256243993044), ('level_education', 0.5498995291649104), ('general_educa

print(len(new_topics_set))
for topic in sorted(new_topics_set):
    print(topic)

In [27]:
gemet_topic_sims = [10]*len(en_topics_lemma)
print(len(en_topics_lemma), len(gemet_topic_sims))
for i in range(50):
   print(gemet_topic_sims[i],en_topics_lemma[i],en_topics[i])

5565 5565
10 administrative_body administrative body
10 accounting accounting
10 animal_life animal life
10 consumer_product consumer product
10 bridge bridge
10 environmental_administration_institution environmental administration institution
10 health_effect_noise health effect of noise
10 human_body human body
10 human_science human science
10 information_transfer information transfer
10 juridical_act juridical act
10 meteorological_research meteorological research
10 natural_area_protection natural areas protection
10 natural_risk_prevention natural risks prevention
10 physical_chemistry physical chemistry
10 physical_measurement_pollution physical measurement of pollution
10 plant_life plant life
10 plant_production plant production
10 pollution_type pollution type
10 pollution_prevention pollution prevention
10 risk_management risk management
10 safety_system safety system
10 seismic_engineering seismic engineering
10 social_science social science
10 surface_water_management surf

In [28]:
# compare all topics with GEMET topics, save the smallest WMD similarities for GEMET topics
for i in tqdm(range(len(topics))):
    #print(i, topics[i])
    for j in range(last_normal_topic+1):
        topic1 = topics[i]
        topic2 = en_topics_lemma[j]
        wmd = model.wmdistance(topic1.split("_"), topic2.split("_")) 
        if gemet_topic_sims[j]>wmd:
            gemet_topic_sims[j] = wmd
         

100%|██████████| 422/422 [04:42<00:00,  1.49it/s]


In [29]:
gemet_topic_sims[1]

1.1377433419564962

In [30]:
topic_sim_tuples = []
for i in range(last_normal_topic+1):
    topic_sim_tuples.append((i, en_topics[i],en_topics_lemma[i],gemet_topic_sims[i]))
# sort the GEMET topics by smallest WMD similarity 
sorted_topic_sim_tuples = sorted(topic_sim_tuples, key=lambda tup: tup[3], reverse=False)
print(len(topic_sim_tuples), len(sorted_topic_sim_tuples))

# print the topics and similarities (find where is the threshold WMD>1)
for i,(topic_id, topic,topic_lemma,sim) in enumerate(sorted_topic_sim_tuples):
    if sim>1:
        break
    print(i, topic_id, topic,topic_lemma,sim)
print(i)  # 2381 = 2380 was the last item with wmd<1 for 'en', 2171/2170 for 'en-US'

# export topic lists for all needed languages - sorted by topic relevance
# languages missing: iw - hebrew, mk - macedonian 
lang_list = ['en' , 'en-US', 'sk', 'cs', 'fi', 'nl', 'el', 'it', 'lv', 'pl', 'es']
for lang in lang_list:
    filename = '/home/dzon/kajo/topics/topics_' + lang + '.lst'
    topics_list = df_dict[lang]
    with open(filename, 'wt', encoding='utf8') as f:
        for j in range(last_normal_topic+1): #range(i)
            (topic_id,_,_,_) = sorted_topic_sim_tuples[j]
            topic = topics_list[topic_id]
            if not isinstance(topic, str): # and math.isnan(topic):
                continue
            f.write(topic + "\n")

5529 5529
0 298 local development local_development 0.0
1 327 rural development rural_development 0.0
2 637 water resources management water_resource_management 0.0
3 953 level of education level_education 0.0
4 954 general education general_education 0.0
5 995 bank (land) bank_land 0.0
6 1606 Community law community_law 0.0
7 1745 cultural heritage cultural_heritage 0.0
8 1796 agricultural land agricultural_land 0.0
9 1807 agricultural landscape agricultural_landscape 0.0
10 1878 agricultural production agricultural_production 0.0
11 2025 economic development economic_development 0.0
12 2515 food chain food_chain 0.0
13 2521 food industry food_industry 0.0
14 2527 food quality food_quality 0.0
15 2685 global warming global_warming 0.0
16 2716 greenhouse effect greenhouse_effect 0.0
17 2941 industrial development industrial_development 0.0
18 3126 labor market labor_market 0.0
19 3136 land access land_access 0.0
20 3143 land consolidation land_consolidation 0.0
21 3157 land planning la

851 804 Community finance community_finance 0.6012246420243978
852 1658 age age 0.6013782024383545
853 2166 energy policy energy_policy 0.6013882756233215
854 641 reclamation industry reclamation_industry 0.6016485767589808
855 5379 water extraction water_extraction 0.6016830886435509
856 4844 staple food staple_food 0.6017045974731445
857 719 industrial safety industrial_safety 0.6018241795063018
858 3144 land cover land_cover 0.6019506305045486
859 274 national accounting national_accounting 0.6019906582111717
860 2189 environmental accounting environmental_accounting 0.6019908141171931
861 1091 climatic factor climatic_factor 0.6020120818653703
862 3805 organic solvent organic_solvent 0.6020165085792542
863 1201 business economics business_economic 0.6020169258117676
864 356 mountain management mountain_management 0.6020304469752312
865 3617 nature reserve nature_reserve 0.6020489419555664
866 4388 residential area residential_area 0.6020920788061619
867 758 environmental study envi

1742 90 central park area central_park_area 0.7930933497706184
1743 4771 solar energy technology solar_energy_technology 0.7932795782428115
1744 1163 digital image processing technique digital_image_processing_technique 0.7933936628448963
1745 5434 weather condition weather_condition 0.7937802428515648
1746 280 European Monetary Fund european_monetary_fund 0.7939195020086592
1747 3045 internal European market internal_european_market 0.79392502975227
1748 4925 surface runoff surface_runoff 0.7939945112629258
1749 15 physical measurement of pollution physical_measurement_pollution 0.7940193663958518
1750 430 criminal law procedure criminal_law_procedure 0.7942137630005749
1751 4556 sea water protection sea_water_protection 0.7942528812013556
1752 5403 water purification plant water_purification_plant 0.7946569934446451
1753 420 public waterways domain public_waterway_domain 0.794732101308836
1754 888 European Monetary System european_monetary_system 0.7949217790215178
1755 5436 weather 

In [31]:
sorted_topic_sim_tuples[:10]

[(298, 'local development', 'local_development', 0.0),
 (327, 'rural development', 'rural_development', 0.0),
 (637, 'water resources management', 'water_resource_management', 0.0),
 (953, 'level of education', 'level_education', 0.0),
 (954, 'general education', 'general_education', 0.0),
 (995, 'bank (land)', 'bank_land', 0.0),
 (1606, 'Community law', 'community_law', 0.0),
 (1745, 'cultural heritage', 'cultural_heritage', 0.0),
 (1796, 'agricultural land', 'agricultural_land', 0.0),
 (1807, 'agricultural landscape', 'agricultural_landscape', 0.0)]