# Explore topic similarities with WMD similarity measure

* Compare keywords extracted from texts with topics using Word Mover's Distance (WMD), in order to find a reasonable WMD threshold for keyword–topic similarity.
* Finding: WMD threshold = 1.0

In [28]:
import spacy
from tqdm import tqdm
from pprint import pprint
import time


import numpy as np
import math

import os

import pytextrank


In [3]:
from gensim.models import Word2Vec, KeyedVectors

# Path of the binary word2vec file whose vectors back every WMD call in this notebook.
slim_model_name = '/home/dzon/kajo/word2vec-slim/w2v-jiri-slim.bin'

# Load the vectors and report the elapsed wall-clock time.
start = time.time()
model = KeyedVectors.load_word2vec_format(slim_model_name, binary=True)
load_seconds = time.time() - start
print('Finished loading model %.4f s' % load_seconds)

# Normalize the vectors in place; per the original author's note this is
# needed for better WMD performance.
start = time.time()
model.init_sims(replace=True)
normalize_seconds = time.time() - start
print('Finished normalizing vectors %.4f s' % normalize_seconds)


Finished loading model 3.2377 s
Finished normalizing vectors 0.0899 s


In [4]:
# Read the topic list file; every line is treated as one topic.
with open('/home/dzon/kajo/semantic-explorer/data/topic_list_v3.txt') as fp:
    topics_file = fp.readlines()

# Build `topics`: stripped, lower-cased and de-duplicated in first-seen order.
# dict.fromkeys keeps insertion order and de-duplicates in linear time, whereas
# a list-membership test per line is quadratic. Lines that are empty after
# stripping are skipped so that an empty string never becomes a topic.
topics = list(dict.fromkeys(
    cleaned
    for cleaned in (line.strip().lower() for line in topics_file)
    if cleaned
))

In [6]:
# WMD for every unordered pair of distinct topics.
# Keys have the form "<topic_a>-<topic_b>", where topic_a comes earlier in `topics`.
topic_sims = {}
topic_count = len(topics)
for i in tqdm(range(topic_count)):
    topic1 = topics[i]
    # Only pair with topics that come later, so each pair is scored once.
    for topic2 in topics[i + 1:]:
        topic_sims["{}-{}".format(topic1, topic2)] = model.wmdistance(
            topic1.split("_"), topic2.split("_")
        )

100%|██████████| 422/422 [00:12<00:00, 33.39it/s] 


In [27]:
# Order topic pairs by ascending WMD, so the closest pairs come first.
sorted_topic_sims = sorted(topic_sims.items(), key=lambda pair: pair[1])

print('Most similar topics:')
pprint(sorted_topic_sims[:10])
print("\nLeast similar topics:")
pprint(sorted_topic_sims[-10:])

# How many pairs fall on each side of the value 1.
above_one = sum(1 for _, dist in sorted_topic_sims if dist > 1)
at_most_one = sum(1 for _, dist in sorted_topic_sims if dist <= 1)
print("\nTopics with sim > 1:")
print(above_one)
print("\nTopics with sim <= 1:")
print(at_most_one)


Most similar topics:
[('illegal_immigrants-illegal_migrants', 0.25969861426860097),
 ('irregular_migrants-irregular_immigrants', 0.25969867834281923),
 ('flemish_region-walloon_region', 0.26490518466144797),
 ('smaller_companies-larger_companies', 0.27452466300517325),
 ('smaller_firms-larger_firms', 0.27452476633280515),
 ('smaller_enterprises-larger_enterprises', 0.27452497996866704),
 ('lower_wages-higher_wages', 0.28893259796476367),
 ('upper_secondary_education-lower_secondary_education', 0.2902137666652251),
 ('permanent_pasture-permanent_pastures', 0.297483636610508),
 ('young_women-young_men', 0.30129122592914104)]

Least similar topics:
[('certification_programme-flash_floods', 1.3530118881440163),
 ('irregular_immigrants-leader_programme', 1.353569445798397),
 ('funding_cuts-riparian_vegetation', 1.3536500371118187),
 ('eu_regulations-soil_degradation', 1.3540491406162978),
 ('eu_rules-riparian_vegetation', 1.3543952122431993),
 ('eu_rules-soil_degradation', 1.354561985544145

In [14]:
# Histogram of topic length: number of "_"-separated words -> number of topics
# with that many words.
words_number = {}
for topic in topics:
    words = len(topic.split("_"))
    # dict.get with a default avoids using a bare `except:` for control flow,
    # which would also hide unrelated errors (including KeyboardInterrupt).
    words_number[words] = words_number.get(words, 0) + 1
pprint(words_number)

{2: 378, 3: 38, 4: 6}


In [29]:
 def get_keywords(text=None, keywords=10, **kwargs):
     """Get keywords from text using the TextRank algorithm implemented in pytextrank.

     Uses the module-level spaCy pipeline ``nlp``; a "textrank" component is
     appended to that pipeline on first use if it is not already registered.

     Args:
         text: The text to analyse. ``None``, an empty string or a
             whitespace-only string yields an empty result.
         keywords: Stop once this many distinct keywords have been collected.
         **kwargs: ``raw_keywords`` (bool, default ``False``) - when true the
             original phrase text is used as the keyword instead of the cleaned,
             lemmatized, "_"-joined form.

     Returns:
         dict mapping keyword -> TextRank rank. When several phrases map to the
         same keyword, the highest rank is kept.
     """
     # Option to get raw keywords without any processing
     raw_keywords = kwargs.get("raw_keywords", False)

     # Nothing to analyse: return early instead of passing None/empty text to spaCy.
     if not text or not text.strip():
         return {}

     if "textrank" not in nlp.pipe_names:
         tr = pytextrank.TextRank()
         nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

     keywords_dict = {}
     for p in nlp(text)._.phrases:
         # For purposes of the topic extraction, clean the keyword phrase:
         # drop punctuation, whitespace, stop words and one-character tokens,
         # and keep the lemma of everything else.
         keyword_text = [
             token.lemma_
             for token in nlp(p.text)
             if not (token.is_punct or token.is_space or token.is_stop or len(token) < 2)
         ]
         # A phrase with no surviving tokens is skipped - in raw mode as well.
         if not keyword_text:
             continue
         keyword = p.text if raw_keywords else "_".join(keyword_text)
         # After lemmatization and token filtering the keyword can already be
         # present; keep the larger rank in that case.
         previous_rank = keywords_dict.get(keyword)
         if previous_rank is None or previous_rank < p.rank:
             keywords_dict[keyword] = p.rank
         # The number of distinct keywords is exactly the size of the dict.
         if len(keywords_dict) == keywords:
             break
     return keywords_dict

In [36]:
nlp = spacy.load("en_core_web_sm")

# Keywords collected over all text files:
#   keywords     - cleaned form returned by get_keywords (lemmas joined with "_")
#   keywords_raw - original phrase text (raw_keywords=True)
keywords = []
keywords_raw = []

for dirpath, dirnames, filenames in os.walk("/home/dzon/kajo/vito-texts-utf8"):
    for filename in filenames:
        if not filename.endswith(".txt"):
            continue
        # Build the path once and reuse it for both the log line and open().
        text_file = os.path.join(dirpath, filename)
        print(text_file)
        with open(text_file, encoding="utf-8", mode="r") as fp:
            text = fp.read()

        kw_dic = get_keywords(text, keywords=1000)
        print(len(kw_dic))
        keywords.extend(kw_dic)  # iterating a dict yields its keys

        kw_raw_dic = get_keywords(text, keywords=1000, raw_keywords=True)
        print(len(kw_raw_dic))
        keywords_raw.extend(kw_raw_dic)
            

/home/dzon/kajo/vito-texts-utf8/agricultural_productivity_full.txt
40
40
/home/dzon/kajo/vito-texts-utf8/report-summaries_self-reliance-agriculture-food-discussion.txt
65
68
/home/dzon/kajo/vito-texts-utf8/organic_agriculture.txt
102
104
/home/dzon/kajo/vito-texts-utf8/report-summaries_loss-and-waste-food-chain.txt
187
188
/home/dzon/kajo/vito-texts-utf8/report-summaries_risk-management-agriculture-and-horticulture.txt
542
561
/home/dzon/kajo/vito-texts-utf8/report-summaries_impact-direct-support-business-income.txt
108
112
/home/dzon/kajo/vito-texts-utf8/report-summaries_innovation-agriculture-and-horticulture.txt
178
183
/home/dzon/kajo/vito-texts-utf8/role_of_water_in_agricultural_development.txt
888
912
/home/dzon/kajo/vito-texts-utf8/report-summaries_production-account-flemish-agriculture-and.txt
70
70
/home/dzon/kajo/vito-texts-utf8/report-summaries_organic-agriculture-2009.txt
119
119
/home/dzon/kajo/vito-texts-utf8/report-summaries_scientific-report-mira-2009-sub-sector.txt
151

In [40]:
# WMD between every topic and every cleaned keyword.
# Keys have the form "<topic>-<keyword>".
topic_kw_sims = {}

# `keywords` is accumulated per file, so the same keyword can occur many times.
# Scoring each distinct keyword once per topic gives the same dictionary
# (repeats would only overwrite a key with an equal value) with fewer WMD calls.
unique_keywords = list(dict.fromkeys(keywords))
# Tokenise once up front instead of inside the inner loop.
keyword_tokens = [kw.split("_") for kw in unique_keywords]

for i in tqdm(range(len(topics))):
    topic1 = topics[i]
    topic1_tokens = topic1.split("_")
    for topic2, topic2_tokens in zip(unique_keywords, keyword_tokens):
        t_key = "{}-{}".format(topic1, topic2)
        topic_kw_sims[t_key] = model.wmdistance(topic1_tokens, topic2_tokens)




  0%|          | 0/422 [00:00<?, ?it/s][A[A[A


  0%|          | 1/422 [00:00<03:11,  2.20it/s][A[A[A


  0%|          | 2/422 [00:00<03:09,  2.22it/s][A[A[A


  1%|          | 3/422 [00:01<03:09,  2.21it/s][A[A[A


  1%|          | 4/422 [00:01<03:09,  2.21it/s][A[A[A


  1%|          | 5/422 [00:02<03:09,  2.20it/s][A[A[A


  1%|▏         | 6/422 [00:02<03:07,  2.22it/s][A[A[A


  2%|▏         | 7/422 [00:03<03:06,  2.22it/s][A[A[A


  2%|▏         | 8/422 [00:03<03:05,  2.23it/s][A[A[A


  2%|▏         | 9/422 [00:04<03:04,  2.24it/s][A[A[A


  2%|▏         | 10/422 [00:04<03:03,  2.25it/s][A[A[A


  3%|▎         | 11/422 [00:04<03:01,  2.26it/s][A[A[A


  3%|▎         | 12/422 [00:05<03:14,  2.11it/s][A[A[A


  3%|▎         | 13/422 [00:05<03:08,  2.16it/s][A[A[A


  3%|▎         | 14/422 [00:06<03:07,  2.18it/s][A[A[A


  4%|▎         | 15/422 [00:06<03:05,  2.20it/s][A[A[A


  4%|▍         | 16/422 [00:07<03:19,  2.03it/s][A[A

 64%|██████▍   | 270/422 [02:04<01:15,  2.02it/s][A[A[A


 64%|██████▍   | 271/422 [02:05<01:17,  1.96it/s][A[A[A


 64%|██████▍   | 272/422 [02:05<01:13,  2.05it/s][A[A[A


 65%|██████▍   | 273/422 [02:06<01:10,  2.12it/s][A[A[A


 65%|██████▍   | 274/422 [02:06<01:08,  2.17it/s][A[A[A


 65%|██████▌   | 275/422 [02:07<01:06,  2.20it/s][A[A[A


 65%|██████▌   | 276/422 [02:07<01:05,  2.23it/s][A[A[A


 66%|██████▌   | 277/422 [02:08<01:04,  2.25it/s][A[A[A


 66%|██████▌   | 278/422 [02:08<01:03,  2.25it/s][A[A[A


 66%|██████▌   | 279/422 [02:08<01:03,  2.27it/s][A[A[A


 66%|██████▋   | 280/422 [02:09<01:02,  2.28it/s][A[A[A


 67%|██████▋   | 281/422 [02:09<01:01,  2.28it/s][A[A[A


 67%|██████▋   | 282/422 [02:10<01:01,  2.29it/s][A[A[A


 67%|██████▋   | 283/422 [02:10<01:09,  1.99it/s][A[A[A


 67%|██████▋   | 284/422 [02:11<01:06,  2.07it/s][A[A[A


 68%|██████▊   | 285/422 [02:11<01:04,  2.14it/s][A[A[A


 68%|██████▊   | 286/422

In [47]:
# Order topic/keyword pairs by ascending WMD.
sorted_topic_sims = sorted(topic_kw_sims.items(), key=lambda pair: pair[1])

print('Most similar topics:')
# Pairs at distance 0 are left out of the "most similar" listing.
tmp_list = [pair for pair in sorted_topic_sims if pair[1] > 0]
pprint(tmp_list[:50])

print("\nLeast similar topics:")
# Infinite distances are dropped so the tail shows finite values only.
tmp_list = [pair for pair in sorted_topic_sims if not math.isinf(pair[1])]
pprint(tmp_list[-50:])

print("\nTopics with sim > 1:")
print(sum(1 for _, dist in sorted_topic_sims if dist > 1 and not math.isinf(dist)))
print("\nTopics with sim <= 1:")
print(sum(1 for _, dist in sorted_topic_sims if 0 < dist <= 1))

print("\nTopics with sim ~ 1:")
# Pairs just above 1: distance in (1, 1.1].
tmp_list = [pair for pair in sorted_topic_sims if 1 < pair[1] <= 1.1]
print(len(tmp_list))
pprint(tmp_list[:50])

Most similar topics:
[('rural_development_programmes-rural_development_programme',
  0.23732740663783877),
 ('flemish_region-walloon_region', 0.26490518466144797),
 ('walloon_region-flemish_region', 0.26490518466144797),
 ('natural_renewable_resources-renewable_natural_resource', 0.2743844392672613),
 ('european_union_law-european_union_legislation', 0.28949777664433396),
 ('extreme_weather_events-extreme_weather_event', 0.29311810665329063),
 ('permanent_pastures-permanent_pasture', 0.297483636610508),
 ('heat_waves-heat_wave', 0.30928613990670445),
 ('water_resource_management-integrate_water_resource_management',
  0.31116352029111627),
 ('permanent_grassland-permanent_pasture', 0.32462382358700037),
 ('agricultural_prices-agricultural_price', 0.3388956938678026),
 ('agricultural_sector-agricultural_horticultural_sector', 0.34673294755021217),
 ('food_supply_chain-food_supply', 0.35268685729629484),
 ('global_climate_change-climate_change', 0.3527671807930535),
 ('utilized_agricultu

In [45]:
# WMD between every topic and every raw (unprocessed) keyword phrase.
# Keys have the form "<topic>-<raw keyword>".
topic_kw_sims_raw = {}

# `keywords_raw` is accumulated per file, so the same phrase can occur many
# times. Scoring each distinct phrase once per topic gives the same dictionary
# (repeats would only overwrite a key with an equal value) with fewer WMD calls.
unique_keywords_raw = list(dict.fromkeys(keywords_raw))
# Raw phrases are space-separated; tokenise once instead of in the inner loop.
keyword_raw_tokens = [kw.split(" ") for kw in unique_keywords_raw]

for i in tqdm(range(len(topics))):
    topic1 = topics[i]
    topic1_tokens = topic1.split("_")
    for topic2, topic2_tokens in zip(unique_keywords_raw, keyword_raw_tokens):
        t_key = "{}-{}".format(topic1, topic2)
        topic_kw_sims_raw[t_key] = model.wmdistance(topic1_tokens, topic2_tokens)




  0%|          | 0/422 [00:00<?, ?it/s][A[A[A


  0%|          | 1/422 [00:00<03:37,  1.94it/s][A[A[A


  0%|          | 2/422 [00:01<03:37,  1.93it/s][A[A[A


  1%|          | 3/422 [00:01<03:36,  1.94it/s][A[A[A


  1%|          | 4/422 [00:02<03:37,  1.92it/s][A[A[A


  1%|          | 5/422 [00:02<03:36,  1.93it/s][A[A[A


  1%|▏         | 6/422 [00:03<03:35,  1.93it/s][A[A[A


  2%|▏         | 7/422 [00:03<03:34,  1.93it/s][A[A[A


  2%|▏         | 8/422 [00:04<03:33,  1.94it/s][A[A[A


  2%|▏         | 9/422 [00:04<03:34,  1.93it/s][A[A[A


  2%|▏         | 10/422 [00:05<03:34,  1.92it/s][A[A[A


  3%|▎         | 11/422 [00:05<03:34,  1.91it/s][A[A[A


  3%|▎         | 12/422 [00:06<03:49,  1.79it/s][A[A[A


  3%|▎         | 13/422 [00:06<03:44,  1.82it/s][A[A[A


  3%|▎         | 14/422 [00:07<03:39,  1.85it/s][A[A[A


  4%|▎         | 15/422 [00:07<03:36,  1.88it/s][A[A[A


  4%|▍         | 16/422 [00:08<03:50,  1.76it/s][A[A

 64%|██████▍   | 270/422 [02:19<01:26,  1.76it/s][A[A[A


 64%|██████▍   | 271/422 [02:20<01:28,  1.70it/s][A[A[A


 64%|██████▍   | 272/422 [02:20<01:24,  1.78it/s][A[A[A


 65%|██████▍   | 273/422 [02:21<01:21,  1.83it/s][A[A[A


 65%|██████▍   | 274/422 [02:21<01:18,  1.88it/s][A[A[A


 65%|██████▌   | 275/422 [02:22<01:16,  1.91it/s][A[A[A


 65%|██████▌   | 276/422 [02:22<01:15,  1.93it/s][A[A[A


 66%|██████▌   | 277/422 [02:23<01:14,  1.95it/s][A[A[A


 66%|██████▌   | 278/422 [02:23<01:13,  1.96it/s][A[A[A


 66%|██████▌   | 279/422 [02:24<01:12,  1.97it/s][A[A[A


 66%|██████▋   | 280/422 [02:24<01:12,  1.96it/s][A[A[A


 67%|██████▋   | 281/422 [02:25<01:11,  1.97it/s][A[A[A


 67%|██████▋   | 282/422 [02:26<01:11,  1.96it/s][A[A[A


 67%|██████▋   | 283/422 [02:26<01:22,  1.69it/s][A[A[A


 67%|██████▋   | 284/422 [02:27<01:18,  1.76it/s][A[A[A


 68%|██████▊   | 285/422 [02:27<01:15,  1.82it/s][A[A[A


 68%|██████▊   | 286/422

In [46]:
# Order topic/raw-keyword pairs by ascending WMD.
sorted_topic_sims = sorted(topic_kw_sims_raw.items(), key=lambda pair: pair[1])

# Keep only pairs with a positive, finite distance for the two listings.
tmp_list = [
    pair for pair in sorted_topic_sims
    if pair[1] > 0 and not math.isinf(pair[1])
]
print('Most similar topics:')
pprint(tmp_list[:50])
print("\nLeast similar topics:")
pprint(tmp_list[-50:])

print("\nTopics with sim > 1:")
print(sum(1 for _, dist in sorted_topic_sims if dist > 1 and not math.isinf(dist)))
print("\nTopics with sim <= 1:")
print(sum(1 for _, dist in sorted_topic_sims if 0 < dist <= 1))

Most similar topics:
[('utilized_agricultural_area-utilised agricultural area', 0.2084471148268918),
 ('smaller_businesses-larger businesses', 0.2745245962847471),
 ('smaller_companies-larger companies', 0.27452466300517325),
 ('fruit_and_vegetable_processing-vegetable and fruit cultivation',
  0.28267063349574806),
 ('permanent_pastures-permanent pasture', 0.297483636610508),
 ('arable_land-irrigated arable land', 0.3089976498496879),
 ('larger_firms-larger companies', 0.31842881084889174),
 ('natural_renewable_resources-some renewable natural resources',
  0.3216540398970706),
 ('brussels_capital_region-the brussels capital region', 0.32418962150214653),
 ('permanent_grassland-permanent pasture', 0.32462382358700037),
 ('food_price-food prices', 0.3388962489104271),
 ('global_climate_change-climate change', 0.3527671807930535),
 ('climate_change-climate changes', 0.3566698082072735),
 ('food_chain-food chains', 0.36261898729968073),
 ('direct_financial_support-direct income support',