# Get a list of topics based on the nearest neighbours (depth=5) of topics from a manually generated list

We generate a list of ~26000 topics similar to the seed topics (a manually generated list containing ~300 topics).

This list is then used to filter down the word2vec model.

In [2]:
from igraph import Graph, plot
from gensim.models import Word2Vec
import logging

# Configure logging BEFORE loading the model: basicConfig must run first,
# otherwise the progress messages logged while loading are dropped on a fresh kernel.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

# Trained word2vec model used by the topic-expansion code below.
# NOTE: the absolute path is specific to the machine the notebook was run on.
model = Word2Vec.load("/media/dzon/Data/Jiri-models/wv-sg1hs0.bin")

2019-11-29 12:21:29,269 : INFO : loading Word2Vec object from /media/dzon/Data/Jiri-models/wv-sg1hs0.bin
2019-11-29 12:21:29,270 : DEBUG : {'transport_params': None, 'ignore_ext': False, 'opener': None, 'closefd': True, 'newline': None, 'errors': None, 'encoding': None, 'buffering': -1, 'mode': 'rb', 'uri': '/media/dzon/Data/Jiri-models/wv-sg1hs0.bin'}
2019-11-29 12:21:34,770 : INFO : loading wv recursively from /media/dzon/Data/Jiri-models/wv-sg1hs0.bin.wv.* with mmap=None
2019-11-29 12:21:34,771 : INFO : loading vectors from /media/dzon/Data/Jiri-models/wv-sg1hs0.bin.wv.vectors.npy with mmap=None
2019-11-29 12:21:57,077 : INFO : setting ignored attribute vectors_norm to None
2019-11-29 12:21:57,078 : INFO : loading vocabulary recursively from /media/dzon/Data/Jiri-models/wv-sg1hs0.bin.vocabulary.* with mmap=None
2019-11-29 12:21:57,079 : INFO : loading trainables recursively from /media/dzon/Data/Jiri-models/wv-sg1hs0.bin.trainables.* with mmap=None
2019-11-29 12:21:57,080 : INFO : l

In [3]:
def __addsubtopic(dictionary, topic, topn=10, depth=1, threshold=0.8, distance=0, w2v_model=None):
    """
    private recursive method to add subtopics to dictionary using similarities in word2vec space

    parameters:
        dictionary - dictionary filled in place; key=topic,
            value={'distance': distance, 'subtopics': list of (subtopic, similarity)}
        topic - string, name of topic (e.g. "rural_areas")
        topn - how many similar topics are requested from the model for every topic
        depth - how deep to dive (1=find only similarities for one topic, 2=look for similarities of 2nd level topics, ... )
            - maximum is 3; larger values are silently capped at 3
        threshold - minimal similarity; only subtopics with similarity strictly greater than it are included
        distance - distance from the first topic (used in recursion)
        w2v_model - optional model object exposing wv.most_similar(positive=[...], topn=...);
            when None the notebook-global `model` is used

    returns: the number of subtopics found and added to the dictionary
    (it's a bit higher than the final number of vertices in the final graph because some repeat)

    raises: whatever the model lookup raises for an unknown topic (gensim is expected to raise
    KeyError for out-of-vocabulary words); in that case the dictionary is left untouched
    """
    # maximum depth is 3
    mydepth = min(depth, 3)
    if w2v_model is None:
        # fall back to the model loaded at the top of the notebook
        w2v_model = model

    # a topic that already is a key has been expanded before - nothing new to add
    if topic in dictionary:
        logging.info('topic "{}" already found in dict (distance: {})'.format(topic, dictionary[topic]['distance']))
        return 0

    # Query the model BEFORE touching the dictionary: if the lookup fails, no half-built
    # entry (one without the 'subtopics' key) is left behind to break graph construction.
    # `topic` is a word, so the word-based query is used (topn most similar words with
    # their similarities).
    similar = w2v_model.wv.most_similar(positive=[topic], topn=topn)

    logging.debug('adding topic "{}"'.format(topic))
    accepted = []
    # the entry must exist before recursing, it is what stops cycles (a -> b -> a)
    dictionary[topic] = {'distance': distance, 'subtopics': accepted}

    topics_found = 0
    for (subtopic, similarity) in similar:
        if similarity > threshold:
            logging.debug('adding subtopic "{}" to topic "{}" with similarity {}'.format(subtopic, topic, similarity))
            accepted.append((subtopic, similarity))
            topics_found += 1
            # descend only while there is depth budget left
            if mydepth > 1:
                logging.debug('recursively calling addsubtopic on topic "{}" (mydepth=={})'.format(subtopic, mydepth))
                topics_found += __addsubtopic(dictionary, topic=subtopic, topn=topn, depth=mydepth-1,
                                              threshold=threshold, distance=distance+1,
                                              w2v_model=w2v_model)
        else:
            # do nothing if the similarity of the subtopic is not above the threshold
            logging.debug('similarity of "{}" with subtopic "{}" is only {} ({} required)'.format(topic, subtopic, similarity, threshold))
    return topics_found


In [4]:
def construct_graph_description(topics):
    """
    constructs a plain-dictionary graph description (vertices + edges + labels)

    parameter: topics dictionary (created by addsubtopic function);
        key=topic, value={'distance': int, 'subtopics': list of (subtopic, similarity)}

    returns: dictionary with four parallel-indexed lists
        'vertices'      - vertex names, the list position is the vertex number
        'distance'      - distance of every vertex from its first topic
        'edges'         - (source vertex number, target vertex number), each undirected pair only once
        'edge_distance' - for every edge the similarity of its two topics formatted as "0.00"
    """
    # "vertex name" -> "vertex number"; also used for O(1) "already seen" tests
    vertices = {}
    # vertex names / distances, indexed by vertex number
    vertices_list = []
    vertices_list_distance = []

    for key, entry in topics.items():
        # a key may already have been registered as a subtopic of an earlier key
        if key not in vertices:
            vertices[key] = len(vertices_list)
            vertices_list.append(key)
            vertices_list_distance.append(entry['distance'])
        for name, _similarity in entry['subtopics']:
            if name in vertices:
                continue
            vertices[name] = len(vertices_list)
            vertices_list.append(name)
            # a leaf of the expansion is not a key of the topics dictionary,
            # it then lies one step further than the topic that lists it
            known = topics.get(name)
            if known is not None:
                vertices_list_distance.append(known['distance'])
            else:
                vertices_list_distance.append(entry['distance'] + 1)

    edges = []
    edge_distance = []
    # undirected pairs already emitted, stored as (smaller number, larger number)
    seen_pairs = set()
    for key, entry in topics.items():
        source = vertices[key]
        for i, (name, similarity) in enumerate(entry['subtopics']):
            target = vertices[name]
            logging.debug('adding edges (i=={}) key: {} ({}), subtopic: {} ({}) '.format(i, key, source, name, target))
            pair = (source, target) if source <= target else (target, source)
            # skip an edge that already exists in either direction
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            edges.append((source, target))
            edge_distance.append("{:.2f}".format(similarity))

    return {
        'vertices': vertices_list,
        'distance': vertices_list_distance,
        'edges': edges,
        'edge_distance': edge_distance,
    }

In [5]:
def construct_graph(topics):
    """
    constructs igraph graph

    parameter: topics dictionary (created by addsubtopic function)

    returns: g - undirected igraph Graph; every vertex carries the attributes
        "name", "label" (same as name, so it is drawn), "distance" and "color"
    """
    # Vertex numbering, distances and the de-duplicated edge list are exactly what
    # construct_graph_description computes, so reuse it instead of repeating the bookkeeping.
    description = construct_graph_description(topics)
    names = description['vertices']
    distances = description['distance']

    g = Graph()
    g.add_vertices(len(names))
    g.vs["name"] = names
    # the label attribute has to be set so the names are visible in the plot
    g.vs["label"] = g.vs["name"]
    # distance from the first node
    g.vs["distance"] = distances
    # colors according to the distance - a graph deeper than distance 4 is not expected,
    # anything beyond that is drawn grey instead of failing
    color_dict = {0: "blue", 1: "red", 2: "yellow", 3: "green", 4: "pink"}
    g.vs["color"] = [color_dict.get(dist, "grey") for dist in distances]
    # the edge list holds every undirected pair once; add all edges in a single call
    g.add_edges(description['edges'])
    return g

In [7]:
import logging
# Verbosity for the expansion run; use logging.DEBUG to trace every topic/subtopic decision.
LOG_LEVEL = logging.ERROR
logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s : %(levelname)s : %(message)s')
# basicConfig is a no-op once the root logger has handlers, so the level
# is also set on the root logger instance itself.
logger = logging.getLogger()
logger.setLevel(LOG_LEVEL)

# start from an empty result dictionary on every run of this cell
subtopics = {}

# seed topics: one topic per line
filepath = 'topic_list_checked.txt'
with open(filepath, encoding='utf-8') as fp:
    # normalise (trim + lower-case), skip blank lines and drop duplicates
    # while keeping the order of first appearance
    topics = list(dict.fromkeys(
        line.strip().lower() for line in fp if line.strip()
    ))
print('topics: {}'.format(len(topics)))

for iii, topic in enumerate(topics, start=1):
    print('processing topic {} {}'.format(iii, topic))
    # get subtopics - the minimum similarity threshold is empirically set
    # NOTE: __addsubtopic caps the depth at 3, so depth=5 behaves like depth=3
    print(__addsubtopic(subtopics, topic=topic, topn=10, depth=5, threshold=0.5))

# plain-dictionary description of the resulting graph (vertices, distances, edges)
gg = construct_graph_description(subtopics)
# construct the igraph graph (disabled)
#g = construct_graph(subtopics)



topics: 317
processing topic 1 demographic_change
170
processing topic 2 rural_lifestyle
0
processing topic 3 rural_development_planning
0
processing topic 4 rural_development
596
processing topic 5 landscape_planning
428
processing topic 6 border_region
358
processing topic 7 ageing_farming_population
590
processing topic 8 young_people
350
processing topic 9 newcomers
326
processing topic 10 proven_expertise
300
processing topic 11 brexit
450
processing topic 12 high_land_prices
679
processing topic 13 farm_size
510
processing topic 14 small_farm
353
processing topic 15 sufficient_funding
290
processing topic 16 farming_community
572
processing topic 17 limited_financing
83
processing topic 18 environmental_regulation
500
processing topic 19 remote_area
392
processing topic 20 transport_infrastructure
418
processing topic 21 high_regulation
164
processing topic 22 lower_wages
210
processing topic 23 forestry_development
360
processing topic 24 new_communities
313
processing topic 25 

301
processing topic 204 business
540
processing topic 205 financing
432
processing topic 206 innovation
0
processing topic 207 logistics
587
processing topic 208 poverty
270
processing topic 209 investor
634
processing topic 210 nature_protection
0
processing topic 211 participation
320
processing topic 212 community
254
processing topic 213 democracy
280
processing topic 214 healthy_food
0
processing topic 215 short_supply_chains
349
processing topic 216 folk_art
218
processing topic 217 food_production
0
processing topic 218 new_business_models
0
processing topic 219 fruit_and_vegetable_processing
327
processing topic 220 organic_food
0
processing topic 221 leader_programme
0
processing topic 222 farmers_organisations
0
processing topic 223 rural_entrepreneurship
0
processing topic 224 exodus
160
processing topic 225 flanders
307
processing topic 226 regional_landscape
0
processing topic 227 attractiveness
225
processing topic 228 land_use_change
256
processing topic 229 mobility
36

In [8]:
from pprint import pprint
import re
# compare the number of seed topics with the number of vertices found
print('topics: {} vertices: {}'.format(len(topics), len(gg['vertices'])))

# Every vertex name becomes a topic of the expanded list.
# (A filter keeping only names matching "\w*_\w*" was tried here and is disabled.)
new_topics = list(gg['vertices'])

topics: 317 vertices: 26396


In [10]:
# store the expanded topic list, one topic per line
with open('topic_list_slim-model_26396topics_topn10_dep5_thr0.5.txt', 'w') as f:
    f.writelines("%s\n" % item for item in new_topics)

In [89]:
import json
from pprint import pprint
# Inspection cell: pretty-print the whole graph description dictionary `gg`.
# Disabled alternative: dump the raw topics dictionary instead.
#pprint(subtopics)

pprint(gg)
# Disabled alternative: the topics dictionary serialised as JSON.
#pprint(json.dumps(subtopics))

{'distance': [0,
              1,
              1,
              1,
              1,
              1,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2,
              2],
 'edge_distance': ['rural_regions',
                   'urban_areas',
                   'rural_communities',
                   'remote_rural_areas',
                   'small_towns',
                   'urban_regions',
                   'peripheral_areas',
                   'border_regions',
                   'peripheral_regions',
                   'urban_centres',
                   'urban_agglomerations',
                   'urban_environments',
                   'metropolitan_areas',
                   'rural_populations',
                  

In [45]:
# Scratch cell: check how a loop range relates to the indices of one subtopic list.
subtopic_list = subtopics['rural_areas']['subtopics']
# range(0, len - 1) stops one short of the last index
# (the recorded output shows range(0, 4), i.e. indices 0..3) ...
print(range(0,len(subtopic_list)-1))
# ... although index 4 holds a valid entry; [0] is the subtopic name of the (name, similarity) pair
subtopic_list[4][0]

range(0, 4)


'small_towns'