In [1]:
import string
from nltk.corpus import stopwords
from library import clean_text_simple, terms_to_graph, unweighted_k_core

# execute the following if you haven't already (nltk > 3.2.1 is required)
#import nltk 
#nltk.download('stopwords')
#nltk.download('maxent_treebank_pos_tagger')
#nltk.download('averaged_perceptron_tagger')

#import os
#os.chdir() # change working directory to where functions are

In [2]:
# Example abstract used as the input document. Adjacent string literals are
# joined by the parser, so the result is one line of text without newlines.
my_doc = (
    'A method for solution of systems of linear algebraic equations '
    'with m-dimensional lambda matrices. A system of linear algebraic '
    'equations with m-dimensional lambda matrices is considered. '
    'The proposed method of searching for the solution of this system '
    'lies in reducing it to a numerical system of a special kind.'
)
print(my_doc)

A method for solution of systems of linear algebraic equations with m-dimensional lambda matrices. A system of linear algebraic equations with m-dimensional lambda matrices is considered. The proposed method of searching for the solution of this system lies in reducing it to a numerical system of a special kind.


In [3]:
# pre-process document
# English stopword list from NLTK (needs the 'stopwords' corpus to be downloaded)
stpwds = stopwords.words('english')
# all ASCII punctuation except the hyphen -- presumably so that hyphenated
# terms such as "m-dimensional" survive cleaning as a single token
punct = ''.join(ch for ch in string.punctuation if ch != '-')
my_tokens = clean_text_simple(
    my_doc,
    my_stopwords=stpwds,
    punct=punct,
)
print("Number of different tokens =", len(set(my_tokens)))

Number of different tokens = 12


In [4]:
# build the graph-of-words with a sliding window of 4 terms
g = terms_to_graph(my_tokens, w=4)
print("Number of vertices = ", len(g.vs))
print("Number of edges    = ", len(g.es))
# sanity check: the graph must contain exactly one vertex per unique term
assert len(g.vs) == len(set(my_tokens))

# list every edge as [source term, target term, weight]
edge_weights = [
    [g.vs[e.source]['name'], g.vs[e.target]['name'], e['weight']]
    for e in g.es
]

print(edge_weights)

Number of vertices =  12
Number of edges    =  42
[['method', 'solut', 2], ['method', 'system', 2], ['method', 'linear', 1], ['solut', 'system', 3], ['solut', 'linear', 1], ['system', 'linear', 2], ['solut', 'algebra', 1], ['system', 'algebra', 2], ['linear', 'algebra', 2], ['system', 'equat', 2], ['linear', 'equat', 2], ['algebra', 'equat', 2], ['linear', 'm-dimension', 2], ['algebra', 'm-dimension', 2], ['equat', 'm-dimension', 2], ['algebra', 'lambda', 2], ['equat', 'lambda', 2], ['m-dimension', 'lambda', 2], ['equat', 'system', 1], ['m-dimension', 'system', 1], ['lambda', 'system', 1], ['m-dimension', 'linear', 1], ['lambda', 'linear', 1], ['lambda', 'algebra', 1], ['equat', 'matric', 1], ['m-dimension', 'matric', 1], ['lambda', 'matric', 1], ['m-dimension', 'method', 1], ['lambda', 'method', 1], ['matric', 'method', 1], ['lambda', 'solut', 1], ['matric', 'solut', 1], ['matric', 'system', 1], ['method', 'numer', 1], ['solut', 'numer', 1], ['system', 'numer', 1], ['numer', 'system',

In [5]:
# rebuild the graph-of-words for every window size from 2 up to
# min(len(my_tokens), 29) inclusive, and print the density of each graph
# NOTE(review): `g` is rebound on every iteration, so once this cell has run it
# refers to the graph built with the largest window size, not the w=4 graph.
for w in range(2, min(len(my_tokens), 29) + 1):
    g = terms_to_graph(my_tokens, w=w)
    print(g.density())

# NOTE(review): the recorded densities equal edges / (12 * 11) (e.g. 14/132),
# which suggests a directed graph -- confirm that the explanation printed below
# is the actual reason the density plateaus below 1.
print(
    "\nSliding window size increases => Density increases. \n"
    "It is never reaching 1 simply because edges are weighted\n"
    "(some pairs of unique words can appear together in multiple windows)"
)

0.10606060606060606
0.21212121212121213
0.3181818181818182
0.41666666666666663
0.5227272727272727
0.5833333333333334
0.6287878787878788
0.6666666666666666
0.696969696969697
0.7196969696969697
0.7424242424242423
0.7575757575757577
0.7727272727272727
0.7878787878787878
0.7954545454545454
0.8030303030303031
0.8030303030303031
0.8030303030303031
0.8030303030303031
0.8030303030303031
0.8030303030303031

Sliding window size increases => Density increases. 
It is never reaching 1 simply because edges are weighted
(some pairs of unique words can appear together in multiple windows)


In [6]:
# k-core decomposition of g (whichever graph `g` was last bound to)
core_numbers = unweighted_k_core(g)
print(core_numbers)

# cross-check against igraph's built-in coreness, keyed by vertex name
print({name: k for name, k in zip(g.vs['name'], g.coreness())})

{'algebra': 16, 'equat': 16, 'kind': 11, 'lambda': 16, 'linear': 16, 'm-dimension': 16, 'matric': 11, 'method': 16, 'numer': 11, 'solut': 16, 'special': 11, 'system': 16}
{'algebra': 16, 'equat': 16, 'kind': 11, 'lambda': 16, 'linear': 16, 'm-dimension': 16, 'matric': 11, 'method': 16, 'numer': 11, 'solut': 16, 'special': 11, 'system': 16}


In [7]:
# keep as keywords the terms whose core number equals the maximum (main core)
max_c_n = max(core_numbers.values())
keywords = [term for term, k in core_numbers.items() if k == max_c_n]
print(keywords)

['algebra', 'equat', 'lambda', 'linear', 'm-dimension', 'method', 'solut', 'system']
