### Packages and helper functions

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import json
import pandas as pd
import numpy as np 

In [2]:
def correlation_matrix(word_list, basis):
    """Build a labelled table of pairwise cosine similarities.

    Args:
        word_list: Sequence of labels, one per row of ``basis`` and in the
            same order.
        basis: 2-D array-like of vectors, one row per label; passed straight
            to ``cosine_similarity``.

    Returns:
        A square ``pandas.DataFrame`` whose index and columns are both
        ``word_list``; entry ``[i, j]`` is the cosine similarity between
        rows ``i`` and ``j`` of ``basis``.

    Raises:
        ValueError: If the number of labels differs from the number of rows
            in ``basis``.
    """
    labels = list(word_list)
    word_similarity = cosine_similarity(basis)
    if len(labels) != word_similarity.shape[0]:
        raise ValueError(
            'word_list has {} labels but basis has {} rows'.format(
                len(labels), word_similarity.shape[0]
            )
        )
    # Label the matrix directly instead of filling a dict of dicts one cell at
    # a time: the table is the same for unique labels (the matrix is
    # symmetric), and repeated labels keep their own row/column instead of
    # silently overwriting each other.
    return pd.DataFrame(word_similarity, index=labels, columns=labels)

In [3]:
def calc(basis, word, mean_base=0, encoder=None):
    """Ratio between the strongest and weakest projection of a word on a basis.

    The word is embedded, shifted by ``mean_base``, and multiplied against
    every row of ``basis``; the absolute values of those products are compared.

    Args:
        basis: Array whose rows are multiplied against the word embedding
            (``basis @ embedding.T``).
        word: Text handed to ``encoder.encode``; returned unchanged.
        mean_base: Value subtracted from the embedding before projecting.
            The default ``0`` leaves the embedding as is.
        encoder: Object exposing ``encode(text)``. When omitted, the
            notebook-level ``model`` is used, as before.

    Returns:
        ``(max_abs / min_abs, word)``. The division follows NumPy semantics,
        so a zero minimum is not guarded against here.
    """
    if encoder is None:
        # Fall back to the notebook-wide encoder; resolved at call time.
        encoder = model
    shifted = encoder.encode(word) - mean_base
    magnitudes = np.abs(basis @ shifted.T)
    max_val = np.max(magnitudes)
    min_val = np.min(magnitudes)
    return max_val / min_val, word

### Some demo utilities

In [4]:
# Sentence encoder used by the helper functions and by the cells below.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Read the demo keywords inside a context manager so the file handle is closed
# as soon as the JSON has been parsed.
with open('./resources/demo_sample_keywords.json', 'r', encoding='utf-8') as keyword_file:
    sample_output = json.load(keyword_file)
sample_output

['graph',
 'user',
 'spreadsheet',
 'node',
 'community',
 'graph embedding',
 'model',
 'community detection',
 'twitter',
 'sparsity',
 'query information',
 'community profiling',
 'network',
 'spreadsheet software',
 'problem',
 'social networks',
 'node classification',
 'algorithm',
 'community embedding',
 'detection',
 'graph visualization',
 'graph neural networks',
 'relevance',
 'method',
 'social network',
 'node embedding',
 'database',
 'heterogeneous graphs',
 'query',
 'insight',
 'dilemma',
 'web',
 'diffusion topologies',
 'sparsity problem',
 'graph analytics methods',
 'graph analytics problem',
 'curvature regularization',
 'security token',
 'scalability',
 'user friendship links',
 'user credibility',
 'constraint',
 'graph properties',
 'drug',
 'semantic user search',
 'community membership',
 'database systems',
 'observation',
 'information',
 'application',
 'probabilistic prediction models',
 'entity',
 'graph data',
 'cascade',
 'heterogeneous information 

### Main part

In [5]:
# Subfield names kept as one '; '-separated string.
arXiv_cs_subfields = 'Artificial Intelligence; Computation and Language; Computational Complexity; Computational Engineering, Finance, and Science; Computational Geometry; Computer Science and Game Theory; Computer Vision and Pattern Recognition; Computers and Society; Cryptography and Security; Data Structures and Algorithms; Databases; Digital Libraries; Discrete Mathematics; Distributed, Parallel, and Cluster Computing; Emerging Technologies; Formal Languages and Automata Theory; General Literature; Graphics; Hardware Architecture; Human-Computer Interaction; Information Retrieval; Information Theory; Logic in Computer Science; Machine Learning; Mathematical Software; Multiagent Systems; Multimedia; Networking and Internet Architecture; Neural and Evolutionary Computing; Numerical Analysis; Operating Systems; Other Computer Science; Performance; Programming Languages; Robotics; Social and Information Networks; Software Engineering; Sound; Symbolic Computation; Systems and Control'
# One list entry per subfield, in the order written above. Names that contain
# a comma stay intact because the separator is '; '.
subfield_list = arXiv_cs_subfields.split(sep='; ')
subfield_list

['Artificial Intelligence',
 'Computation and Language',
 'Computational Complexity',
 'Computational Engineering, Finance, and Science',
 'Computational Geometry',
 'Computer Science and Game Theory',
 'Computer Vision and Pattern Recognition',
 'Computers and Society',
 'Cryptography and Security',
 'Data Structures and Algorithms',
 'Databases',
 'Digital Libraries',
 'Discrete Mathematics',
 'Distributed, Parallel, and Cluster Computing',
 'Emerging Technologies',
 'Formal Languages and Automata Theory',
 'General Literature',
 'Graphics',
 'Hardware Architecture',
 'Human-Computer Interaction',
 'Information Retrieval',
 'Information Theory',
 'Logic in Computer Science',
 'Machine Learning',
 'Mathematical Software',
 'Multiagent Systems',
 'Multimedia',
 'Networking and Internet Architecture',
 'Neural and Evolutionary Computing',
 'Numerical Analysis',
 'Operating Systems',
 'Other Computer Science',
 'Performance',
 'Programming Languages',
 'Robotics',
 'Social and Informatio

### If we directly use the embeddings of `subfield_list` as the basis...

In [6]:
# Embed every subfield name, then scale each row to unit length.
basis = model.encode(subfield_list)
basis = basis / np.linalg.norm(basis, axis=1, keepdims=True)
# Pairwise similarity table between the subfield embeddings.
df = correlation_matrix(subfield_list, basis)
df

Unnamed: 0,Artificial Intelligence,Computation and Language,Computational Complexity,"Computational Engineering, Finance, and Science",Computational Geometry,Computer Science and Game Theory,Computer Vision and Pattern Recognition,Computers and Society,Cryptography and Security,Data Structures and Algorithms,...,Operating Systems,Other Computer Science,Performance,Programming Languages,Robotics,Social and Information Networks,Software Engineering,Sound,Symbolic Computation,Systems and Control
Artificial Intelligence,1.0,0.711717,0.782139,0.579572,0.726609,0.686493,0.696109,0.65598,0.578317,0.688114,...,0.680445,0.648087,0.57421,0.722929,0.758069,0.541112,0.678689,0.575963,0.740578,0.642398
Computation and Language,0.711717,1.0,0.761322,0.562802,0.717943,0.615152,0.7453,0.664543,0.562327,0.780439,...,0.695253,0.564444,0.62273,0.819801,0.58484,0.583434,0.690933,0.663905,0.818837,0.731763
Computational Complexity,0.782139,0.761322,1.0,0.596598,0.811264,0.627579,0.646551,0.652095,0.581551,0.734201,...,0.728476,0.601822,0.686071,0.710821,0.655413,0.537712,0.688088,0.639268,0.806273,0.739478
"Computational Engineering, Finance, and Science",0.579572,0.562802,0.596598,1.0,0.674468,0.762875,0.576844,0.707602,0.586541,0.726595,...,0.575365,0.699265,0.373617,0.569427,0.564523,0.581258,0.61883,0.309243,0.535965,0.520852
Computational Geometry,0.726609,0.717943,0.811264,0.674468,1.0,0.758922,0.651725,0.635279,0.511436,0.755974,...,0.650059,0.60275,0.519193,0.740644,0.7135,0.48402,0.734556,0.487463,0.686773,0.611887
Computer Science and Game Theory,0.686493,0.615152,0.627579,0.762875,0.758922,1.0,0.63771,0.63284,0.566714,0.687294,...,0.550996,0.741635,0.366137,0.675301,0.609593,0.457918,0.689383,0.317917,0.56328,0.501758
Computer Vision and Pattern Recognition,0.696109,0.7453,0.646551,0.576844,0.651725,0.63771,1.0,0.620014,0.570258,0.757599,...,0.636618,0.620694,0.479966,0.677403,0.587417,0.56685,0.676485,0.558191,0.664311,0.642974
Computers and Society,0.65598,0.664543,0.652095,0.707602,0.635279,0.63284,0.620014,1.0,0.61127,0.741811,...,0.758696,0.665769,0.506645,0.631074,0.557179,0.796822,0.617914,0.475231,0.666969,0.714954
Cryptography and Security,0.578317,0.562327,0.581551,0.586541,0.511436,0.566714,0.570258,0.61127,1.0,0.63006,...,0.547109,0.537711,0.445085,0.493264,0.42886,0.581124,0.51579,0.469498,0.609862,0.578704
Data Structures and Algorithms,0.688114,0.780439,0.734201,0.726595,0.755974,0.687294,0.757599,0.741811,0.63006,1.0,...,0.735582,0.650406,0.578912,0.758961,0.58915,0.683986,0.709509,0.514437,0.718192,0.743093


In [None]:
# Display the loaded keyword list again (this cell carries no execution count).
sample_output

In [7]:
# Print the ratio returned by calc next to each keyword, using the basis as is
# (mean_base keeps its default of 0).
for keyword in sample_output:
    print(*calc(basis, keyword))

1.787959 graph
2.2820697 user
1.9444182 spreadsheet
2.2052794 node
2.57612 community
1.5596007 graph embedding
2.388426 model
2.4135504 community detection
2.4296815 twitter
3.1913264 sparsity
2.1707978 query information
2.2987237 community profiling
2.1975203 network
1.6747152 spreadsheet software
2.4621234 problem
2.7874987 social networks
1.9041258 node classification
1.7329835 algorithm
2.3859522 community embedding
2.5883293 detection
1.534521 graph visualization
1.8720381 graph neural networks
2.604047 relevance
2.414766 method
2.6448522 social network
1.9179026 node embedding
1.9106208 database
1.6772743 heterogeneous graphs
2.0819082 query
2.4823565 insight
2.4612448 dilemma
1.8875811 web
2.1064498 diffusion topologies
2.3767376 sparsity problem
1.9805931 graph analytics methods
2.016979 graph analytics problem
1.7387792 curvature regularization
2.0203187 security token
2.1567545 scalability
2.2234213 user friendship links
1.8619163 user credibility
2.1634748 constraint
1.54162

1.9934636 cell
2.8901582 efficient algorithm
2.331086 relationship type
1.7581289 open datasets
2.0218973 robustness analysis
2.2301664 node closeness
2.659734 death
2.0357845 relationship semantics
3.0187476 accuracy
2.5716293 sequence
2.3984694 feature
2.0820816 user profiling
2.1961517 workload
2.7006085 scale
1.6543194 software
2.4972217 number
2.5186577 component
1.7402029 application programming interfaces
1.7983059 node classification uncertainty
2.7242558 multiple types
2.4730158 reconstruction loss
1.9286209 email
2.0837526 robust feature representation
1.6766844 taxonomy
2.5362337 heterogeneous nature
3.5876353 rate
2.1374958 models users
2.7333522 robust feature
1.6016389 research
2.223748 training
2.174403 unsupervised learning
1.9140626 augmented reality
2.7476795 relation
2.0435867 visualization capabilities
2.290275 key roles
2.728544 view
2.0205872 data size
2.6824055 attention models
2.2952015 entity recognition
1.626631 test user
1.8645358 jazz
2.531431 community stru

### So... not so good: we need something more

In [8]:
# Embed the subfield names again, this time removing what they have in common:
# the mean embedding (kept as a single row) is subtracted from every row.
basis = model.encode(subfield_list)
mean_base = basis.sum(axis=0, keepdims=True) / basis.shape[0]
basis = basis - mean_base
# Scale each centred row to unit length.
basis = basis / np.linalg.norm(basis, axis=1, keepdims=True)

In [9]:
# Pairwise cosine similarities between the rows of basis, labelled by subfield name.
df = correlation_matrix(subfield_list, basis)
df

Unnamed: 0,Artificial Intelligence,Computation and Language,Computational Complexity,"Computational Engineering, Finance, and Science",Computational Geometry,Computer Science and Game Theory,Computer Vision and Pattern Recognition,Computers and Society,Cryptography and Security,Data Structures and Algorithms,...,Operating Systems,Other Computer Science,Performance,Programming Languages,Robotics,Social and Information Networks,Software Engineering,Sound,Symbolic Computation,Systems and Control
Artificial Intelligence,1.0,-0.006521,0.240293,-0.168247,0.079534,0.127796,0.077458,-0.122887,-0.009614,-0.205061,...,-0.157274,0.066079,-0.068457,0.064313,0.354509,-0.216071,0.003708,-0.030292,0.109978,-0.235565
Computation and Language,-0.006521,1.0,0.119837,-0.272697,-0.002401,-0.122104,0.190252,-0.153562,-0.0908,0.099717,...,-0.168613,-0.211837,0.020702,0.358116,-0.171927,-0.150332,-0.006195,0.164789,0.343783,0.021193
Computational Complexity,0.240293,0.119837,1.0,-0.1737,0.329451,-0.085707,-0.126627,-0.195717,-0.041396,-0.090012,...,-0.039709,-0.105915,0.193123,-0.029589,0.032814,-0.280657,-0.015241,0.098077,0.298951,0.05037
"Computational Engineering, Finance, and Science",-0.168247,-0.272697,-0.1737,1.0,0.089551,0.438347,-0.083467,0.2028,0.14345,0.14859,...,-0.279792,0.316738,-0.362654,-0.215666,-0.011447,0.05094,0.003536,-0.465389,-0.331854,-0.38444
Computational Geometry,0.079534,-0.002401,0.329451,0.089551,1.0,0.327278,-0.070806,-0.2085,-0.18449,0.040392,...,-0.291715,-0.065379,-0.222021,0.109573,0.225811,-0.38407,0.166648,-0.265536,-0.093369,-0.364242
Computer Science and Game Theory,0.127796,-0.122104,-0.085707,0.438347,0.327278,1.0,0.067952,-0.006762,0.095296,0.016437,...,-0.358726,0.409279,-0.387465,0.082887,0.088523,-0.236684,0.184942,-0.455885,-0.25733,-0.445648
Computer Vision and Pattern Recognition,0.077458,0.190252,-0.126627,-0.083467,-0.070806,0.067952,1.0,-0.136062,0.040955,0.162076,...,-0.200501,0.066431,-0.219286,0.003823,-0.034862,-0.062384,0.077675,-0.003261,-0.053188,-0.127781
Computers and Society,-0.122887,-0.153562,-0.195717,0.2028,-0.2085,-0.006762,-0.136062,1.0,0.08657,0.022219,...,0.140138,0.127747,-0.225528,-0.226966,-0.181183,0.480683,-0.167009,-0.267027,-0.125596,0.030004
Cryptography and Security,-0.009614,-0.0908,-0.041396,0.14345,-0.18449,0.095296,0.040955,0.08657,1.0,0.025295,...,-0.166613,0.06679,-0.080754,-0.238154,-0.178311,0.155997,-0.10408,-0.010414,0.044367,-0.041594
Data Structures and Algorithms,-0.205061,0.099717,-0.090012,0.14859,0.040392,0.016437,0.162076,0.022219,0.025295,1.0,...,-0.133056,-0.044584,-0.186635,0.050211,-0.260787,0.070987,-0.036152,-0.335886,-0.13326,-0.040639


In [10]:
# Same report as before, but each keyword embedding is shifted by mean_base
# before it is projected on the basis.
for keyword in sample_output:
    ratio, label = calc(basis, keyword, mean_base)
    print(ratio, label)

116.94669 graph
239.86743 user
55.69816 spreadsheet
75.42277 node
26.754158 community
1454.5206 graph embedding
31.264658 model
962.02893 community detection
32.139946 twitter
147.04088 sparsity
43.107956 query information
169.88914 community profiling
22.924994 network
200.763 spreadsheet software
21.973602 problem
286.13824 social networks
102.85856 node classification
3281.6692 algorithm
160.42361 community embedding
175.51471 detection
70.15353 graph visualization
221.91556 graph neural networks
241.02492 relevance
149.47345 method
68.16659 social network
376.79022 node embedding
672.4323 database
111.42785 heterogeneous graphs
116.57055 query
49.314632 insight
28.942213 dilemma
1558.6272 web
194.84116 diffusion topologies
308.97375 sparsity problem
1336.6156 graph analytics methods
276.55988 graph analytics problem
60.041634 curvature regularization
1446.3845 security token
213.07794 scalability
107.91737 user friendship links
57.735107 user credibility
118.23774 constraint
16.162

585.9873 set
734.67426 functional requirements
123.102196 cell
131.36809 efficient algorithm
46.9925 relationship type
309.77164 open datasets
268.593 robustness analysis
283.0986 node closeness
25.291998 death
52.568523 relationship semantics
36.734524 accuracy
52.656445 sequence
55.148563 feature
98.41755 user profiling
726.97754 workload
1035.5718 scale
59.724464 software
36.136646 number
256.18237 component
83.81387 application programming interfaces
103.79066 node classification uncertainty
45.362827 multiple types
264.85394 reconstruction loss
99.5316 email
58.006054 robust feature representation
541.6911 taxonomy
73.94157 heterogeneous nature
139.7595 rate
80.01072 models users
1246.8955 robust feature
107.9486 research
2202.7024 training
990.13666 unsupervised learning
38.26046 augmented reality
94.1151 relation
105.295555 visualization capabilities
32.458344 key roles
58.924385 view
223.52882 data size
133.01135 attention models
361.06516 entity recognition
509.27542 test user

### Add filtering

In [12]:
# Report only the keywords whose ratio is above 600.
for keyword in sample_output:
    ratio, label = calc(basis, keyword, mean_base)
    if not ratio > 600:
        continue
    print(ratio, label)

1454.5206 graph embedding
962.02893 community detection
3281.6692 algorithm
672.4323 database
1558.6272 web
1336.6156 graph analytics methods
1446.3845 security token
1550.0667 information
1033.8484 application
3193.5374 diffusion prediction task
680.5129 computational complexity
1476.2938 spreadsheet operations
5024.978 smartphone user verification
3914.6394 efficient
1032.4911 smoothness
1401.285 graph query language
1103.8303 complete
1079.1376 manifold
858.9276 user alignment task
748.75555 probability
827.5951 http
1268.2386 spreadsheet systems
879.78125 active graph embedding
26112.998 smartphones
616.0477 key
865.6642 interactive zooming
626.1919 probabilistic
1322.0083 code
1208.3625 relational databases
4817.4478 data closeness
1855.3005 mobility behaviors
1507.5398 interference
1102.9857 edge probability space
1126.506 profiles types
21255.01 web community
734.67426 functional requirements
726.97754 workload
1035.5718 scale
1246.8955 robust feature
2202.7024 training
990.1366

In [13]:
# Rebuild the mean-centred, unit-length basis (same steps as the earlier
# centring cell) so the cells below start from a known state.
basis = model.encode(subfield_list)
mean_base = basis.sum(axis=0, keepdims=True) / basis.shape[0]
basis = basis - mean_base
basis = basis / np.linalg.norm(basis, axis=1, keepdims=True)

In [34]:
# Embed a probe phrase.
# NOTE(review): unlike calc, mean_base is not subtracted from this embedding
# even though basis was mean-centred in the cell above -- confirm this is intended.
word_ = model.encode('random forest')

In [35]:
# Position of the basis row with the largest dot product with the probe embedding.
idx = np.argmax(basis@word_.T)

In [36]:
# Show the selected row position.
idx

37

In [37]:
# Subfield name at the selected position.
subfield_list[idx]

'Sound'

In [38]:
# Position (and name) of the basis row with the smallest dot product with the
# probe embedding.
idx = np.argmin(basis@word_.T)
subfield_list[idx]

'Computational Engineering, Finance, and Science'

In [19]:
# Display the subfield names to read positions off the list.
subfield_list

['Artificial Intelligence',
 'Computation and Language',
 'Computational Complexity',
 'Computational Engineering, Finance, and Science',
 'Computational Geometry',
 'Computer Science and Game Theory',
 'Computer Vision and Pattern Recognition',
 'Computers and Society',
 'Cryptography and Security',
 'Data Structures and Algorithms',
 'Databases',
 'Digital Libraries',
 'Discrete Mathematics',
 'Distributed, Parallel, and Cluster Computing',
 'Emerging Technologies',
 'Formal Languages and Automata Theory',
 'General Literature',
 'Graphics',
 'Hardware Architecture',
 'Human-Computer Interaction',
 'Information Retrieval',
 'Information Theory',
 'Logic in Computer Science',
 'Machine Learning',
 'Mathematical Software',
 'Multiagent Systems',
 'Multimedia',
 'Networking and Internet Architecture',
 'Neural and Evolutionary Computing',
 'Numerical Analysis',
 'Operating Systems',
 'Other Computer Science',
 'Performance',
 'Programming Languages',
 'Robotics',
 'Social and Informatio

In [40]:
# Check the dimensions of the basis matrix.
basis.shape

(40, 768)

In [45]:
# Embed a single keyword and reshape it into one row so that it can be
# multiplied against basis as a matrix.
word = 'query'
word_ = model.encode(word)
word_ = word_.reshape(1,-1)
word_.shape

(1, 768)

In [47]:
# Dot product of every basis row with the keyword embedding: one score per row.
score = basis @ word_.T
score.shape

(40, 1)

In [38]:
import numpy as np 
from sentence_transformers import SentenceTransformer

# Rebinds `model` using the same pretrained model name as the demo-utilities
# cell earlier in the notebook.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
def sim(w1, w2, encoder=None):
    """Cosine similarity between the embeddings of two phrases.

    Args:
        w1: First phrase.
        w2: Second phrase.
        encoder: Object exposing ``encode(text)``. When omitted, the
            notebook-level ``model`` is used, as before.

    Returns:
        ``(score, description)``: the dot product of the two embeddings
        divided by the product of their norms, and a string of the form
        ``"between '<w1>' and '<w2>'"``. Returning the score first lets a list
        of results be sorted by similarity.
    """
    if encoder is None:
        # Fall back to the notebook-wide encoder; resolved at call time.
        encoder = model
    emb1 = encoder.encode(w1)
    emb2 = encoder.encode(w2)
    len1 = np.linalg.norm(emb1)
    len2 = np.linalg.norm(emb2)
    return ((emb1 @ emb2) / (len1 * len2), "between '{}' and '{}'".format(w1, w2))

In [27]:
# Compare one phrase with a few hand-picked candidates.
w1 = 'query processing'
w2s = ['database', 'data mining', 'ship', 'computer science']
for w2 in w2s:
    # sim only returns its result, so print it; otherwise this cell shows
    # nothing when it is re-run.
    print(*sim(w1, w2))

0.80920476 between 'query processing' and 'database'
0.7693073 between 'query processing' and 'data mining'
0.63783675 between 'query processing' and 'ship'
0.6145596 between 'query processing' and 'computer science'


In [46]:
# Score one phrase against every subfield name.
w1 = 'query processing'
w2s = ["Artificial Intelligence", "Computation and Language", "Computational Complexity", "Computational Engineering, Finance, and Science", "Computational Geometry", "Computer Science and Game Theory", "Computer Vision and Pattern Recognition", "Computers and Society", "Cryptography and Security", "Data Structures and Algorithms", "Databases", "Digital Libraries", "Discrete Mathematics", "Distributed, Parallel, and Cluster Computing", "Emerging Technologies", "Formal Languages and Automata Theory", "General Literature", "Graphics", "Hardware Architecture", "Human-Computer Interaction", "Information Retrieval", "Information Theory", "Logic in Computer Science", "Machine Learning", "Mathematical Software", "Multiagent Systems", "Multimedia", "Networking and Internet Architecture", "Neural and Evolutionary Computing", "Numerical Analysis", "Operating Systems", "Other Computer Science", "Performance", "Programming Languages", "Robotics", "Social and Information Networks", "Software Engineering", "Sound", "Symbolic Computation", "Systems and Control"]
out = [sim(w1, w2) for w2 in w2s]
# Tuples compare on their first element, so this orders by score, lowest first.
sorted_out = sorted(out)

In [47]:
# Show the (score, description) pairs in ascending score order.
sorted_out

[(0.46096167, "between 'query processing' and 'Discrete Mathematics'"),
 (0.4909527,
  "between 'query processing' and 'Computational Engineering, Finance, and Science'"),
 (0.5228461, "between 'query processing' and 'Robotics'"),
 (0.52769464,
  "between 'query processing' and 'Computer Science and Game Theory'"),
 (0.53484887, "between 'query processing' and 'Other Computer Science'"),
 (0.5439956, "between 'query processing' and 'Mathematical Software'"),
 (0.5457522,
  "between 'query processing' and 'Neural and Evolutionary Computing'"),
 (0.54828835, "between 'query processing' and 'Digital Libraries'"),
 (0.5550702, "between 'query processing' and 'Logic in Computer Science'"),
 (0.55681294, "between 'query processing' and 'Cryptography and Security'"),
 (0.55905145,
  "between 'query processing' and 'Networking and Internet Architecture'"),
 (0.5681156, "between 'query processing' and 'Emerging Technologies'"),
 (0.5799204,
  "between 'query processing' and 'Formal Languages an

In [54]:

# Rank a shorter, hand-written list of field names against one phrase.
w1 = 'query processing'
w2s = [
    "Computer Architecture",
    "Parallel Computing",
    "Artificial Intelligence",
    "Bioinformatics",
    "Computational Biology",
    "Databases",
    "Data Mining",
    "Information Retrieval",
    "Interactive Computing",
    "Programming Languages",
    "Formal Methods",
    "Software Engineering",
    "Scientific Computing",
    "Machine Learning",
    "Computer Security and Privacy",
]
out = [sim(w1, field) for field in w2s]
# Highest score first.
sorted(out, reverse=True)

[(0.8903116, "between 'query processing' and 'Information Retrieval'"),
 (0.7716426, "between 'query processing' and 'Databases'"),
 (0.7693073, "between 'query processing' and 'Data Mining'"),
 (0.74265397, "between 'query processing' and 'Machine Learning'"),
 (0.7161455, "between 'query processing' and 'Parallel Computing'"),
 (0.7123288, "between 'query processing' and 'Programming Languages'"),
 (0.71069014, "between 'query processing' and 'Interactive Computing'"),
 (0.68981516, "between 'query processing' and 'Formal Methods'"),
 (0.65273833, "between 'query processing' and 'Software Engineering'"),
 (0.6331246, "between 'query processing' and 'Artificial Intelligence'"),
 (0.6231992, "between 'query processing' and 'Computer Architecture'"),
 (0.61357856, "between 'query processing' and 'Bioinformatics'"),
 (0.58540833, "between 'query processing' and 'Computational Biology'"),
 (0.5579474, "between 'query processing' and 'Scientific Computing'"),
 (0.5265492, "between 'query p

In [55]:
# Rank the same hand-written field names against a second phrase.
w1 = 'decision tree'
w2s = [
    "Computer Architecture",
    "Parallel Computing",
    "Artificial Intelligence",
    "Bioinformatics",
    "Computational Biology",
    "Databases",
    "Data Mining",
    "Information Retrieval",
    "Interactive Computing",
    "Programming Languages",
    "Formal Methods",
    "Software Engineering",
    "Scientific Computing",
    "Machine Learning",
    "Computer Security and Privacy",
]
out = [sim(w1, field) for field in w2s]
# Highest score first.
sorted(out, reverse=True)

[(0.70825917, "between 'decision tree' and 'Information Retrieval'"),
 (0.6411189, "between 'decision tree' and 'Interactive Computing'"),
 (0.6345508, "between 'decision tree' and 'Formal Methods'"),
 (0.62573034, "between 'decision tree' and 'Databases'"),
 (0.61937344, "between 'decision tree' and 'Data Mining'"),
 (0.60458213, "between 'decision tree' and 'Programming Languages'"),
 (0.6042834, "between 'decision tree' and 'Parallel Computing'"),
 (0.6036342, "between 'decision tree' and 'Computer Architecture'"),
 (0.58948785, "between 'decision tree' and 'Machine Learning'"),
 (0.55351377, "between 'decision tree' and 'Artificial Intelligence'"),
 (0.52967566, "between 'decision tree' and 'Software Engineering'"),
 (0.5173499, "between 'decision tree' and 'Computational Biology'"),
 (0.51645076, "between 'decision tree' and 'Scientific Computing'"),
 (0.5147327, "between 'decision tree' and 'Bioinformatics'"),
 (0.4753831, "between 'decision tree' and 'Computer Security and Priva

Observation: the difference between the 1st and 2nd scores is ~60 × the difference between the 2nd and 3rd scores.