In [1]:
import gensim.downloader
import gc
from nltk.tokenize import sent_tokenize
from sqlitedict import SqliteDict
from preprocessors import Word2VecPreprocessor
from own_tokenizers import RegexpTokenizer
import nltk
import time
import numpy as np
import pickle
import math
import heapq
import itertools

# Checking if path exists.
import os 

# For LineSentence modification.
from gensim import utils

nltk.download('punkt')
from gensim.models.word2vec import Word2Vec
from allennlp.modules.elmo import Elmo, batch_to_ids

[nltk_data] Downloading package punkt to /home/mcio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
tokenizer = RegexpTokenizer(pattern='(?i)[a-zÀ-ÿ]+')
stopwords_file_name = "englishST.txt"
preprocessor = Word2VecPreprocessor(tokenizer, stopwords_file_name)

In [8]:
MAX_WORDS_IN_BATCH

10000

In [11]:
print(os.path.exists('politics.sents'))

True


In [12]:
# 15 mins
start_time = time.time()

# Write one sentence per line.
# Format: doc# sentence
tokens = dict()
sentences_file_name = 'politics.sents'
start_time = time.time()

nr_sentences = 0

if not os.path.exists('politics.sents'):
    with SqliteDict("politics.sqlite") as docs:
        f = open(sentences_file_name, 'w')
        chunk = ''
        # Write documents in order.
        for doc_id_int in range(len(list(docs.keys()))):
            docId = str(doc_id_int)
            text = [docs[docId]['fields']['headline'], docs[docId]['fields']['bodyText']] 
            processed_sents = preprocessor.process_text_lines_return_sentences(text)
            for sent in processed_sents:
                if sent == [] or sent == ['']:
                    continue
                chunk += str(docId) + " " + (" ".join(sent)) + "\n"
            if int(docId) % 10 == 0:
                f.write(chunk)
                chunk = ''
        f.close()

    print(time.time() - start_time)

1046.2928447723389


In [2]:
def get_nr_lines(file_name):
    nr_sentences = 0
    with open(file_name) as f:
        for line in f:
            nr_sentences += 1
    return nr_sentences

In [4]:
# 15 mins
# start_time = time.time()

# docs_tokens = dict()
# # Saving politics.sqlite as sents.
# sentences_file_name = 'politics.sents'
# start_time = time.time()
# with SqliteDict("politics.sqlite") as docs:
#     for docId in docs.keys():
#         text = [docs[docId]['fields']['headline'], docs[docId]['fields']['bodyText']] 
#         tokens = preprocessor.process_text_lines(text)
#         docs_tokens[int(docId)] = tokens
#         print(docId)

# with open('politics.tokens.dict', 'wb') as f:
#     pickle.dump(docs_tokens, f)
# print(time.time() - start_time)

In [3]:
# Class adapted from 
# https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/word2vec.py
class W2VLineSentence:
    def __init__(self, source):
        # Source must be the filename of a file which has one sentence per line,
        # in the format: doc# sentence, where sentence=preprocessed tokens, separated
        # by whitespace.
        self.source = source
        self.limit = None

    def __iter__(self):
        """Iterate through the lines in the source."""
        # If it didn't work like a file, use it as a string filename
        with utils.open(self.source, 'rb') as fin:
            for line in itertools.islice(fin, self.limit):
                line = utils.to_unicode(line).split()
                yield line[1:]

class ElmoLineSentence:
    def __init__(self, source, separate_sents=True):
        # Source must be the filename of a file which has one sentence per line,
        # in the format: doc# sentence, where sentence=preprocessed tokens, separated
        # by whitespace.
        self.source = source
        self.limit = None
        self.separate_sents = separate_sents

    def __iter__(self):
        """Iterate through the lines in the source."""
        curr_docId = 0
        
        # Assumption: source starts with docId=0.
        with open(self.source, 'r') as f:
            document = []
            for line in f:
                line = utils.to_unicode(line).split()
                docId = int(line[0])
                
                if docId != curr_docId:
                    yield curr_docId, document
                    curr_docId = docId
                    if self.separate_sents:
                        document = [line[1:]]
                    else:
                        document = line[1:]
                else:
                    if self.separate_sents:
                        document.append(line[1:])
                    else:
                        document += line[1:]
            yield curr_docId, document
            return

In [38]:
# Initial experiment: min_count=5, ideally we would want 1 I think.
word2vec_model = Word2Vec(size=300, window=3, min_count=3, workers=4, 
                          sg=0, negative=2, seed=0, iter=3)
sentences = W2VLineSentence('politics.sents')
word2vec_model.build_vocab(sentences)

# For the words in our vocabulary which already have embeddings generated by GoogleNews,
# import those embeddings and use them as a starting point for training/fine-tuning.
word2vec_model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin.gz', 
                                         lockf=1.0, binary=True)

# Calculate number of sentences.
nr_sentences = get_nr_lines('politics.sents')

# Fine-tune the word2vec_model. 
# We do not want the number of epochs to be too high so as not to alter the 
# embeddings imported from GoogleNews too much.
word2vec_model.train(sentences, total_examples=nr_sentences, epochs=5)

# Save the model.
word2vec_model.save("politics.word2vec")

In [4]:
def get_nr_docs(sqlite_file):
    with SqliteDict(sqlite_file) as sql:
        nr_docs = 0
        for ii in sql.keys():
            nr_docs += 1
    return nr_docs

In [42]:
# Transform documents to vectors (using word2vec) and save them.
word2vec_model = Word2Vec.load("politics.word2vec")
df_sql = SqliteDict("df_index.sqlite")

# Load df index into memory.
df_index = dict()
for token in df_sql.keys():
    df_index[token] = df_sql[token]

els = ElmoLineSentence('politics.sents', separate_sents=False)
    
# Convert documents to vectors using word2vec.
vectors = SqliteDict("politics_vectors.sqlite")

# Get number of documents in the corpus:
nr_docs = get_nr_docs("politics.sqlite")

count = 0
start_time = time.time()
with open("politics.sents", 'r') as f:
    for docId, tokens in els:
        vector = np.zeros((300, ))
        doc_length = len(tokens)
        for token in tokens:
            if token in word2vec_model:
                df = df_index[token]
                vector += np.log10(nr_docs/df) * word2vec_model[token]
            else:
                doc_length -= 1

        # Normalise vector entries.
        if doc_length == 0:
            vector = vector/1
        else:
            vector = vector/doc_length
        
        docId_dict = dict()
        docId_dict['w2v'] = vector
        vectors[docId] = docId_dict
        
        if count % 100 == 0:
            vectors.commit()
            print(str(docId) + " " + str(time.time() - start_time))
        
        count += 1

# Commit last entries.
vectors.commit()
        
df_sql.close()
vectors.close()
print(time.time() - start_time)

  if token in word2vec_model:
  vector += np.log10(nr_docs/df) * word2vec_model[token]


0 0.004602670669555664
100 0.49768972396850586
200 0.9542901515960693
300 1.5187957286834717
400 2.438721179962158
500 2.9578816890716553
600 3.3784685134887695
700 3.8270697593688965
800 4.610992908477783
900 5.186288356781006
1000 8.013277292251587
1100 8.573341131210327
1200 9.043186664581299
1300 9.448086261749268
1400 9.9193115234375
1500 10.51247525215149
1600 11.000364303588867
1700 11.390458583831787
1800 11.768378496170044
1900 12.229432821273804
2000 12.687273025512695
2100 13.244320392608643
2200 13.725260496139526
2300 14.249815464019775
2400 14.86062240600586
2500 15.386181116104126
2600 15.853943109512329
2700 16.33879256248474
2800 16.963652849197388
2900 17.721566915512085
3000 18.454695224761963
3100 19.115296363830566
3200 19.596999883651733
3300 20.14169931411743
3400 20.86271095275879
3500 21.84165620803833
3600 22.65983295440674
3700 23.39417862892151
3800 23.959585666656494
3900 24.450259923934937
4000 24.98301076889038
4100 25.618056535720825
4200 26.083735942840

34100 161.36410927772522
34200 161.76437187194824
34300 162.15855169296265
34400 162.5714976787567
34500 162.9767837524414
34600 163.3980712890625
34700 163.91906809806824
34800 164.33352756500244
34900 164.70718502998352
35000 165.08547019958496
35100 165.50330233573914
35200 165.91762685775757
35300 166.3436803817749
35400 166.75294137001038
35500 167.20336365699768
35600 167.59461569786072
35700 168.01232290267944
35800 168.4444122314453
35900 168.8052532672882
36000 169.26442098617554
36100 169.6464328765869
36200 170.08830571174622
36300 170.58060026168823
36400 171.06237959861755
36500 171.52991342544556
36600 172.01834797859192
36700 172.45184087753296
36800 172.91686868667603
36900 173.35256099700928
37000 173.78215408325195
37100 174.30088353157043
37200 174.74257683753967
37300 175.14485239982605
37400 175.59156966209412
37500 175.9954035282135
37600 176.4458146095276
37700 176.86000609397888
37800 177.25681042671204
37900 177.7032868862152
38000 178.106947183609
38100 178.53

67604 350.3800642490387
67704 350.8206961154938
67804 351.2988829612732
67904 351.8230769634247
68004 352.2564024925232
68104 352.69617533683777
68204 353.17829275131226
68304 353.6075186729431
68404 354.0770902633667
68504 354.5133876800537
68604 354.6476535797119
68704 354.7590615749359
68804 354.82871174812317
68904 354.89321851730347
69004 354.9401228427887
69105 354.98598074913025
69205 355.0422987937927
69305 355.0928828716278
69405 355.20130372047424
69505 355.6215488910675
69605 356.05250787734985
69705 356.47217202186584
69805 356.89117646217346
69905 357.3209800720215
70005 357.72979259490967
70105 358.1159608364105
70205 358.629825592041
70305 359.101539850235
70405 359.5648422241211
70505 360.04937505722046
70605 360.53103399276733
70705 360.9802541732788
70805 361.4521207809448
70905 361.98039388656616
71005 362.4504177570343
71105 362.9799225330353
71205 363.50754976272583
71305 363.96879625320435
71405 364.546550989151
71505 365.0063257217407
71605 365.60927271842957
717

101405 608.5373604297638
101505 609.486166715622
101605 610.4757339954376
101705 611.4518728256226
101805 612.308468580246
101905 613.1750338077545
102005 614.0760962963104
102105 614.9514493942261
102205 615.7041437625885
102305 616.585967540741
102405 617.5692114830017
102505 618.4089779853821
102605 619.5032832622528
102705 620.5322322845459
102805 621.6652355194092
102905 622.6048200130463
103005 623.5140545368195
103105 624.4909057617188
103205 625.3742909431458
103305 626.3497843742371
103405 627.1520564556122
103505 627.9675176143646
103605 628.7655704021454
103705 629.6247191429138
103805 630.5386443138123
103905 631.3040828704834
104005 632.1404252052307
104105 633.0666761398315
104205 633.851359128952
104305 634.718649148941
104405 635.4829001426697
104505 636.3845012187958
104605 637.2606942653656
104705 638.0150623321533
104805 639.0131645202637
104905 639.9377520084381
105005 640.8581483364105
105105 641.6783192157745
105205 642.6083695888519
105305 643.4390940666199
10540

In [24]:
sentences = [['First', 'sentence', '.'], ['Another', '.'], ['A', 'third', 'sentence', '.']]
character_ids = batch_to_ids(sentences)
embeddings = elmo(character_ids)

In [42]:
np.array(embeddings['elmo_representations'][0][0][3].detach()) + np.ones((256, 1))

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [5]:
# Transform documents to vectors (using Elmo) and save them.
weights_file = "./elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
options_file = "./elmo_2x1024_128_2048cnn_1xhighway_options.json"

elmo = Elmo(options_file, weights_file, 1, dropout=0)

df_sql = SqliteDict("df_index.sqlite")
# Load df index into memory.
df_index = dict()
for token in df_sql.keys():
    df_index[token] = df_sql[token]



In [6]:
# Get number of documents in the corpus:
nr_docs = get_nr_docs("politics.sqlite")

In [7]:
els = ElmoLineSentence('politics.sents', separate_sents=True)

In [8]:
# Convert documents to vectors using Elmo.
vectors = SqliteDict("politics_vectors.sqlite")

count = 1
start_time = time.time()
with open("politics.sents", 'r') as f:
    for docId, sents in els:
        vector = np.zeros((256, ))
        gc.collect()
        
        character_ids = batch_to_ids(sents)
        embeddings = elmo(character_ids)
        array = np.array(embeddings['elmo_representations'][0].detach())
        doc_length = 0
        for sent_nr, sent in enumerate(sents):
            for token_nr, token in enumerate(sent):
                token_df = df_index.get(token, 0)
                vector += (1+np.log10(nr_docs/(token_df+1))) * array[sent_nr][token_nr]
                doc_length += 1

        # Normalise vector entries.
        if doc_length == 0:
            vector = vector/1
        else:
            vector = vector/doc_length
            
        docId_dict = vectors.get(docId, dict())
        docId_dict['elmo'] = vector
        vectors[docId] = docId_dict
        
        if count % 10 == 0:
            vectors.commit()
            print(str(docId) + " " + str(time.time() - start_time))
        
        if count % 100 == 0:
            break
        
        count += 1

# Commit last entries.
vectors.commit()

df_sql.close()
vectors.close()
print(time.time() - start_time)

9 33.032753229141235
19 72.33619666099548
29 136.41936230659485
39 184.2544915676117
49 239.27588748931885
59 285.9612548351288
69 313.01683235168457
79 356.3260838985443
89 372.58989548683167
99 399.8682713508606
399.8708965778351


150

In [12]:
for docId, sent in els:
    break

In [14]:
for s in sent:
    print(s)

['thom', 'yorke', 'tony', 'blair', 'adviser', 'force', 'meet', 'pm']
['thom', 'yorke', 'claim', 'adviser', 'tony', 'blair', 'force', 'meet', 'prime', 'minister']
['radiohead', 'singer', 'comment', 'interview', 'guardian', 'columnist', 'george', 'monbiot', 'french', 'online', 'magazine', 'télérama', 'refer', 'time', 'spokesman', 'big', 'climate', 'change', 'campaign', 'friends', 'earth']
['yorke', 'politician', 'acknowledge', 'existence', 'climate', 'change', 'run', 'difficulty']
['big', 'fight', 'agree', 'meet', 'prime', 'minister', 'friends', 'earth', 'deny', 'access', 'iraq', 'war']
['feel', 'morally', 'unacceptable', 'photograph', 'blair', 'spokesperson', 'blair', 'suggestion', 'preposterous', 'nonsense', 'yorke', 'write', 'guardian', 'habit', 'meet', 'politician', 'activism']
['lengthy', 'interview', 'yorke', 'talk', 'radiohead', 'commitment', 'carbon', 'neutral', 'tour', 'describe', 'piss', 'wind']
['radiohead', 'people', 'turn', 'happy', 'play', 'pron', 'venue', 'area', 'promoter

In [None]:
start_time = time.time()
character_ids = batch_to_ids(sents)
embeddings = elmo(character_ids)
print(time.time() - start_time)

In [70]:
vectors = SqliteDict("politics_vectors2.sqlite")
print(len(list(vectors.keys())))

1


In [49]:
def cosine_similarity(u, v):
    numerator = np.dot(u,v)
    denominator = np.sqrt(np.sum(u**2)) * np.sqrt(np.sum(v**2))
    if denominator == 0:
        return -1
    return numerator/denominator

In [84]:
# How long does it take to go through each document + generate an ordered list?
query = "brexit date"

start_time = time.time()
query_tokens = preprocessor.process_text_lines([query])
query_vector = np.zeros((300, ))
query_len = 0
for token in query_tokens:
    if token in df_index and token in word2vec_model:
        query_vector += (1/df_index[token]) * word2vec_model[token]
        query_len += 1
if query_len != 0:
    query_vector = query_vector/query_len

politics = SqliteDict("politics.sqlite")
politics_vectors = SqliteDict("politics_vectors.sqlite")

sims_docs = []
start_time = time.time()
for doc_id in politics_vectors.keys():
    heapq.heappush(sims_docs, (-cosine_similarity(query_vector, politics_vectors[str(doc_id)]), doc_id))

ans = []
for ii in range(len(sims_docs)):
    ans.append(heapq.heappop(sims_docs))

print(ans[:10])
print(time.time() - start_time)

  if token in df_index and token in word2vec_model:
  query_vector += (1/df_index[token]) * word2vec_model[token]


[(-0.6533943018313252, '50833'), (-0.6529102171398004, '8220'), (-0.647565086539182, '8269'), (-0.6327006923809433, '51714'), (-0.6303928050856873, '57545'), (-0.629913077371776, '57737'), (-0.6294237839179379, '57330'), (-0.6279802207569656, '8250'), (-0.6205684938290468, '57983'), (-0.6192380441355568, '8223')]
34.7034010887146


In [54]:
print(len(list(politics.keys())))

108110


In [62]:
print(politics['8220'])

{'type': 'interactive', 'sectionId': 'politics', 'sectionName': 'Politics', 'webPublicationDate': '2019-02-27T20:29:07Z', 'webTitle': 'How did your MP vote on the latest Brexit amendments?', 'webUrl': 'https://www.theguardian.com/politics/ng-interactive/2019/feb/27/how-did-your-mp-vote-on-the-latest-brexit-amendments', 'apiUrl': 'https://content.guardianapis.com/politics/ng-interactive/2019/feb/27/how-did-your-mp-vote-on-the-latest-brexit-amendments', 'fields': {'headline': 'How did your MP vote on the latest Brexit amendments?', 'wordcount': '0', 'thumbnail': 'https://media.guim.co.uk/f0850e2614f9321e9f10d7524b5b38421d15ff1d/0_0_1000_600/500.jpg', 'bodyText': ''}, 'isHosted': False, 'pillarId': 'pillar/news', 'pillarName': 'News', 'canonical_id': 'politics/ng-interactive/2019/feb/27/how-did-your-mp-vote-on-the-latest-brexit-amendments'}


In [78]:
# Create heuristic doc index.
# Parameters:
#b1
nr_closest_leaders = 3
#b2
nr_leaders_query = 3

doc_IDs = list(politics_vectors.keys())
N = len(doc_IDs)
nr_leaders = math.floor(np.sqrt(N))
leader_IDs = np.random.choice(doc_IDs, nr_leaders, replace=False)

followers = dict()
leaders_vectors = dict()

for leader_ID in leader_IDs:
    followers[leader_ID] = set()
    leaders_vectors[leader_ID] = politics_vectors[leader_ID]
    
start_time = time.time()
count = 0
for doc_ID in doc_IDs:
    if doc_ID in followers.keys():
        continue
    else:
        sims_docs = []
        follower_vector = politics_vectors[doc_ID]
        for leader_ID in leader_IDs:
            sims_docs.append((cosine_similarity(follower_vector, 
                                                leaders_vectors[leader_ID]), leader_ID))
        closest_leaders = sorted(sims_docs)[-nr_closest_leaders:]
        for _, leader_ID in closest_leaders:
            followers[leader_ID].add(doc_ID)
        if count % 100 == 0:
            print(str(doc_ID) + " " + str(time.time()-start_time))
        count += 1


print(time.time() - start_time)
avg_cluster_size = 0
for leader_ID in leader_IDs:
    avg_cluster_size += len(followers[leader_ID])/len(followers)
print(avg_cluster_size)

0 0.006788730621337891
100 0.663067102432251
201 1.3555762767791748
301 2.030191421508789
401 2.7216131687164307
501 3.410062074661255
601 4.1041481494903564
701 4.795581817626953
802 5.492031812667847
902 6.176883935928345
1003 6.8808205127716064
1103 7.581945419311523
1203 8.293373584747314
1303 8.985727310180664
1403 9.673619031906128
1503 10.334176540374756
1604 11.045413255691528
1704 11.759815692901611
1806 12.45730710029602
1906 13.153631925582886
2007 13.844558715820312
2107 14.549122333526611
2207 15.235826015472412
2307 15.937775135040283
2407 16.660672664642334
2507 17.380828142166138
2608 18.080174684524536
2708 18.77803325653076
2808 19.474588632583618
2908 20.16016387939453
3011 20.83468461036682
3112 21.52919602394104
3212 22.19629144668579
3313 22.880873918533325
3413 23.54924201965332
3513 24.237218141555786
3614 24.905752420425415
3716 25.597996473312378
3817 26.27711582183838
3917 26.950460195541382
4017 27.624985456466675
4117 28.290069103240967
4217 28.944379329681

34010 229.75094032287598
34111 230.4225616455078
34211 231.11722826957703
34311 231.80956745147705
34411 232.48346257209778
34511 233.1649158000946
34611 233.8477668762207
34711 234.52474188804626
34811 235.20120882987976
34911 235.89650344848633
35011 236.58918261528015
35111 237.26705265045166
35212 237.952388048172
35312 238.61020350456238
35412 239.29321575164795
35512 239.9836847782135
35612 240.65337777137756
35713 241.34461760520935
35813 242.0432140827179
35913 242.69247341156006
36013 243.37857222557068
36113 244.0644016265869
36213 244.7335124015808
36314 245.40773725509644
36414 246.07980847358704
36515 246.77249431610107
36615 247.42383193969727
36715 248.11245727539062
36815 248.80969977378845
36915 249.5115020275116
37016 250.20053052902222
37116 250.88599491119385
37217 251.5490095615387
37318 252.2098832130432
37418 252.88234496116638
37519 253.5443365573883
37619 254.2258279323578
37719 254.88218307495117
37820 255.58297753334045
37921 256.2493896484375
38021 256.92409

67796 458.96496176719666
67896 459.66169333457947
67996 460.3286757469177
68096 460.9860692024231
68196 461.6544165611267
68297 462.3303062915802
68397 463.02623081207275
68497 463.70078706741333
68597 464.38255190849304
68697 465.0570800304413
68797 465.7440936565399
68897 466.4433114528656
68997 467.11461305618286
69097 467.8114974498749
69197 468.4958884716034
69297 469.15712785720825
69398 469.848197221756
69498 470.53262734413147
69598 471.20079231262207
69698 471.87409114837646
69798 472.55460000038147
69898 473.2224225997925
69998 473.9139587879181
70098 474.60238337516785
70199 475.2510302066803
70299 475.95194602012634
70399 476.61324405670166
70500 477.30745339393616
70600 477.96834230422974
70701 478.64173793792725
70801 479.2959442138672
70901 479.95221519470215
71002 480.6112186908722
71104 481.283273935318
71205 481.9380111694336
71306 482.5954797267914
71406 483.2756495475769
71507 483.9785358905792
71607 484.6526563167572
71708 485.32821798324585
71809 485.9937977790832

102014 691.7185943126678
102114 692.406672000885
102214 693.1091339588165
102315 693.7954182624817
102415 694.4990417957306
102515 695.1919929981232
102615 695.9220721721649
102715 696.6594648361206
102815 697.4023373126984
102915 698.1009647846222
103015 698.7856171131134
103115 699.5440490245819
103215 700.2319121360779
103316 700.9144606590271
103416 701.6263229846954
103516 702.3357622623444
103616 703.0728807449341
103716 703.7666807174683
103817 704.4526188373566
103917 705.1195721626282
104017 705.828094959259
104117 706.5270729064941
104217 707.2101602554321
104317 707.8900671005249
104417 708.5749561786652
104518 709.2842552661896
104618 709.9717841148376
104718 710.6674404144287
104818 711.3699798583984
104918 712.0717976093292
105019 712.7614712715149
105119 713.4458434581757
105220 714.1874837875366
105321 714.9320471286774
105421 715.6392438411713
105521 716.3029000759125
105622 716.9740462303162
105722 717.6824641227722
105822 718.3588154315948
105922 719.0409350395203
10

In [82]:
# Another improvement. Instead of going through the list + sorting it afterwards,
# add elements in-order (we shave off an O(n)).

query = "brexit date"

start_time = time.time()

query_tokens = preprocessor.process_text_lines([query])
query_vector = np.zeros((300, ))
query_len = 0
for token in query_tokens:
    if token in df_index and token in word2vec_model:
        query_vector += (1/df_index[token]) * word2vec_model[token]
        query_len += 1
if query_len != 0:
    query_vector = query_vector/query_len

sims_docs = []
for leader_ID in leader_IDs:
    heapq.heappush(sims_docs, (-cosine_similarity(query_vector, leaders_vectors[leader_ID]), 
                               leader_ID))

most_sim_leaders = []
for ii in range(nr_leaders_query):
    s, leader_ID = heapq.heappop(sims_docs)
    most_sim_leaders.append((s, leader_ID))

sims_docs = []

follower_IDs = set()
for s, leader_ID in most_sim_leaders:
    sims_docs.append((s, leader_ID))
    follower_IDs = follower_IDs.union(followers[leader_ID])
for follower_ID in follower_IDs:
    heapq.heappush(sims_docs, 
                   (-cosine_similarity(query_vector, 
                                       politics_vectors[follower_ID]), follower_ID))

answer_IDs = []
for ii in range(len(sims_docs)):
    answer_IDs.append(heapq.heappop(sims_docs))

print(len(answer_IDs))
print(time.time()-start_time)
print(answer_IDs[:10])

  if token in df_index and token in word2vec_model:
  query_vector += (1/df_index[token]) * word2vec_model[token]


5171
1.6412711143493652
[(-0.6533943018313252, '50833'), (-0.6529102171398004, '8220'), (-0.647565086539182, '8269'), (-0.6327006923809433, '51714'), (-0.6303928050856873, '57545'), (-0.629913077371776, '57737'), (-0.6294237839179379, '57330'), (-0.6279802207569656, '8250'), (-0.6205684938290468, '57983'), (-0.6192380441355568, '8223')]


In [83]:
print(politics['50833'])

{'type': 'article', 'sectionId': 'politics', 'sectionName': 'Politics', 'webPublicationDate': '2017-10-13T08:16:52Z', 'webTitle': "Brexit, 'no deal' and the Tories: who said what?", 'webUrl': 'https://www.theguardian.com/politics/2017/oct/13/brexit-no-deal-and-the-tories-who-said-what', 'apiUrl': 'https://content.guardianapis.com/politics/2017/oct/13/brexit-no-deal-and-the-tories-who-said-what', 'fields': {'headline': "Brexit, 'no deal' and the Tories: who said what?", 'wordcount': '0', 'thumbnail': 'https://media.guim.co.uk/9270de646e2b1acdf77d608cfbf9bbd7f4a5e805/0_148_4444_2667/500.jpg', 'bodyText': ''}, 'isHosted': False, 'pillarId': 'pillar/news', 'pillarName': 'News', 'canonical_id': 'politics/2017/oct/13/brexit-no-deal-and-the-tories-who-said-what'}


In [36]:
for a in enumerate(range(1, 10)):
    print(a)

(0, 1)
(1, 2)
(2, 3)
(3, 4)
(4, 5)
(5, 6)
(6, 7)
(7, 8)
(8, 9)
