# Generate English Wikipedia Word Vectors

In [1]:
import json
import logging 
import os
import multiprocessing
from datetime import datetime
import pathlib
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Configure the root logger first: INFO and above, rendered as "<LEVEL> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s : %(message)s',
)

# Named logger used for this notebook's own messages.
LOG = logging.getLogger('make_word_vec')

## Parameter suggestions are drawn from the following papers:
* Word2vec applied to Recommendation: Hyperparameters Matter — https://arxiv.org/pdf/1804.04212
* Exploiting Similarities among Languages for Machine Translation — https://arxiv.org/abs/1309.4168
* Distributed Representations of Words and Phrases and their Compositionality — https://arxiv.org/abs/1310.4546

In [3]:
# Label for the corpus variant being trained on (not referenced again in this cell).
corpus_characteristics = 'processed'

# The corpus lives in a sibling directory of the notebook's working directory,
# so resolve the parent of the current working directory. Kept as a plain str.
parent_dir = str(pathlib.Path.cwd().parent)

# Full path of the pre-processed Wikipedia corpus file.
corpus_filename = os.path.join(
    parent_dir,
    'wikipedia_corpus_processing',
    'wikimedia.en.processed.cor',
)

In [4]:
# Hyperparameters passed straight through to gensim's Word2Vec constructor.
# NOTE(review): `size` and `iter` are the gensim 3.x parameter names; gensim 4
# renamed them to `vector_size` and `epochs` -- confirm the installed version
# before upgrading.
keyword_params = {
    'size': 300,
    'iter': 3,
    # Ignore all words with a total frequency lower than this (gensim's
    # default is 5). A value of 3 was tried earlier to reach a vocabulary of
    # roughly 3 million words; the cap below now bounds the vocabulary instead.
    'min_count': 10,
    'max_final_vocab': 3000000,
    'ns_exponent': 0.75, # the default, optimal for linguistic tasks; also try -0.5 for recommenders
    'alpha':  0.025,
    'min_alpha': 0.004,
    'sg': 1, # skip gram
    'window': 10, # number of surrounding words to consider
    # Leave one core free for the rest of the system, but never request fewer
    # than one worker: on a single-CPU machine `cpu_count() - 1` would be 0,
    # which leaves gensim with no training threads.
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'negative': 15, # 15 may be best
    # We don't use down sampling, because our corpus has been preprocessed and
    # stopwords removed. gensim's default is sample=0.001; a smaller threshold
    # such as 1e-05 down-samples frequent words more aggressively.
    'sample': 0
}
LOG.info('Creating vector with parameters: %s', json.dumps(keyword_params))
# Vocabulary scan and training both run inside the constructor when a corpus
# is supplied, so this cell is the long-running step of the notebook.
eng_wikipedia = Word2Vec(corpus_file=corpus_filename, **keyword_params)

INFO : Creating vector with parameters: {"size": 300, "iter": 3, "min_count": 10, "max_final_vocab": 3000000, "ns_exponent": 0.75, "alpha": 0.025, "min_alpha": 0.004, "sg": 1, "window": 10, "workers": 7, "negative": 15, "sample": 0}
INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 120890 words, keeping 23697 word types
INFO : PROGRESS: at sentence #20000, processed 240345 words, keeping 35662 word types
INFO : PROGRESS: at sentence #30000, processed 365441 words, keeping 43774 word types
INFO : PROGRESS: at sentence #40000, processed 487458 words, keeping 51645 word types
INFO : PROGRESS: at sentence #50000, processed 606271 words, keeping 59443 word types
INFO : PROGRESS: at sentence #60000, processed 729964 words, keeping 66927 word types
INFO : PROGRESS: at sentence #70000, processed 849658 words, keeping 74053 word types
INFO : PROGRESS: at sentence #80000, processed 

INFO : PROGRESS: at sentence #900000, processed 10741580 words, keeping 361754 word types
INFO : PROGRESS: at sentence #910000, processed 10859919 words, keeping 364089 word types
INFO : PROGRESS: at sentence #920000, processed 10972508 words, keeping 366602 word types
INFO : PROGRESS: at sentence #930000, processed 11080044 words, keeping 369425 word types
INFO : PROGRESS: at sentence #940000, processed 11192640 words, keeping 372148 word types
INFO : PROGRESS: at sentence #950000, processed 11299110 words, keeping 374829 word types
INFO : PROGRESS: at sentence #960000, processed 11406411 words, keeping 377511 word types
INFO : PROGRESS: at sentence #970000, processed 11516110 words, keeping 380251 word types
INFO : PROGRESS: at sentence #980000, processed 11633748 words, keeping 382790 word types
INFO : PROGRESS: at sentence #990000, processed 11754760 words, keeping 385152 word types
INFO : PROGRESS: at sentence #1000000, processed 11873782 words, keeping 387403 word types
INFO : PR

INFO : PROGRESS: at sentence #1810000, processed 21219062 words, keeping 574910 word types
INFO : PROGRESS: at sentence #1820000, processed 21336414 words, keeping 577008 word types
INFO : PROGRESS: at sentence #1830000, processed 21452799 words, keeping 579197 word types
INFO : PROGRESS: at sentence #1840000, processed 21570735 words, keeping 581400 word types
INFO : PROGRESS: at sentence #1850000, processed 21686638 words, keeping 583483 word types
INFO : PROGRESS: at sentence #1860000, processed 21799641 words, keeping 585474 word types
INFO : PROGRESS: at sentence #1870000, processed 21919395 words, keeping 587106 word types
INFO : PROGRESS: at sentence #1880000, processed 22033998 words, keeping 588972 word types
INFO : PROGRESS: at sentence #1890000, processed 22146225 words, keeping 591467 word types
INFO : PROGRESS: at sentence #1900000, processed 22260421 words, keeping 593644 word types
INFO : PROGRESS: at sentence #1910000, processed 22378379 words, keeping 595333 word types

INFO : PROGRESS: at sentence #2720000, processed 31764502 words, keeping 745513 word types
INFO : PROGRESS: at sentence #2730000, processed 31882868 words, keeping 747176 word types
INFO : PROGRESS: at sentence #2740000, processed 31998842 words, keeping 748845 word types
INFO : PROGRESS: at sentence #2750000, processed 32114656 words, keeping 750409 word types
INFO : PROGRESS: at sentence #2760000, processed 32233080 words, keeping 752159 word types
INFO : PROGRESS: at sentence #2770000, processed 32351941 words, keeping 753682 word types
INFO : PROGRESS: at sentence #2780000, processed 32469288 words, keeping 755391 word types
INFO : PROGRESS: at sentence #2790000, processed 32588645 words, keeping 756909 word types
INFO : PROGRESS: at sentence #2800000, processed 32709474 words, keeping 758714 word types
INFO : PROGRESS: at sentence #2810000, processed 32826533 words, keeping 760460 word types
INFO : PROGRESS: at sentence #2820000, processed 32944613 words, keeping 761970 word types

INFO : PROGRESS: at sentence #3630000, processed 42441068 words, keeping 886458 word types
INFO : PROGRESS: at sentence #3640000, processed 42558046 words, keeping 887947 word types
INFO : PROGRESS: at sentence #3650000, processed 42673984 words, keeping 889495 word types
INFO : PROGRESS: at sentence #3660000, processed 42791093 words, keeping 890828 word types
INFO : PROGRESS: at sentence #3670000, processed 42909510 words, keeping 892098 word types
INFO : PROGRESS: at sentence #3680000, processed 43025987 words, keeping 893407 word types
INFO : PROGRESS: at sentence #3690000, processed 43141000 words, keeping 894642 word types
INFO : PROGRESS: at sentence #3700000, processed 43257139 words, keeping 895977 word types
INFO : PROGRESS: at sentence #3710000, processed 43375862 words, keeping 897392 word types
INFO : PROGRESS: at sentence #3720000, processed 43490335 words, keeping 898874 word types
INFO : PROGRESS: at sentence #3730000, processed 43611114 words, keeping 900155 word types

INFO : PROGRESS: at sentence #4530000, processed 53273556 words, keeping 1007424 word types
INFO : PROGRESS: at sentence #4540000, processed 53389681 words, keeping 1008880 word types
INFO : PROGRESS: at sentence #4550000, processed 53502470 words, keeping 1010545 word types
INFO : PROGRESS: at sentence #4560000, processed 53618082 words, keeping 1011936 word types
INFO : PROGRESS: at sentence #4570000, processed 53737851 words, keeping 1013639 word types
INFO : PROGRESS: at sentence #4580000, processed 53852214 words, keeping 1015318 word types
INFO : PROGRESS: at sentence #4590000, processed 53966665 words, keeping 1016859 word types
INFO : PROGRESS: at sentence #4600000, processed 54079500 words, keeping 1018577 word types
INFO : PROGRESS: at sentence #4610000, processed 54197468 words, keeping 1020060 word types
INFO : PROGRESS: at sentence #4620000, processed 54311983 words, keeping 1021463 word types
INFO : PROGRESS: at sentence #4630000, processed 54429751 words, keeping 1022856

INFO : PROGRESS: at sentence #5430000, processed 63915029 words, keeping 1131679 word types
INFO : PROGRESS: at sentence #5440000, processed 64041312 words, keeping 1132682 word types
INFO : PROGRESS: at sentence #5450000, processed 64163219 words, keeping 1133738 word types
INFO : PROGRESS: at sentence #5460000, processed 64284809 words, keeping 1134956 word types
INFO : PROGRESS: at sentence #5470000, processed 64408758 words, keeping 1135949 word types
INFO : PROGRESS: at sentence #5480000, processed 64529206 words, keeping 1137350 word types
INFO : PROGRESS: at sentence #5490000, processed 64654202 words, keeping 1138568 word types
INFO : PROGRESS: at sentence #5500000, processed 64775309 words, keeping 1139666 word types
INFO : PROGRESS: at sentence #5510000, processed 64901513 words, keeping 1140908 word types
INFO : PROGRESS: at sentence #5520000, processed 65023295 words, keeping 1142245 word types
INFO : PROGRESS: at sentence #5530000, processed 65144709 words, keeping 1143487

INFO : PROGRESS: at sentence #6330000, processed 74504450 words, keeping 1248294 word types
INFO : PROGRESS: at sentence #6340000, processed 74620309 words, keeping 1249684 word types
INFO : PROGRESS: at sentence #6350000, processed 74734707 words, keeping 1250784 word types
INFO : PROGRESS: at sentence #6360000, processed 74849208 words, keeping 1252124 word types
INFO : PROGRESS: at sentence #6370000, processed 74965717 words, keeping 1253573 word types
INFO : PROGRESS: at sentence #6380000, processed 75080315 words, keeping 1254888 word types
INFO : PROGRESS: at sentence #6390000, processed 75193078 words, keeping 1256128 word types
INFO : PROGRESS: at sentence #6400000, processed 75307659 words, keeping 1257298 word types
INFO : PROGRESS: at sentence #6410000, processed 75423126 words, keeping 1258346 word types
INFO : PROGRESS: at sentence #6420000, processed 75536809 words, keeping 1259647 word types
INFO : PROGRESS: at sentence #6430000, processed 75654049 words, keeping 1261082

INFO : PROGRESS: at sentence #7230000, processed 85277464 words, keeping 1349868 word types
INFO : PROGRESS: at sentence #7240000, processed 85400890 words, keeping 1351140 word types
INFO : PROGRESS: at sentence #7250000, processed 85530226 words, keeping 1352166 word types
INFO : PROGRESS: at sentence #7260000, processed 85657593 words, keeping 1353222 word types
INFO : PROGRESS: at sentence #7270000, processed 85783968 words, keeping 1353998 word types
INFO : PROGRESS: at sentence #7280000, processed 85911577 words, keeping 1354974 word types
INFO : PROGRESS: at sentence #7290000, processed 86037377 words, keeping 1355930 word types
INFO : PROGRESS: at sentence #7300000, processed 86161664 words, keeping 1356835 word types
INFO : PROGRESS: at sentence #7310000, processed 86286549 words, keeping 1357913 word types
INFO : PROGRESS: at sentence #7320000, processed 86416525 words, keeping 1358900 word types
INFO : PROGRESS: at sentence #7330000, processed 86544230 words, keeping 1359814

INFO : PROGRESS: at sentence #8130000, processed 96375836 words, keeping 1439354 word types
INFO : PROGRESS: at sentence #8140000, processed 96494891 words, keeping 1440268 word types
INFO : PROGRESS: at sentence #8150000, processed 96613520 words, keeping 1441165 word types
INFO : PROGRESS: at sentence #8160000, processed 96731785 words, keeping 1442249 word types
INFO : PROGRESS: at sentence #8170000, processed 96852786 words, keeping 1443155 word types
INFO : PROGRESS: at sentence #8180000, processed 96971971 words, keeping 1444295 word types
INFO : PROGRESS: at sentence #8190000, processed 97087522 words, keeping 1445419 word types
INFO : PROGRESS: at sentence #8200000, processed 97207425 words, keeping 1446460 word types
INFO : PROGRESS: at sentence #8210000, processed 97327913 words, keeping 1447492 word types
INFO : PROGRESS: at sentence #8220000, processed 97445354 words, keeping 1448597 word types
INFO : PROGRESS: at sentence #8230000, processed 97565430 words, keeping 1449488

INFO : PROGRESS: at sentence #9020000, processed 106780020 words, keeping 1540648 word types
INFO : PROGRESS: at sentence #9030000, processed 106910725 words, keeping 1541303 word types
INFO : PROGRESS: at sentence #9040000, processed 107034760 words, keeping 1542067 word types
INFO : PROGRESS: at sentence #9050000, processed 107158106 words, keeping 1542969 word types
INFO : PROGRESS: at sentence #9060000, processed 107288954 words, keeping 1543777 word types
INFO : PROGRESS: at sentence #9070000, processed 107415237 words, keeping 1544539 word types
INFO : PROGRESS: at sentence #9080000, processed 107543389 words, keeping 1545528 word types
INFO : PROGRESS: at sentence #9090000, processed 107673458 words, keeping 1546159 word types
INFO : PROGRESS: at sentence #9100000, processed 107800476 words, keeping 1546982 word types
INFO : PROGRESS: at sentence #9110000, processed 107928201 words, keeping 1547697 word types
INFO : PROGRESS: at sentence #9120000, processed 108053603 words, keep

INFO : PROGRESS: at sentence #9910000, processed 117937697 words, keeping 1617752 word types
INFO : PROGRESS: at sentence #9920000, processed 118065380 words, keeping 1618507 word types
INFO : PROGRESS: at sentence #9930000, processed 118189904 words, keeping 1619423 word types
INFO : PROGRESS: at sentence #9940000, processed 118312844 words, keeping 1620444 word types
INFO : PROGRESS: at sentence #9950000, processed 118440378 words, keeping 1621363 word types
INFO : PROGRESS: at sentence #9960000, processed 118566208 words, keeping 1622278 word types
INFO : PROGRESS: at sentence #9970000, processed 118690570 words, keeping 1623326 word types
INFO : PROGRESS: at sentence #9980000, processed 118813275 words, keeping 1624279 word types
INFO : PROGRESS: at sentence #9990000, processed 118939798 words, keeping 1625026 word types
INFO : PROGRESS: at sentence #10000000, processed 119069097 words, keeping 1625768 word types
INFO : PROGRESS: at sentence #10010000, processed 119194436 words, ke

INFO : PROGRESS: at sentence #10790000, processed 128679939 words, keeping 1698558 word types
INFO : PROGRESS: at sentence #10800000, processed 128800504 words, keeping 1699376 word types
INFO : PROGRESS: at sentence #10810000, processed 128922474 words, keeping 1700291 word types
INFO : PROGRESS: at sentence #10820000, processed 129046015 words, keeping 1701199 word types
INFO : PROGRESS: at sentence #10830000, processed 129163727 words, keeping 1702089 word types
INFO : PROGRESS: at sentence #10840000, processed 129283921 words, keeping 1702905 word types
INFO : PROGRESS: at sentence #10850000, processed 129404669 words, keeping 1703777 word types
INFO : PROGRESS: at sentence #10860000, processed 129525481 words, keeping 1704682 word types
INFO : PROGRESS: at sentence #10870000, processed 129646030 words, keeping 1705601 word types
INFO : PROGRESS: at sentence #10880000, processed 129771974 words, keeping 1706472 word types
INFO : PROGRESS: at sentence #10890000, processed 129894849 

INFO : PROGRESS: at sentence #11670000, processed 139525662 words, keeping 1772627 word types
INFO : PROGRESS: at sentence #11680000, processed 139645765 words, keeping 1773594 word types
INFO : PROGRESS: at sentence #11690000, processed 139767854 words, keeping 1774504 word types
INFO : PROGRESS: at sentence #11700000, processed 139889141 words, keeping 1775368 word types
INFO : PROGRESS: at sentence #11710000, processed 140008135 words, keeping 1776275 word types
INFO : PROGRESS: at sentence #11720000, processed 140128719 words, keeping 1777169 word types
INFO : PROGRESS: at sentence #11730000, processed 140252050 words, keeping 1778078 word types
INFO : PROGRESS: at sentence #11740000, processed 140371977 words, keeping 1779040 word types
INFO : PROGRESS: at sentence #11750000, processed 140491103 words, keeping 1779999 word types
INFO : PROGRESS: at sentence #11760000, processed 140612404 words, keeping 1780803 word types
INFO : PROGRESS: at sentence #11770000, processed 140733239 

INFO : PROGRESS: at sentence #12550000, processed 149945859 words, keeping 1856951 word types
INFO : PROGRESS: at sentence #12560000, processed 150063035 words, keeping 1857954 word types
INFO : PROGRESS: at sentence #12570000, processed 150177453 words, keeping 1858825 word types
INFO : PROGRESS: at sentence #12580000, processed 150291285 words, keeping 1859780 word types
INFO : PROGRESS: at sentence #12590000, processed 150410216 words, keeping 1860692 word types
INFO : PROGRESS: at sentence #12600000, processed 150528297 words, keeping 1861127 word types
INFO : PROGRESS: at sentence #12610000, processed 150643764 words, keeping 1862078 word types
INFO : PROGRESS: at sentence #12620000, processed 150761742 words, keeping 1863012 word types
INFO : PROGRESS: at sentence #12630000, processed 150882035 words, keeping 1863967 word types
INFO : PROGRESS: at sentence #12640000, processed 151001134 words, keeping 1865040 word types
INFO : PROGRESS: at sentence #12650000, processed 151116469 

INFO : PROGRESS: at sentence #13430000, processed 160569314 words, keeping 1929121 word types
INFO : PROGRESS: at sentence #13440000, processed 160690514 words, keeping 1929942 word types
INFO : PROGRESS: at sentence #13450000, processed 160808101 words, keeping 1930711 word types
INFO : PROGRESS: at sentence #13460000, processed 160915395 words, keeping 1931566 word types
INFO : PROGRESS: at sentence #13470000, processed 161030232 words, keeping 1932573 word types
INFO : PROGRESS: at sentence #13480000, processed 161146617 words, keeping 1933458 word types
INFO : PROGRESS: at sentence #13490000, processed 161266919 words, keeping 1934304 word types
INFO : PROGRESS: at sentence #13500000, processed 161383997 words, keeping 1935286 word types
INFO : PROGRESS: at sentence #13510000, processed 161500370 words, keeping 1936051 word types
INFO : PROGRESS: at sentence #13520000, processed 161617460 words, keeping 1937056 word types
INFO : PROGRESS: at sentence #13530000, processed 161732981 

INFO : PROGRESS: at sentence #14310000, processed 170755396 words, keeping 2008643 word types
INFO : PROGRESS: at sentence #14320000, processed 170871505 words, keeping 2009537 word types
INFO : PROGRESS: at sentence #14330000, processed 170981878 words, keeping 2010653 word types
INFO : PROGRESS: at sentence #14340000, processed 171100193 words, keeping 2011487 word types
INFO : PROGRESS: at sentence #14350000, processed 171216410 words, keeping 2012567 word types
INFO : PROGRESS: at sentence #14360000, processed 171330272 words, keeping 2013435 word types
INFO : PROGRESS: at sentence #14370000, processed 171444731 words, keeping 2014676 word types
INFO : PROGRESS: at sentence #14380000, processed 171558334 words, keeping 2015935 word types
INFO : PROGRESS: at sentence #14390000, processed 171676701 words, keeping 2017184 word types
INFO : PROGRESS: at sentence #14400000, processed 171789841 words, keeping 2018634 word types
INFO : PROGRESS: at sentence #14410000, processed 171905896 

INFO : PROGRESS: at sentence #15180000, processed 181107178 words, keeping 2085654 word types
INFO : PROGRESS: at sentence #15190000, processed 181223308 words, keeping 2086533 word types
INFO : PROGRESS: at sentence #15200000, processed 181341419 words, keeping 2087465 word types
INFO : PROGRESS: at sentence #15210000, processed 181459121 words, keeping 2088246 word types
INFO : PROGRESS: at sentence #15220000, processed 181581465 words, keeping 2089118 word types
INFO : PROGRESS: at sentence #15230000, processed 181704800 words, keeping 2089817 word types
INFO : PROGRESS: at sentence #15240000, processed 181827024 words, keeping 2090557 word types
INFO : PROGRESS: at sentence #15250000, processed 181945565 words, keeping 2091439 word types
INFO : PROGRESS: at sentence #15260000, processed 182065118 words, keeping 2092198 word types
INFO : PROGRESS: at sentence #15270000, processed 182186974 words, keeping 2092944 word types
INFO : PROGRESS: at sentence #15280000, processed 182307871 

INFO : PROGRESS: at sentence #16060000, processed 191351378 words, keeping 2159991 word types
INFO : PROGRESS: at sentence #16070000, processed 191468344 words, keeping 2160864 word types
INFO : PROGRESS: at sentence #16080000, processed 191585018 words, keeping 2161546 word types
INFO : PROGRESS: at sentence #16090000, processed 191704671 words, keeping 2162374 word types
INFO : PROGRESS: at sentence #16100000, processed 191824434 words, keeping 2163117 word types
INFO : PROGRESS: at sentence #16110000, processed 191942532 words, keeping 2163890 word types
INFO : PROGRESS: at sentence #16120000, processed 192060980 words, keeping 2164718 word types
INFO : PROGRESS: at sentence #16130000, processed 192179224 words, keeping 2165491 word types
INFO : PROGRESS: at sentence #16140000, processed 192295925 words, keeping 2166530 word types
INFO : PROGRESS: at sentence #16150000, processed 192413720 words, keeping 2167240 word types
INFO : PROGRESS: at sentence #16160000, processed 192532857 

INFO : PROGRESS: at sentence #16940000, processed 201881925 words, keeping 2229744 word types
INFO : PROGRESS: at sentence #16950000, processed 202004581 words, keeping 2230398 word types
INFO : PROGRESS: at sentence #16960000, processed 202125096 words, keeping 2231083 word types
INFO : PROGRESS: at sentence #16970000, processed 202244767 words, keeping 2231898 word types
INFO : PROGRESS: at sentence #16980000, processed 202366361 words, keeping 2232591 word types
INFO : PROGRESS: at sentence #16990000, processed 202488260 words, keeping 2233259 word types
INFO : PROGRESS: at sentence #17000000, processed 202609364 words, keeping 2234016 word types
INFO : PROGRESS: at sentence #17010000, processed 202729844 words, keeping 2234796 word types
INFO : PROGRESS: at sentence #17020000, processed 202851574 words, keeping 2235504 word types
INFO : PROGRESS: at sentence #17030000, processed 202972118 words, keeping 2236273 word types
INFO : PROGRESS: at sentence #17040000, processed 203091025 

INFO : PROGRESS: at sentence #17820000, processed 212347795 words, keeping 2297177 word types
INFO : PROGRESS: at sentence #17830000, processed 212465253 words, keeping 2298112 word types
INFO : PROGRESS: at sentence #17840000, processed 212583923 words, keeping 2298891 word types
INFO : PROGRESS: at sentence #17850000, processed 212698553 words, keeping 2299630 word types
INFO : PROGRESS: at sentence #17860000, processed 212819876 words, keeping 2300498 word types
INFO : PROGRESS: at sentence #17870000, processed 212936745 words, keeping 2301458 word types
INFO : PROGRESS: at sentence #17880000, processed 213054256 words, keeping 2302232 word types
INFO : PROGRESS: at sentence #17890000, processed 213173487 words, keeping 2302929 word types
INFO : PROGRESS: at sentence #17900000, processed 213289486 words, keeping 2303913 word types
INFO : PROGRESS: at sentence #17910000, processed 213406393 words, keeping 2304747 word types
INFO : PROGRESS: at sentence #17920000, processed 213521301 

INFO : PROGRESS: at sentence #18690000, processed 222475130 words, keeping 2372570 word types
INFO : PROGRESS: at sentence #18700000, processed 222600556 words, keeping 2373267 word types
INFO : PROGRESS: at sentence #18710000, processed 222725807 words, keeping 2373900 word types
INFO : PROGRESS: at sentence #18720000, processed 222849847 words, keeping 2374527 word types
INFO : PROGRESS: at sentence #18730000, processed 222974481 words, keeping 2375126 word types
INFO : PROGRESS: at sentence #18740000, processed 223102937 words, keeping 2375749 word types
INFO : PROGRESS: at sentence #18750000, processed 223231414 words, keeping 2376314 word types
INFO : PROGRESS: at sentence #18760000, processed 223355020 words, keeping 2376954 word types
INFO : PROGRESS: at sentence #18770000, processed 223474487 words, keeping 2377664 word types
INFO : PROGRESS: at sentence #18780000, processed 223599634 words, keeping 2378234 word types
INFO : PROGRESS: at sentence #18790000, processed 223721473 

INFO : PROGRESS: at sentence #19570000, processed 233302682 words, keeping 2430762 word types
INFO : PROGRESS: at sentence #19580000, processed 233424135 words, keeping 2431541 word types
INFO : PROGRESS: at sentence #19590000, processed 233544480 words, keeping 2432286 word types
INFO : PROGRESS: at sentence #19600000, processed 233662014 words, keeping 2433050 word types
INFO : PROGRESS: at sentence #19610000, processed 233784134 words, keeping 2433680 word types
INFO : PROGRESS: at sentence #19620000, processed 233903817 words, keeping 2434335 word types
INFO : PROGRESS: at sentence #19630000, processed 234022634 words, keeping 2435004 word types
INFO : PROGRESS: at sentence #19640000, processed 234143112 words, keeping 2435859 word types
INFO : PROGRESS: at sentence #19650000, processed 234264395 words, keeping 2436402 word types
INFO : PROGRESS: at sentence #19660000, processed 234385955 words, keeping 2437052 word types
INFO : PROGRESS: at sentence #19670000, processed 234506047 

INFO : PROGRESS: at sentence #20450000, processed 243986040 words, keeping 2485039 word types
INFO : PROGRESS: at sentence #20460000, processed 244110228 words, keeping 2485731 word types
INFO : PROGRESS: at sentence #20470000, processed 244232197 words, keeping 2486459 word types
INFO : PROGRESS: at sentence #20480000, processed 244357246 words, keeping 2486993 word types
INFO : PROGRESS: at sentence #20490000, processed 244480456 words, keeping 2487566 word types
INFO : PROGRESS: at sentence #20500000, processed 244599245 words, keeping 2488229 word types
INFO : PROGRESS: at sentence #20510000, processed 244703562 words, keeping 2488584 word types
INFO : PROGRESS: at sentence #20520000, processed 244825511 words, keeping 2489132 word types
INFO : PROGRESS: at sentence #20530000, processed 244938319 words, keeping 2490084 word types
INFO : PROGRESS: at sentence #20540000, processed 245051762 words, keeping 2491184 word types
INFO : PROGRESS: at sentence #20550000, processed 245167678 

INFO : PROGRESS: at sentence #21330000, processed 254238687 words, keeping 2561088 word types
INFO : PROGRESS: at sentence #21340000, processed 254359193 words, keeping 2561748 word types
INFO : PROGRESS: at sentence #21350000, processed 254480047 words, keeping 2562493 word types
INFO : PROGRESS: at sentence #21360000, processed 254602921 words, keeping 2563187 word types
INFO : PROGRESS: at sentence #21370000, processed 254721955 words, keeping 2563758 word types
INFO : PROGRESS: at sentence #21380000, processed 254844877 words, keeping 2564363 word types
INFO : PROGRESS: at sentence #21390000, processed 254966175 words, keeping 2565092 word types
INFO : PROGRESS: at sentence #21400000, processed 255086265 words, keeping 2565723 word types
INFO : PROGRESS: at sentence #21410000, processed 255211926 words, keeping 2566313 word types
INFO : PROGRESS: at sentence #21420000, processed 255333443 words, keeping 2566909 word types
INFO : PROGRESS: at sentence #21430000, processed 255454388 

INFO : PROGRESS: at sentence #22210000, processed 264790072 words, keeping 2621224 word types
INFO : PROGRESS: at sentence #22220000, processed 264910215 words, keeping 2621929 word types
INFO : PROGRESS: at sentence #22230000, processed 265030082 words, keeping 2622646 word types
INFO : PROGRESS: at sentence #22240000, processed 265148287 words, keeping 2623255 word types
INFO : PROGRESS: at sentence #22250000, processed 265267502 words, keeping 2623989 word types
INFO : PROGRESS: at sentence #22260000, processed 265387263 words, keeping 2624755 word types
INFO : PROGRESS: at sentence #22270000, processed 265510215 words, keeping 2625334 word types
INFO : PROGRESS: at sentence #22280000, processed 265629847 words, keeping 2626122 word types
INFO : PROGRESS: at sentence #22290000, processed 265749467 words, keeping 2626689 word types
INFO : PROGRESS: at sentence #22300000, processed 265869806 words, keeping 2627456 word types
INFO : PROGRESS: at sentence #22310000, processed 265988976 

INFO : PROGRESS: at sentence #23090000, processed 274935208 words, keeping 2692476 word types
INFO : PROGRESS: at sentence #23100000, processed 275057883 words, keeping 2693199 word types
INFO : PROGRESS: at sentence #23110000, processed 275182232 words, keeping 2693774 word types
INFO : PROGRESS: at sentence #23120000, processed 275307861 words, keeping 2694381 word types
INFO : PROGRESS: at sentence #23130000, processed 275434736 words, keeping 2694911 word types
INFO : PROGRESS: at sentence #23140000, processed 275562173 words, keeping 2695503 word types
INFO : PROGRESS: at sentence #23150000, processed 275683193 words, keeping 2696088 word types
INFO : PROGRESS: at sentence #23160000, processed 275807483 words, keeping 2696636 word types
INFO : PROGRESS: at sentence #23170000, processed 275929080 words, keeping 2697302 word types
INFO : PROGRESS: at sentence #23180000, processed 276049828 words, keeping 2697912 word types
INFO : PROGRESS: at sentence #23190000, processed 276172637 

INFO : PROGRESS: at sentence #23970000, processed 285657954 words, keeping 2749258 word types
INFO : PROGRESS: at sentence #23980000, processed 285776185 words, keeping 2749972 word types
INFO : PROGRESS: at sentence #23990000, processed 285888962 words, keeping 2750893 word types
INFO : PROGRESS: at sentence #24000000, processed 286005965 words, keeping 2751792 word types
INFO : PROGRESS: at sentence #24010000, processed 286121165 words, keeping 2752757 word types
INFO : PROGRESS: at sentence #24020000, processed 286232519 words, keeping 2753519 word types
INFO : PROGRESS: at sentence #24030000, processed 286350053 words, keeping 2754355 word types
INFO : PROGRESS: at sentence #24040000, processed 286466970 words, keeping 2755154 word types
INFO : PROGRESS: at sentence #24050000, processed 286583560 words, keeping 2756109 word types
INFO : PROGRESS: at sentence #24060000, processed 286695785 words, keeping 2756888 word types
INFO : PROGRESS: at sentence #24070000, processed 286807974 

INFO : PROGRESS: at sentence #24850000, processed 295892363 words, keeping 2815082 word types
INFO : PROGRESS: at sentence #24860000, processed 296011658 words, keeping 2815694 word types
INFO : PROGRESS: at sentence #24870000, processed 296131437 words, keeping 2816262 word types
INFO : PROGRESS: at sentence #24880000, processed 296251029 words, keeping 2817033 word types
INFO : PROGRESS: at sentence #24890000, processed 296369987 words, keeping 2817886 word types
INFO : PROGRESS: at sentence #24900000, processed 296489865 words, keeping 2818602 word types
INFO : PROGRESS: at sentence #24910000, processed 296608140 words, keeping 2819129 word types
INFO : PROGRESS: at sentence #24920000, processed 296724837 words, keeping 2819942 word types
INFO : PROGRESS: at sentence #24930000, processed 296846439 words, keeping 2820576 word types
INFO : PROGRESS: at sentence #24940000, processed 296963391 words, keeping 2821307 word types
INFO : PROGRESS: at sentence #24950000, processed 297081667 

INFO : PROGRESS: at sentence #25720000, processed 306622358 words, keeping 2866256 word types
INFO : PROGRESS: at sentence #25730000, processed 306737581 words, keeping 2867118 word types
INFO : PROGRESS: at sentence #25740000, processed 306848264 words, keeping 2868036 word types
INFO : PROGRESS: at sentence #25750000, processed 306958209 words, keeping 2869049 word types
INFO : PROGRESS: at sentence #25760000, processed 307074623 words, keeping 2869960 word types
INFO : PROGRESS: at sentence #25770000, processed 307189566 words, keeping 2870960 word types
INFO : PROGRESS: at sentence #25780000, processed 307302126 words, keeping 2871906 word types
INFO : PROGRESS: at sentence #25790000, processed 307417000 words, keeping 2872789 word types
INFO : PROGRESS: at sentence #25800000, processed 307524953 words, keeping 2873958 word types
INFO : PROGRESS: at sentence #25810000, processed 307626320 words, keeping 2875577 word types
INFO : PROGRESS: at sentence #25820000, processed 307741568 

INFO : PROGRESS: at sentence #26600000, processed 316648516 words, keeping 2943412 word types
INFO : PROGRESS: at sentence #26610000, processed 316766174 words, keeping 2944139 word types
INFO : PROGRESS: at sentence #26620000, processed 316881953 words, keeping 2944767 word types
INFO : PROGRESS: at sentence #26630000, processed 316997137 words, keeping 2945459 word types
INFO : PROGRESS: at sentence #26640000, processed 317116247 words, keeping 2946091 word types
INFO : PROGRESS: at sentence #26650000, processed 317233297 words, keeping 2946746 word types
INFO : PROGRESS: at sentence #26660000, processed 317350821 words, keeping 2947408 word types
INFO : PROGRESS: at sentence #26670000, processed 317467243 words, keeping 2948088 word types
INFO : PROGRESS: at sentence #26680000, processed 317584724 words, keeping 2948671 word types
INFO : PROGRESS: at sentence #26690000, processed 317703705 words, keeping 2949211 word types
INFO : PROGRESS: at sentence #26700000, processed 317825897 

INFO : PROGRESS: at sentence #27480000, processed 326731849 words, keeping 3016018 word types
INFO : PROGRESS: at sentence #27490000, processed 326842337 words, keeping 3017024 word types
INFO : PROGRESS: at sentence #27500000, processed 326954517 words, keeping 3017997 word types
INFO : PROGRESS: at sentence #27510000, processed 327067913 words, keeping 3018731 word types
INFO : PROGRESS: at sentence #27520000, processed 327180613 words, keeping 3019683 word types
INFO : PROGRESS: at sentence #27530000, processed 327298825 words, keeping 3020510 word types
INFO : PROGRESS: at sentence #27540000, processed 327413193 words, keeping 3021389 word types
INFO : PROGRESS: at sentence #27550000, processed 327528785 words, keeping 3022164 word types
INFO : PROGRESS: at sentence #27560000, processed 327646155 words, keeping 3022802 word types
INFO : PROGRESS: at sentence #27570000, processed 327760714 words, keeping 3023609 word types
INFO : PROGRESS: at sentence #27580000, processed 327874407 

INFO : PROGRESS: at sentence #28360000, processed 336863141 words, keeping 3079218 word types
INFO : PROGRESS: at sentence #28370000, processed 336982821 words, keeping 3079759 word types
INFO : PROGRESS: at sentence #28380000, processed 337095319 words, keeping 3080521 word types
INFO : PROGRESS: at sentence #28390000, processed 337211512 words, keeping 3081091 word types
INFO : PROGRESS: at sentence #28400000, processed 337331291 words, keeping 3081769 word types
INFO : PROGRESS: at sentence #28410000, processed 337450127 words, keeping 3082433 word types
INFO : PROGRESS: at sentence #28420000, processed 337567972 words, keeping 3083047 word types
INFO : PROGRESS: at sentence #28430000, processed 337686438 words, keeping 3083634 word types
INFO : PROGRESS: at sentence #28440000, processed 337803326 words, keeping 3084526 word types
INFO : PROGRESS: at sentence #28450000, processed 337920026 words, keeping 3085236 word types
INFO : PROGRESS: at sentence #28460000, processed 338036731 

INFO : PROGRESS: at sentence #29240000, processed 346960215 words, keeping 3146068 word types
INFO : PROGRESS: at sentence #29250000, processed 347076315 words, keeping 3146957 word types
INFO : PROGRESS: at sentence #29260000, processed 347191157 words, keeping 3147857 word types
INFO : PROGRESS: at sentence #29270000, processed 347305367 words, keeping 3148733 word types
INFO : PROGRESS: at sentence #29280000, processed 347418132 words, keeping 3149527 word types
INFO : PROGRESS: at sentence #29290000, processed 347533333 words, keeping 3150424 word types
INFO : PROGRESS: at sentence #29300000, processed 347649195 words, keeping 3151317 word types
INFO : PROGRESS: at sentence #29310000, processed 347759200 words, keeping 3152359 word types
INFO : PROGRESS: at sentence #29320000, processed 347876651 words, keeping 3153216 word types
INFO : PROGRESS: at sentence #29330000, processed 347992991 words, keeping 3154097 word types
INFO : PROGRESS: at sentence #29340000, processed 348109469 

INFO : PROGRESS: at sentence #30120000, processed 357082990 words, keeping 3217008 word types
INFO : PROGRESS: at sentence #30130000, processed 357198546 words, keeping 3217743 word types
INFO : PROGRESS: at sentence #30140000, processed 357312999 words, keeping 3218509 word types
INFO : PROGRESS: at sentence #30150000, processed 357425238 words, keeping 3219365 word types
INFO : PROGRESS: at sentence #30160000, processed 357537999 words, keeping 3220143 word types
INFO : PROGRESS: at sentence #30170000, processed 357653598 words, keeping 3221081 word types
INFO : PROGRESS: at sentence #30180000, processed 357767328 words, keeping 3221857 word types
INFO : PROGRESS: at sentence #30190000, processed 357880330 words, keeping 3222762 word types
INFO : PROGRESS: at sentence #30200000, processed 357994939 words, keeping 3223608 word types
INFO : PROGRESS: at sentence #30210000, processed 358109877 words, keeping 3224228 word types
INFO : PROGRESS: at sentence #30220000, processed 358224904 

INFO : PROGRESS: at sentence #31000000, processed 367157178 words, keeping 3284378 word types
INFO : PROGRESS: at sentence #31010000, processed 367271503 words, keeping 3285019 word types
INFO : PROGRESS: at sentence #31020000, processed 367383916 words, keeping 3285778 word types
INFO : PROGRESS: at sentence #31030000, processed 367500248 words, keeping 3286529 word types
INFO : PROGRESS: at sentence #31040000, processed 367616281 words, keeping 3287333 word types
INFO : PROGRESS: at sentence #31050000, processed 367732250 words, keeping 3288040 word types
INFO : PROGRESS: at sentence #31060000, processed 367845275 words, keeping 3288771 word types
INFO : PROGRESS: at sentence #31070000, processed 367960020 words, keeping 3289273 word types
INFO : PROGRESS: at sentence #31080000, processed 368075070 words, keeping 3290107 word types
INFO : PROGRESS: at sentence #31090000, processed 368196334 words, keeping 3290651 word types
INFO : PROGRESS: at sentence #31100000, processed 368307023 

INFO : PROGRESS: at sentence #31880000, processed 377279588 words, keeping 3347917 word types
INFO : PROGRESS: at sentence #31890000, processed 377392107 words, keeping 3348598 word types
INFO : PROGRESS: at sentence #31900000, processed 377502224 words, keeping 3349313 word types
INFO : PROGRESS: at sentence #31910000, processed 377615520 words, keeping 3349985 word types
INFO : PROGRESS: at sentence #31920000, processed 377726478 words, keeping 3350724 word types
INFO : PROGRESS: at sentence #31930000, processed 377841988 words, keeping 3351326 word types
INFO : PROGRESS: at sentence #31940000, processed 377954846 words, keeping 3352026 word types
INFO : PROGRESS: at sentence #31950000, processed 378067690 words, keeping 3352823 word types
INFO : PROGRESS: at sentence #31960000, processed 378182533 words, keeping 3353488 word types
INFO : PROGRESS: at sentence #31970000, processed 378296618 words, keeping 3354262 word types
INFO : PROGRESS: at sentence #31980000, processed 378409937 

INFO : PROGRESS: at sentence #32760000, processed 387319653 words, keeping 3413686 word types
INFO : PROGRESS: at sentence #32770000, processed 387437802 words, keeping 3414350 word types
INFO : PROGRESS: at sentence #32780000, processed 387554670 words, keeping 3415109 word types
INFO : PROGRESS: at sentence #32790000, processed 387668403 words, keeping 3415780 word types
INFO : PROGRESS: at sentence #32800000, processed 387783833 words, keeping 3416392 word types
INFO : PROGRESS: at sentence #32810000, processed 387902834 words, keeping 3417045 word types
INFO : PROGRESS: at sentence #32820000, processed 388019374 words, keeping 3417878 word types
INFO : PROGRESS: at sentence #32830000, processed 388136154 words, keeping 3418494 word types
INFO : PROGRESS: at sentence #32840000, processed 388254292 words, keeping 3419236 word types
INFO : PROGRESS: at sentence #32850000, processed 388369831 words, keeping 3419959 word types
INFO : PROGRESS: at sentence #32860000, processed 388487108 

INFO : PROGRESS: at sentence #33630000, processed 397423470 words, keeping 3472159 word types
INFO : PROGRESS: at sentence #33640000, processed 397542621 words, keeping 3472817 word types
INFO : PROGRESS: at sentence #33650000, processed 397658158 words, keeping 3473445 word types
INFO : PROGRESS: at sentence #33660000, processed 397773581 words, keeping 3474097 word types
INFO : PROGRESS: at sentence #33670000, processed 397889476 words, keeping 3474648 word types
INFO : PROGRESS: at sentence #33680000, processed 398007083 words, keeping 3475197 word types
INFO : PROGRESS: at sentence #33690000, processed 398122252 words, keeping 3475851 word types
INFO : PROGRESS: at sentence #33700000, processed 398239521 words, keeping 3476560 word types
INFO : PROGRESS: at sentence #33710000, processed 398354987 words, keeping 3477172 word types
INFO : PROGRESS: at sentence #33720000, processed 398471170 words, keeping 3477695 word types
INFO : PROGRESS: at sentence #33730000, processed 398588332 

INFO : PROGRESS: at sentence #34510000, processed 407515825 words, keeping 3537290 word types
INFO : PROGRESS: at sentence #34520000, processed 407631907 words, keeping 3538110 word types
INFO : PROGRESS: at sentence #34530000, processed 407744230 words, keeping 3538872 word types
INFO : PROGRESS: at sentence #34540000, processed 407856008 words, keeping 3539584 word types
INFO : PROGRESS: at sentence #34550000, processed 407970123 words, keeping 3540394 word types
INFO : PROGRESS: at sentence #34560000, processed 408079484 words, keeping 3541108 word types
INFO : PROGRESS: at sentence #34570000, processed 408192526 words, keeping 3541860 word types
INFO : PROGRESS: at sentence #34580000, processed 408308013 words, keeping 3542714 word types
INFO : PROGRESS: at sentence #34590000, processed 408421025 words, keeping 3543602 word types
INFO : PROGRESS: at sentence #34600000, processed 408533296 words, keeping 3544358 word types
INFO : PROGRESS: at sentence #34610000, processed 408642112 

INFO : PROGRESS: at sentence #35390000, processed 417541871 words, keeping 3604186 word types
INFO : PROGRESS: at sentence #35400000, processed 417659330 words, keeping 3604999 word types
INFO : PROGRESS: at sentence #35410000, processed 417777811 words, keeping 3605597 word types
INFO : PROGRESS: at sentence #35420000, processed 417893729 words, keeping 3606185 word types
INFO : PROGRESS: at sentence #35430000, processed 418010459 words, keeping 3606823 word types
INFO : PROGRESS: at sentence #35440000, processed 418124442 words, keeping 3607480 word types
INFO : PROGRESS: at sentence #35450000, processed 418241650 words, keeping 3608240 word types
INFO : PROGRESS: at sentence #35460000, processed 418360044 words, keeping 3608818 word types
INFO : PROGRESS: at sentence #35470000, processed 418472876 words, keeping 3609652 word types
INFO : PROGRESS: at sentence #35480000, processed 418589534 words, keeping 3610492 word types
INFO : PROGRESS: at sentence #35490000, processed 418704526 

INFO : PROGRESS: at sentence #36270000, processed 427769518 words, keeping 3659141 word types
INFO : PROGRESS: at sentence #36280000, processed 427886184 words, keeping 3659715 word types
INFO : PROGRESS: at sentence #36290000, processed 427997411 words, keeping 3660567 word types
INFO : PROGRESS: at sentence #36300000, processed 428113714 words, keeping 3661077 word types
INFO : PROGRESS: at sentence #36310000, processed 428231384 words, keeping 3661609 word types
INFO : PROGRESS: at sentence #36320000, processed 428350129 words, keeping 3662171 word types
INFO : PROGRESS: at sentence #36330000, processed 428468297 words, keeping 3662834 word types
INFO : PROGRESS: at sentence #36340000, processed 428580635 words, keeping 3663447 word types
INFO : PROGRESS: at sentence #36350000, processed 428694214 words, keeping 3664096 word types
INFO : PROGRESS: at sentence #36360000, processed 428814109 words, keeping 3664587 word types
INFO : PROGRESS: at sentence #36370000, processed 428929659 

INFO : PROGRESS: at sentence #37150000, processed 437751067 words, keeping 3723433 word types
INFO : PROGRESS: at sentence #37160000, processed 437867078 words, keeping 3724067 word types
INFO : PROGRESS: at sentence #37170000, processed 437980028 words, keeping 3724698 word types
INFO : PROGRESS: at sentence #37180000, processed 438096121 words, keeping 3725348 word types
INFO : PROGRESS: at sentence #37190000, processed 438206456 words, keeping 3725872 word types
INFO : PROGRESS: at sentence #37200000, processed 438321261 words, keeping 3726499 word types
INFO : PROGRESS: at sentence #37210000, processed 438439334 words, keeping 3727078 word types
INFO : PROGRESS: at sentence #37220000, processed 438557014 words, keeping 3727551 word types
INFO : PROGRESS: at sentence #37230000, processed 438675254 words, keeping 3728192 word types
INFO : PROGRESS: at sentence #37240000, processed 438793964 words, keeping 3728874 word types
INFO : PROGRESS: at sentence #37250000, processed 438909435 

INFO : PROGRESS: at sentence #38030000, processed 447921045 words, keeping 3779501 word types
INFO : PROGRESS: at sentence #38040000, processed 448038327 words, keeping 3780119 word types
INFO : PROGRESS: at sentence #38050000, processed 448153832 words, keeping 3780779 word types
INFO : PROGRESS: at sentence #38060000, processed 448265475 words, keeping 3781460 word types
INFO : PROGRESS: at sentence #38070000, processed 448380983 words, keeping 3782063 word types
INFO : PROGRESS: at sentence #38080000, processed 448493652 words, keeping 3782836 word types
INFO : PROGRESS: at sentence #38090000, processed 448611700 words, keeping 3783590 word types
INFO : PROGRESS: at sentence #38100000, processed 448726324 words, keeping 3784279 word types
INFO : PROGRESS: at sentence #38110000, processed 448839722 words, keeping 3784956 word types
INFO : PROGRESS: at sentence #38120000, processed 448952470 words, keeping 3785737 word types
INFO : PROGRESS: at sentence #38130000, processed 449064336 

INFO : PROGRESS: at sentence #38910000, processed 458053197 words, keeping 3835374 word types
INFO : PROGRESS: at sentence #38920000, processed 458165200 words, keeping 3836021 word types
INFO : PROGRESS: at sentence #38930000, processed 458282532 words, keeping 3836527 word types
INFO : PROGRESS: at sentence #38940000, processed 458397970 words, keeping 3837038 word types
INFO : PROGRESS: at sentence #38950000, processed 458514716 words, keeping 3837553 word types
INFO : PROGRESS: at sentence #38960000, processed 458637484 words, keeping 3838053 word types
INFO : PROGRESS: at sentence #38970000, processed 458753864 words, keeping 3838525 word types
INFO : PROGRESS: at sentence #38980000, processed 458871013 words, keeping 3839062 word types
INFO : PROGRESS: at sentence #38990000, processed 458983074 words, keeping 3839762 word types
INFO : PROGRESS: at sentence #39000000, processed 459095474 words, keeping 3840286 word types
INFO : PROGRESS: at sentence #39010000, processed 459212319 

INFO : PROGRESS: at sentence #39790000, processed 468199671 words, keeping 3889343 word types
INFO : PROGRESS: at sentence #39800000, processed 468308211 words, keeping 3890015 word types
INFO : PROGRESS: at sentence #39810000, processed 468423846 words, keeping 3890670 word types
INFO : PROGRESS: at sentence #39820000, processed 468535700 words, keeping 3891428 word types
INFO : PROGRESS: at sentence #39830000, processed 468646227 words, keeping 3892117 word types
INFO : PROGRESS: at sentence #39840000, processed 468761545 words, keeping 3892633 word types
INFO : PROGRESS: at sentence #39850000, processed 468876549 words, keeping 3893384 word types
INFO : PROGRESS: at sentence #39860000, processed 468990127 words, keeping 3894161 word types
INFO : PROGRESS: at sentence #39870000, processed 469107236 words, keeping 3894776 word types
INFO : PROGRESS: at sentence #39880000, processed 469218026 words, keeping 3895549 word types
INFO : PROGRESS: at sentence #39890000, processed 469335337 

INFO : PROGRESS: at sentence #40670000, processed 478201983 words, keeping 3950187 word types
INFO : PROGRESS: at sentence #40680000, processed 478312524 words, keeping 3950813 word types
INFO : PROGRESS: at sentence #40690000, processed 478424714 words, keeping 3951491 word types
INFO : PROGRESS: at sentence #40700000, processed 478543044 words, keeping 3952076 word types
INFO : PROGRESS: at sentence #40710000, processed 478655639 words, keeping 3952756 word types
INFO : PROGRESS: at sentence #40720000, processed 478767550 words, keeping 3953494 word types
INFO : PROGRESS: at sentence #40730000, processed 478882972 words, keeping 3954198 word types
INFO : PROGRESS: at sentence #40740000, processed 478998206 words, keeping 3954820 word types
INFO : PROGRESS: at sentence #40750000, processed 479112862 words, keeping 3955449 word types
INFO : PROGRESS: at sentence #40760000, processed 479223019 words, keeping 3956215 word types
INFO : PROGRESS: at sentence #40770000, processed 479334074 

INFO : PROGRESS: at sentence #41550000, processed 488365086 words, keeping 4006522 word types
INFO : PROGRESS: at sentence #41560000, processed 488483429 words, keeping 4007171 word types
INFO : PROGRESS: at sentence #41570000, processed 488598675 words, keeping 4007790 word types
INFO : PROGRESS: at sentence #41580000, processed 488712899 words, keeping 4008408 word types
INFO : PROGRESS: at sentence #41590000, processed 488829093 words, keeping 4009026 word types
INFO : PROGRESS: at sentence #41600000, processed 488943151 words, keeping 4009630 word types
INFO : PROGRESS: at sentence #41610000, processed 489061465 words, keeping 4010203 word types
INFO : PROGRESS: at sentence #41620000, processed 489176662 words, keeping 4010807 word types
INFO : PROGRESS: at sentence #41630000, processed 489294781 words, keeping 4011427 word types
INFO : PROGRESS: at sentence #41640000, processed 489411061 words, keeping 4012027 word types
INFO : PROGRESS: at sentence #41650000, processed 489526726 

INFO : PROGRESS: at sentence #42430000, processed 498619882 words, keeping 4059990 word types
INFO : PROGRESS: at sentence #42440000, processed 498736514 words, keeping 4060570 word types
INFO : PROGRESS: at sentence #42450000, processed 498855002 words, keeping 4061116 word types
INFO : PROGRESS: at sentence #42460000, processed 498972670 words, keeping 4061636 word types
INFO : PROGRESS: at sentence #42470000, processed 499089303 words, keeping 4062192 word types
INFO : PROGRESS: at sentence #42480000, processed 499210449 words, keeping 4062686 word types
INFO : PROGRESS: at sentence #42490000, processed 499331438 words, keeping 4063241 word types
INFO : PROGRESS: at sentence #42500000, processed 499455373 words, keeping 4063711 word types
INFO : PROGRESS: at sentence #42510000, processed 499574949 words, keeping 4064243 word types
INFO : PROGRESS: at sentence #42520000, processed 499696745 words, keeping 4064829 word types
INFO : PROGRESS: at sentence #42530000, processed 499818276 

INFO : PROGRESS: at sentence #43310000, processed 508978043 words, keeping 4110555 word types
INFO : PROGRESS: at sentence #43320000, processed 509092921 words, keeping 4111286 word types
INFO : PROGRESS: at sentence #43330000, processed 509208824 words, keeping 4111839 word types
INFO : PROGRESS: at sentence #43340000, processed 509324439 words, keeping 4112523 word types
INFO : PROGRESS: at sentence #43350000, processed 509440043 words, keeping 4113244 word types
INFO : PROGRESS: at sentence #43360000, processed 509565369 words, keeping 4113874 word types
INFO : PROGRESS: at sentence #43370000, processed 509680229 words, keeping 4114416 word types
INFO : PROGRESS: at sentence #43380000, processed 509796580 words, keeping 4115123 word types
INFO : PROGRESS: at sentence #43390000, processed 509913382 words, keeping 4115775 word types
INFO : PROGRESS: at sentence #43400000, processed 510028245 words, keeping 4116449 word types
INFO : PROGRESS: at sentence #43410000, processed 510147520 

INFO : PROGRESS: at sentence #44190000, processed 517508935 words, keeping 4133501 word types
INFO : PROGRESS: at sentence #44200000, processed 517601568 words, keeping 4133679 word types
INFO : PROGRESS: at sentence #44210000, processed 517693002 words, keeping 4133829 word types
INFO : PROGRESS: at sentence #44220000, processed 517797319 words, keeping 4134149 word types
INFO : PROGRESS: at sentence #44230000, processed 517894690 words, keeping 4134450 word types
INFO : PROGRESS: at sentence #44240000, processed 517993371 words, keeping 4134599 word types
INFO : PROGRESS: at sentence #44250000, processed 518099864 words, keeping 4134954 word types
INFO : PROGRESS: at sentence #44260000, processed 518204711 words, keeping 4135322 word types
INFO : PROGRESS: at sentence #44270000, processed 518321728 words, keeping 4135997 word types
INFO : PROGRESS: at sentence #44280000, processed 518437374 words, keeping 4136509 word types
INFO : PROGRESS: at sentence #44290000, processed 518550782 

INFO : PROGRESS: at sentence #45070000, processed 527577863 words, keeping 4183102 word types
INFO : PROGRESS: at sentence #45080000, processed 527694538 words, keeping 4183637 word types
INFO : PROGRESS: at sentence #45090000, processed 527810677 words, keeping 4184243 word types
INFO : PROGRESS: at sentence #45100000, processed 527925489 words, keeping 4184764 word types
INFO : PROGRESS: at sentence #45110000, processed 528043434 words, keeping 4185131 word types
INFO : PROGRESS: at sentence #45120000, processed 528164535 words, keeping 4185646 word types
INFO : PROGRESS: at sentence #45130000, processed 528280044 words, keeping 4186078 word types
INFO : PROGRESS: at sentence #45140000, processed 528397367 words, keeping 4186597 word types
INFO : PROGRESS: at sentence #45150000, processed 528515378 words, keeping 4187140 word types
INFO : PROGRESS: at sentence #45160000, processed 528630322 words, keeping 4187578 word types
INFO : PROGRESS: at sentence #45170000, processed 528746342 

INFO : PROGRESS: at sentence #45950000, processed 538236898 words, keeping 4224457 word types
INFO : PROGRESS: at sentence #45960000, processed 538363819 words, keeping 4224814 word types
INFO : PROGRESS: at sentence #45970000, processed 538487972 words, keeping 4225241 word types
INFO : PROGRESS: at sentence #45980000, processed 538612455 words, keeping 4225621 word types
INFO : PROGRESS: at sentence #45990000, processed 538746062 words, keeping 4226162 word types
INFO : PROGRESS: at sentence #46000000, processed 538869265 words, keeping 4226502 word types
INFO : PROGRESS: at sentence #46010000, processed 538993686 words, keeping 4226904 word types
INFO : PROGRESS: at sentence #46020000, processed 539122247 words, keeping 4227304 word types
INFO : PROGRESS: at sentence #46030000, processed 539248871 words, keeping 4227694 word types
INFO : PROGRESS: at sentence #46040000, processed 539372545 words, keeping 4228170 word types
INFO : PROGRESS: at sentence #46050000, processed 539498750 

INFO : PROGRESS: at sentence #46830000, processed 548888856 words, keeping 4266696 word types
INFO : PROGRESS: at sentence #46840000, processed 549008017 words, keeping 4267097 word types
INFO : PROGRESS: at sentence #46850000, processed 549130282 words, keeping 4267465 word types
INFO : PROGRESS: at sentence #46860000, processed 549250071 words, keeping 4267953 word types
INFO : PROGRESS: at sentence #46870000, processed 549370623 words, keeping 4268423 word types
INFO : PROGRESS: at sentence #46880000, processed 549492491 words, keeping 4268897 word types
INFO : PROGRESS: at sentence #46890000, processed 549614927 words, keeping 4269331 word types
INFO : PROGRESS: at sentence #46900000, processed 549738285 words, keeping 4269733 word types
INFO : PROGRESS: at sentence #46910000, processed 549862636 words, keeping 4270209 word types
INFO : PROGRESS: at sentence #46920000, processed 549992040 words, keeping 4270561 word types
INFO : PROGRESS: at sentence #46930000, processed 550114933 

INFO : PROGRESS: at sentence #47710000, processed 559515341 words, keeping 4310868 word types
INFO : PROGRESS: at sentence #47720000, processed 559629701 words, keeping 4311490 word types
INFO : PROGRESS: at sentence #47730000, processed 559747890 words, keeping 4312108 word types
INFO : PROGRESS: at sentence #47740000, processed 559859098 words, keeping 4312907 word types
INFO : PROGRESS: at sentence #47750000, processed 559969706 words, keeping 4313555 word types
INFO : PROGRESS: at sentence #47760000, processed 560081572 words, keeping 4314238 word types
INFO : PROGRESS: at sentence #47770000, processed 560192068 words, keeping 4314810 word types
INFO : PROGRESS: at sentence #47780000, processed 560305378 words, keeping 4315375 word types
INFO : PROGRESS: at sentence #47790000, processed 560418113 words, keeping 4316081 word types
INFO : PROGRESS: at sentence #47800000, processed 560530300 words, keeping 4316701 word types
INFO : PROGRESS: at sentence #47810000, processed 560648519 

INFO : PROGRESS: at sentence #48590000, processed 569882215 words, keeping 4358883 word types
INFO : PROGRESS: at sentence #48600000, processed 570001289 words, keeping 4359346 word types
INFO : PROGRESS: at sentence #48610000, processed 570121333 words, keeping 4359797 word types
INFO : PROGRESS: at sentence #48620000, processed 570241096 words, keeping 4360226 word types
INFO : PROGRESS: at sentence #48630000, processed 570361849 words, keeping 4360820 word types
INFO : PROGRESS: at sentence #48640000, processed 570483428 words, keeping 4361284 word types
INFO : PROGRESS: at sentence #48650000, processed 570604441 words, keeping 4361782 word types
INFO : PROGRESS: at sentence #48660000, processed 570725657 words, keeping 4362260 word types
INFO : PROGRESS: at sentence #48670000, processed 570847436 words, keeping 4362812 word types
INFO : PROGRESS: at sentence #48680000, processed 570968239 words, keeping 4363298 word types
INFO : PROGRESS: at sentence #48690000, processed 571090271 

INFO : PROGRESS: at sentence #49460000, processed 580000042 words, keeping 4406958 word types
INFO : PROGRESS: at sentence #49470000, processed 580118041 words, keeping 4407484 word types
INFO : PROGRESS: at sentence #49480000, processed 580238197 words, keeping 4408074 word types
INFO : PROGRESS: at sentence #49490000, processed 580356675 words, keeping 4408598 word types
INFO : PROGRESS: at sentence #49500000, processed 580469642 words, keeping 4409112 word types
INFO : PROGRESS: at sentence #49510000, processed 580588197 words, keeping 4409595 word types
INFO : PROGRESS: at sentence #49520000, processed 580703838 words, keeping 4410163 word types
INFO : PROGRESS: at sentence #49530000, processed 580819701 words, keeping 4410677 word types
INFO : PROGRESS: at sentence #49540000, processed 580936986 words, keeping 4411227 word types
INFO : PROGRESS: at sentence #49550000, processed 581054808 words, keeping 4411758 word types
INFO : PROGRESS: at sentence #49560000, processed 581178352 

INFO : PROGRESS: at sentence #50340000, processed 590357502 words, keeping 4451419 word types
INFO : PROGRESS: at sentence #50350000, processed 590477179 words, keeping 4452106 word types
INFO : PROGRESS: at sentence #50360000, processed 590597561 words, keeping 4452576 word types
INFO : PROGRESS: at sentence #50370000, processed 590714795 words, keeping 4453074 word types
INFO : PROGRESS: at sentence #50380000, processed 590831296 words, keeping 4453595 word types
INFO : PROGRESS: at sentence #50390000, processed 590949751 words, keeping 4454112 word types
INFO : PROGRESS: at sentence #50400000, processed 591066779 words, keeping 4454633 word types
INFO : PROGRESS: at sentence #50410000, processed 591183589 words, keeping 4455075 word types
INFO : PROGRESS: at sentence #50420000, processed 591300711 words, keeping 4455513 word types
INFO : PROGRESS: at sentence #50430000, processed 591422722 words, keeping 4455989 word types
INFO : PROGRESS: at sentence #50440000, processed 591542001 

INFO : PROGRESS: at sentence #51220000, processed 600565952 words, keeping 4501238 word types
INFO : PROGRESS: at sentence #51230000, processed 600683328 words, keeping 4501780 word types
INFO : PROGRESS: at sentence #51240000, processed 600797728 words, keeping 4502573 word types
INFO : PROGRESS: at sentence #51250000, processed 600913183 words, keeping 4503226 word types
INFO : PROGRESS: at sentence #51260000, processed 601029065 words, keeping 4503831 word types
INFO : PROGRESS: at sentence #51270000, processed 601144163 words, keeping 4504396 word types
INFO : PROGRESS: at sentence #51280000, processed 601257239 words, keeping 4505109 word types
INFO : PROGRESS: at sentence #51290000, processed 601373591 words, keeping 4505666 word types
INFO : PROGRESS: at sentence #51300000, processed 601487553 words, keeping 4506097 word types
INFO : PROGRESS: at sentence #51310000, processed 601605054 words, keeping 4506650 word types
INFO : PROGRESS: at sentence #51320000, processed 601720262 

INFO : PROGRESS: at sentence #52100000, processed 610826641 words, keeping 4548289 word types
INFO : PROGRESS: at sentence #52110000, processed 610954135 words, keeping 4548721 word types
INFO : PROGRESS: at sentence #52120000, processed 611080471 words, keeping 4549049 word types
INFO : PROGRESS: at sentence #52130000, processed 611208037 words, keeping 4549408 word types
INFO : PROGRESS: at sentence #52140000, processed 611326595 words, keeping 4549948 word types
INFO : PROGRESS: at sentence #52150000, processed 611447023 words, keeping 4550309 word types
INFO : PROGRESS: at sentence #52160000, processed 611571698 words, keeping 4550684 word types
INFO : PROGRESS: at sentence #52170000, processed 611692888 words, keeping 4551136 word types
INFO : PROGRESS: at sentence #52180000, processed 611815990 words, keeping 4551556 word types
INFO : PROGRESS: at sentence #52190000, processed 611935375 words, keeping 4551950 word types
INFO : PROGRESS: at sentence #52200000, processed 612058871 

INFO : PROGRESS: at sentence #52980000, processed 621374197 words, keeping 4589388 word types
INFO : PROGRESS: at sentence #52990000, processed 621495666 words, keeping 4589967 word types
INFO : PROGRESS: at sentence #53000000, processed 621611720 words, keeping 4590489 word types
INFO : PROGRESS: at sentence #53010000, processed 621727404 words, keeping 4591033 word types
INFO : PROGRESS: at sentence #53020000, processed 621844053 words, keeping 4591645 word types
INFO : PROGRESS: at sentence #53030000, processed 621958500 words, keeping 4592281 word types
INFO : PROGRESS: at sentence #53040000, processed 622072487 words, keeping 4592798 word types
INFO : PROGRESS: at sentence #53050000, processed 622193023 words, keeping 4593342 word types
INFO : PROGRESS: at sentence #53060000, processed 622310808 words, keeping 4593845 word types
INFO : PROGRESS: at sentence #53070000, processed 622427452 words, keeping 4594398 word types
INFO : PROGRESS: at sentence #53080000, processed 622542905 

INFO : PROGRESS: at sentence #53860000, processed 631543364 words, keeping 4637559 word types
INFO : PROGRESS: at sentence #53870000, processed 631664769 words, keeping 4638030 word types
INFO : PROGRESS: at sentence #53880000, processed 631784633 words, keeping 4638515 word types
INFO : PROGRESS: at sentence #53890000, processed 631907224 words, keeping 4639092 word types
INFO : PROGRESS: at sentence #53900000, processed 632028913 words, keeping 4639599 word types
INFO : PROGRESS: at sentence #53910000, processed 632150202 words, keeping 4640031 word types
INFO : PROGRESS: at sentence #53920000, processed 632269325 words, keeping 4640503 word types
INFO : PROGRESS: at sentence #53930000, processed 632387883 words, keeping 4640950 word types
INFO : PROGRESS: at sentence #53940000, processed 632507499 words, keeping 4641402 word types
INFO : PROGRESS: at sentence #53950000, processed 632629904 words, keeping 4641838 word types
INFO : PROGRESS: at sentence #53960000, processed 632752855 

INFO : PROGRESS: at sentence #54740000, processed 642299420 words, keeping 4675880 word types
INFO : PROGRESS: at sentence #54750000, processed 642421338 words, keeping 4676337 word types
INFO : PROGRESS: at sentence #54760000, processed 642545597 words, keeping 4676794 word types
INFO : PROGRESS: at sentence #54770000, processed 642666797 words, keeping 4677137 word types
INFO : PROGRESS: at sentence #54780000, processed 642789078 words, keeping 4677462 word types
INFO : PROGRESS: at sentence #54790000, processed 642912124 words, keeping 4677860 word types
INFO : PROGRESS: at sentence #54800000, processed 643037318 words, keeping 4678285 word types
INFO : PROGRESS: at sentence #54810000, processed 643160602 words, keeping 4678714 word types
INFO : PROGRESS: at sentence #54820000, processed 643284911 words, keeping 4679161 word types
INFO : PROGRESS: at sentence #54830000, processed 643404714 words, keeping 4679609 word types
INFO : PROGRESS: at sentence #54840000, processed 643518554 

INFO : PROGRESS: at sentence #55620000, processed 652708845 words, keeping 4720349 word types
INFO : PROGRESS: at sentence #55630000, processed 652822430 words, keeping 4720905 word types
INFO : PROGRESS: at sentence #55640000, processed 652938412 words, keeping 4721457 word types
INFO : PROGRESS: at sentence #55650000, processed 653051483 words, keeping 4721979 word types
INFO : PROGRESS: at sentence #55660000, processed 653169123 words, keeping 4722636 word types
INFO : PROGRESS: at sentence #55670000, processed 653277028 words, keeping 4723268 word types
INFO : PROGRESS: at sentence #55680000, processed 653394084 words, keeping 4723877 word types
INFO : PROGRESS: at sentence #55690000, processed 653506989 words, keeping 4724500 word types
INFO : PROGRESS: at sentence #55700000, processed 653618202 words, keeping 4725055 word types
INFO : PROGRESS: at sentence #55710000, processed 653733674 words, keeping 4725575 word types
INFO : PROGRESS: at sentence #55720000, processed 653844421 

INFO : PROGRESS: at sentence #56500000, processed 662025754 words, keeping 4745958 word types
INFO : PROGRESS: at sentence #56510000, processed 662111599 words, keeping 4746062 word types
INFO : PROGRESS: at sentence #56520000, processed 662199812 words, keeping 4746178 word types
INFO : PROGRESS: at sentence #56530000, processed 662311750 words, keeping 4746587 word types
INFO : PROGRESS: at sentence #56540000, processed 662430470 words, keeping 4747234 word types
INFO : PROGRESS: at sentence #56550000, processed 662551759 words, keeping 4747740 word types
INFO : PROGRESS: at sentence #56560000, processed 662672594 words, keeping 4748315 word types
INFO : PROGRESS: at sentence #56570000, processed 662792172 words, keeping 4748813 word types
INFO : PROGRESS: at sentence #56580000, processed 662913252 words, keeping 4749349 word types
INFO : PROGRESS: at sentence #56590000, processed 663032771 words, keeping 4749849 word types
INFO : PROGRESS: at sentence #56600000, processed 663146444 

INFO : PROGRESS: at sentence #57380000, processed 672353091 words, keeping 4790003 word types
INFO : PROGRESS: at sentence #57390000, processed 672470021 words, keeping 4790604 word types
INFO : PROGRESS: at sentence #57400000, processed 672589045 words, keeping 4791132 word types
INFO : PROGRESS: at sentence #57410000, processed 672705485 words, keeping 4791707 word types
INFO : PROGRESS: at sentence #57420000, processed 672820071 words, keeping 4792228 word types
INFO : PROGRESS: at sentence #57430000, processed 672934138 words, keeping 4792835 word types
INFO : PROGRESS: at sentence #57440000, processed 673047775 words, keeping 4793440 word types
INFO : PROGRESS: at sentence #57450000, processed 673158614 words, keeping 4794084 word types
INFO : PROGRESS: at sentence #57460000, processed 673269669 words, keeping 4794552 word types
INFO : PROGRESS: at sentence #57470000, processed 673381316 words, keeping 4795078 word types
INFO : PROGRESS: at sentence #57480000, processed 673499026 

INFO : PROGRESS: at sentence #58260000, processed 682653694 words, keeping 4835740 word types
INFO : PROGRESS: at sentence #58270000, processed 682775328 words, keeping 4836203 word types
INFO : PROGRESS: at sentence #58280000, processed 682895376 words, keeping 4836660 word types
INFO : PROGRESS: at sentence #58290000, processed 683015169 words, keeping 4837069 word types
INFO : PROGRESS: at sentence #58300000, processed 683130093 words, keeping 4837531 word types
INFO : PROGRESS: at sentence #58310000, processed 683246213 words, keeping 4837960 word types
INFO : PROGRESS: at sentence #58320000, processed 683363099 words, keeping 4838362 word types
INFO : PROGRESS: at sentence #58330000, processed 683484058 words, keeping 4838861 word types
INFO : PROGRESS: at sentence #58340000, processed 683605649 words, keeping 4839294 word types
INFO : PROGRESS: at sentence #58350000, processed 683724458 words, keeping 4839773 word types
INFO : PROGRESS: at sentence #58360000, processed 683841950 

INFO : PROGRESS: at sentence #59140000, processed 693554926 words, keeping 4869765 word types
INFO : PROGRESS: at sentence #59150000, processed 693675690 words, keeping 4870176 word types
INFO : PROGRESS: at sentence #59160000, processed 693797322 words, keeping 4870532 word types
INFO : PROGRESS: at sentence #59170000, processed 693917271 words, keeping 4870929 word types
INFO : PROGRESS: at sentence #59180000, processed 694034680 words, keeping 4871451 word types
INFO : PROGRESS: at sentence #59190000, processed 694152288 words, keeping 4871858 word types
INFO : PROGRESS: at sentence #59200000, processed 694273264 words, keeping 4872260 word types
INFO : PROGRESS: at sentence #59210000, processed 694392265 words, keeping 4872799 word types
INFO : PROGRESS: at sentence #59220000, processed 694511120 words, keeping 4873214 word types
INFO : PROGRESS: at sentence #59230000, processed 694629049 words, keeping 4873632 word types
INFO : PROGRESS: at sentence #59240000, processed 694749138 

INFO : PROGRESS: at sentence #60020000, processed 703969238 words, keeping 4912889 word types
INFO : PROGRESS: at sentence #60030000, processed 704084548 words, keeping 4913466 word types
INFO : PROGRESS: at sentence #60040000, processed 704203805 words, keeping 4913949 word types
INFO : PROGRESS: at sentence #60050000, processed 704323739 words, keeping 4914524 word types
INFO : PROGRESS: at sentence #60060000, processed 704436056 words, keeping 4915119 word types
INFO : PROGRESS: at sentence #60070000, processed 704556198 words, keeping 4915581 word types
INFO : PROGRESS: at sentence #60080000, processed 704669127 words, keeping 4916217 word types
INFO : PROGRESS: at sentence #60090000, processed 704783462 words, keeping 4916887 word types
INFO : PROGRESS: at sentence #60100000, processed 704900993 words, keeping 4917354 word types
INFO : PROGRESS: at sentence #60110000, processed 705025296 words, keeping 4917906 word types
INFO : PROGRESS: at sentence #60120000, processed 705141921 

INFO : PROGRESS: at sentence #60900000, processed 714300698 words, keeping 4948527 word types
INFO : PROGRESS: at sentence #60910000, processed 714420837 words, keeping 4948960 word types
INFO : PROGRESS: at sentence #60920000, processed 714547204 words, keeping 4949358 word types
INFO : PROGRESS: at sentence #60930000, processed 714669673 words, keeping 4950009 word types
INFO : PROGRESS: at sentence #60940000, processed 714797166 words, keeping 4950362 word types
INFO : PROGRESS: at sentence #60950000, processed 714920693 words, keeping 4950776 word types
INFO : PROGRESS: at sentence #60960000, processed 715028420 words, keeping 4951078 word types
INFO : PROGRESS: at sentence #60970000, processed 715145224 words, keeping 4951689 word types
INFO : PROGRESS: at sentence #60980000, processed 715268499 words, keeping 4952099 word types
INFO : PROGRESS: at sentence #60990000, processed 715382164 words, keeping 4952337 word types
INFO : PROGRESS: at sentence #61000000, processed 715505778 

INFO : PROGRESS: at sentence #61780000, processed 724563283 words, keeping 4992881 word types
INFO : PROGRESS: at sentence #61790000, processed 724687849 words, keeping 4993287 word types
INFO : PROGRESS: at sentence #61800000, processed 724807749 words, keeping 4993594 word types
INFO : PROGRESS: at sentence #61810000, processed 724929677 words, keeping 4994024 word types
INFO : PROGRESS: at sentence #61820000, processed 725054125 words, keeping 4994368 word types
INFO : PROGRESS: at sentence #61830000, processed 725175386 words, keeping 4994740 word types
INFO : PROGRESS: at sentence #61840000, processed 725295353 words, keeping 4995029 word types
INFO : PROGRESS: at sentence #61850000, processed 725416434 words, keeping 4995405 word types
INFO : PROGRESS: at sentence #61860000, processed 725543650 words, keeping 4995791 word types
INFO : PROGRESS: at sentence #61870000, processed 725665417 words, keeping 4996312 word types
INFO : PROGRESS: at sentence #61880000, processed 725787968 

INFO : PROGRESS: at sentence #62660000, processed 735082589 words, keeping 5032138 word types
INFO : PROGRESS: at sentence #62670000, processed 735202828 words, keeping 5032529 word types
INFO : PROGRESS: at sentence #62680000, processed 735321449 words, keeping 5033061 word types
INFO : PROGRESS: at sentence #62690000, processed 735438100 words, keeping 5033558 word types
INFO : PROGRESS: at sentence #62700000, processed 735552371 words, keeping 5034082 word types
INFO : PROGRESS: at sentence #62710000, processed 735666848 words, keeping 5034626 word types
INFO : PROGRESS: at sentence #62720000, processed 735786148 words, keeping 5035140 word types
INFO : PROGRESS: at sentence #62730000, processed 735902950 words, keeping 5035664 word types
INFO : PROGRESS: at sentence #62740000, processed 736021441 words, keeping 5036169 word types
INFO : PROGRESS: at sentence #62750000, processed 736139483 words, keeping 5036706 word types
INFO : PROGRESS: at sentence #62760000, processed 736255891 

INFO : PROGRESS: at sentence #63540000, processed 745709943 words, keeping 5069515 word types
INFO : PROGRESS: at sentence #63550000, processed 745834470 words, keeping 5069897 word types
INFO : PROGRESS: at sentence #63560000, processed 745958677 words, keeping 5070317 word types
INFO : PROGRESS: at sentence #63570000, processed 746083444 words, keeping 5070737 word types
INFO : PROGRESS: at sentence #63580000, processed 746203325 words, keeping 5071163 word types
INFO : PROGRESS: at sentence #63590000, processed 746332299 words, keeping 5071693 word types
INFO : PROGRESS: at sentence #63600000, processed 746453751 words, keeping 5072068 word types
INFO : PROGRESS: at sentence #63610000, processed 746576305 words, keeping 5072498 word types
INFO : PROGRESS: at sentence #63620000, processed 746697152 words, keeping 5072862 word types
INFO : PROGRESS: at sentence #63630000, processed 746818979 words, keeping 5073375 word types
INFO : PROGRESS: at sentence #63640000, processed 746939235 

INFO : PROGRESS: at sentence #64420000, processed 756361553 words, keeping 5106869 word types
INFO : PROGRESS: at sentence #64430000, processed 756475199 words, keeping 5107215 word types
INFO : PROGRESS: at sentence #64440000, processed 756593233 words, keeping 5107707 word types
INFO : PROGRESS: at sentence #64450000, processed 756710918 words, keeping 5108133 word types
INFO : PROGRESS: at sentence #64460000, processed 756826489 words, keeping 5108524 word types
INFO : PROGRESS: at sentence #64470000, processed 756945924 words, keeping 5108934 word types
INFO : PROGRESS: at sentence #64480000, processed 757062300 words, keeping 5109415 word types
INFO : PROGRESS: at sentence #64490000, processed 757178103 words, keeping 5109863 word types
INFO : PROGRESS: at sentence #64500000, processed 757296892 words, keeping 5110307 word types
INFO : PROGRESS: at sentence #64510000, processed 757415533 words, keeping 5110822 word types
INFO : PROGRESS: at sentence #64520000, processed 757533344 

INFO : PROGRESS: at sentence #65300000, processed 766616425 words, keeping 5148762 word types
INFO : PROGRESS: at sentence #65310000, processed 766731997 words, keeping 5149233 word types
INFO : PROGRESS: at sentence #65320000, processed 766846655 words, keeping 5149688 word types
INFO : PROGRESS: at sentence #65330000, processed 766961021 words, keeping 5150231 word types
INFO : PROGRESS: at sentence #65340000, processed 767075447 words, keeping 5150873 word types
INFO : PROGRESS: at sentence #65350000, processed 767187776 words, keeping 5151377 word types
INFO : PROGRESS: at sentence #65360000, processed 767303021 words, keeping 5151870 word types
INFO : PROGRESS: at sentence #65370000, processed 767418996 words, keeping 5152396 word types
INFO : PROGRESS: at sentence #65380000, processed 767536182 words, keeping 5152830 word types
INFO : PROGRESS: at sentence #65390000, processed 767649907 words, keeping 5153312 word types
INFO : PROGRESS: at sentence #65400000, processed 767766906 

INFO : PROGRESS: at sentence #66180000, processed 776719639 words, keeping 5192800 word types
INFO : PROGRESS: at sentence #66190000, processed 776835031 words, keeping 5193293 word types
INFO : PROGRESS: at sentence #66200000, processed 776952493 words, keeping 5193801 word types
INFO : PROGRESS: at sentence #66210000, processed 777069929 words, keeping 5194295 word types
INFO : PROGRESS: at sentence #66220000, processed 777191772 words, keeping 5194834 word types
INFO : PROGRESS: at sentence #66230000, processed 777308519 words, keeping 5195297 word types
INFO : PROGRESS: at sentence #66240000, processed 777425279 words, keeping 5195851 word types
INFO : PROGRESS: at sentence #66250000, processed 777542949 words, keeping 5196382 word types
INFO : PROGRESS: at sentence #66260000, processed 777659534 words, keeping 5196920 word types
INFO : PROGRESS: at sentence #66270000, processed 777779542 words, keeping 5197387 word types
INFO : PROGRESS: at sentence #66280000, processed 777898610 

INFO : PROGRESS: at sentence #67060000, processed 786971700 words, keeping 5236912 word types
INFO : PROGRESS: at sentence #67070000, processed 787088684 words, keeping 5237539 word types
INFO : PROGRESS: at sentence #67080000, processed 787203962 words, keeping 5238106 word types
INFO : PROGRESS: at sentence #67090000, processed 787316771 words, keeping 5238682 word types
INFO : PROGRESS: at sentence #67100000, processed 787429893 words, keeping 5239244 word types
INFO : PROGRESS: at sentence #67110000, processed 787541294 words, keeping 5239926 word types
INFO : PROGRESS: at sentence #67120000, processed 787654998 words, keeping 5240489 word types
INFO : PROGRESS: at sentence #67130000, processed 787769420 words, keeping 5241064 word types
INFO : PROGRESS: at sentence #67140000, processed 787881015 words, keeping 5241697 word types
INFO : PROGRESS: at sentence #67150000, processed 787998172 words, keeping 5242375 word types
INFO : PROGRESS: at sentence #67160000, processed 788116295 

INFO : PROGRESS: at sentence #67940000, processed 797156037 words, keeping 5283969 word types
INFO : PROGRESS: at sentence #67950000, processed 797274590 words, keeping 5284477 word types
INFO : PROGRESS: at sentence #67960000, processed 797390019 words, keeping 5284951 word types
INFO : PROGRESS: at sentence #67970000, processed 797505786 words, keeping 5285390 word types
INFO : PROGRESS: at sentence #67980000, processed 797621594 words, keeping 5285820 word types
INFO : PROGRESS: at sentence #67990000, processed 797734580 words, keeping 5286246 word types
INFO : PROGRESS: at sentence #68000000, processed 797849866 words, keeping 5286684 word types
INFO : PROGRESS: at sentence #68010000, processed 797964613 words, keeping 5287256 word types
INFO : PROGRESS: at sentence #68020000, processed 798080657 words, keeping 5287779 word types
INFO : PROGRESS: at sentence #68030000, processed 798196426 words, keeping 5288373 word types
INFO : PROGRESS: at sentence #68040000, processed 798310844 

INFO : PROGRESS: at sentence #68820000, processed 807269716 words, keeping 5332441 word types
INFO : PROGRESS: at sentence #68830000, processed 807387208 words, keeping 5333016 word types
INFO : PROGRESS: at sentence #68840000, processed 807510028 words, keeping 5333613 word types
INFO : PROGRESS: at sentence #68850000, processed 807626149 words, keeping 5334158 word types
INFO : PROGRESS: at sentence #68860000, processed 807741592 words, keeping 5334712 word types
INFO : PROGRESS: at sentence #68870000, processed 807857114 words, keeping 5335224 word types
INFO : PROGRESS: at sentence #68880000, processed 807975577 words, keeping 5335782 word types
INFO : PROGRESS: at sentence #68890000, processed 808087561 words, keeping 5336342 word types
INFO : PROGRESS: at sentence #68900000, processed 808194094 words, keeping 5336891 word types
INFO : PROGRESS: at sentence #68910000, processed 808307563 words, keeping 5337362 word types
INFO : PROGRESS: at sentence #68920000, processed 808422268 

INFO : PROGRESS: at sentence #69700000, processed 817275243 words, keeping 5379922 word types
INFO : PROGRESS: at sentence #69710000, processed 817391000 words, keeping 5380366 word types
INFO : PROGRESS: at sentence #69720000, processed 817507235 words, keeping 5380797 word types
INFO : PROGRESS: at sentence #69730000, processed 817623133 words, keeping 5381310 word types
INFO : PROGRESS: at sentence #69740000, processed 817740026 words, keeping 5381758 word types
INFO : PROGRESS: at sentence #69750000, processed 817855727 words, keeping 5382244 word types
INFO : PROGRESS: at sentence #69760000, processed 817969604 words, keeping 5382867 word types
INFO : PROGRESS: at sentence #69770000, processed 818089696 words, keeping 5383341 word types
INFO : PROGRESS: at sentence #69780000, processed 818205175 words, keeping 5383854 word types
INFO : PROGRESS: at sentence #69790000, processed 818320698 words, keeping 5384345 word types
INFO : PROGRESS: at sentence #69800000, processed 818437147 

INFO : PROGRESS: at sentence #70580000, processed 827442800 words, keeping 5425634 word types
INFO : PROGRESS: at sentence #70590000, processed 827559488 words, keeping 5426290 word types
INFO : PROGRESS: at sentence #70600000, processed 827673415 words, keeping 5426888 word types
INFO : PROGRESS: at sentence #70610000, processed 827784765 words, keeping 5427475 word types
INFO : PROGRESS: at sentence #70620000, processed 827898890 words, keeping 5427982 word types
INFO : PROGRESS: at sentence #70630000, processed 828009688 words, keeping 5428559 word types
INFO : PROGRESS: at sentence #70640000, processed 828125974 words, keeping 5429038 word types
INFO : PROGRESS: at sentence #70650000, processed 828242374 words, keeping 5429599 word types
INFO : PROGRESS: at sentence #70660000, processed 828357318 words, keeping 5430099 word types
INFO : PROGRESS: at sentence #70670000, processed 828474131 words, keeping 5430597 word types
INFO : PROGRESS: at sentence #70680000, processed 828592571 

INFO : PROGRESS: at sentence #71460000, processed 837545958 words, keeping 5471892 word types
INFO : PROGRESS: at sentence #71470000, processed 837663397 words, keeping 5472291 word types
INFO : PROGRESS: at sentence #71480000, processed 837778996 words, keeping 5472774 word types
INFO : PROGRESS: at sentence #71490000, processed 837896083 words, keeping 5473265 word types
INFO : PROGRESS: at sentence #71500000, processed 838014762 words, keeping 5473622 word types
INFO : PROGRESS: at sentence #71510000, processed 838130767 words, keeping 5474133 word types
INFO : PROGRESS: at sentence #71520000, processed 838248636 words, keeping 5474585 word types
INFO : PROGRESS: at sentence #71530000, processed 838363569 words, keeping 5475088 word types
INFO : PROGRESS: at sentence #71540000, processed 838481718 words, keeping 5475497 word types
INFO : PROGRESS: at sentence #71550000, processed 838601409 words, keeping 5475880 word types
INFO : PROGRESS: at sentence #71560000, processed 838718120 

INFO : PROGRESS: at sentence #72340000, processed 847667432 words, keeping 5517992 word types
INFO : PROGRESS: at sentence #72350000, processed 847785695 words, keeping 5518612 word types
INFO : PROGRESS: at sentence #72360000, processed 847900308 words, keeping 5519226 word types
INFO : PROGRESS: at sentence #72370000, processed 848012720 words, keeping 5519891 word types
INFO : PROGRESS: at sentence #72380000, processed 848124430 words, keeping 5520524 word types
INFO : PROGRESS: at sentence #72390000, processed 848238341 words, keeping 5521142 word types
INFO : PROGRESS: at sentence #72400000, processed 848348948 words, keeping 5521782 word types
INFO : PROGRESS: at sentence #72410000, processed 848462624 words, keeping 5522431 word types
INFO : PROGRESS: at sentence #72420000, processed 848576918 words, keeping 5522987 word types
INFO : PROGRESS: at sentence #72430000, processed 848692466 words, keeping 5523534 word types
INFO : PROGRESS: at sentence #72440000, processed 848802945 

INFO : PROGRESS: at sentence #73220000, processed 857844102 words, keeping 5559260 word types
INFO : PROGRESS: at sentence #73230000, processed 857961333 words, keeping 5559753 word types
INFO : PROGRESS: at sentence #73240000, processed 858070311 words, keeping 5560043 word types
INFO : PROGRESS: at sentence #73250000, processed 858185907 words, keeping 5560473 word types
INFO : PROGRESS: at sentence #73260000, processed 858300169 words, keeping 5560793 word types
INFO : PROGRESS: at sentence #73270000, processed 858415096 words, keeping 5561206 word types
INFO : PROGRESS: at sentence #73280000, processed 858526456 words, keeping 5562172 word types
INFO : PROGRESS: at sentence #73290000, processed 858637765 words, keeping 5562484 word types
INFO : PROGRESS: at sentence #73300000, processed 858755456 words, keeping 5562887 word types
INFO : PROGRESS: at sentence #73310000, processed 858872643 words, keeping 5563283 word types
INFO : PROGRESS: at sentence #73320000, processed 858988035 

INFO : PROGRESS: at sentence #74100000, processed 867911115 words, keeping 5603588 word types
INFO : PROGRESS: at sentence #74110000, processed 868024107 words, keeping 5604098 word types
INFO : PROGRESS: at sentence #74120000, processed 868136892 words, keeping 5604621 word types
INFO : PROGRESS: at sentence #74130000, processed 868249858 words, keeping 5605259 word types
INFO : PROGRESS: at sentence #74140000, processed 868362328 words, keeping 5605836 word types
INFO : PROGRESS: at sentence #74150000, processed 868478456 words, keeping 5606452 word types
INFO : PROGRESS: at sentence #74160000, processed 868589547 words, keeping 5607170 word types
INFO : PROGRESS: at sentence #74170000, processed 868700798 words, keeping 5607706 word types
INFO : PROGRESS: at sentence #74180000, processed 868815475 words, keeping 5608236 word types
INFO : PROGRESS: at sentence #74190000, processed 868931221 words, keeping 5608760 word types
INFO : PROGRESS: at sentence #74200000, processed 869044936 

INFO : PROGRESS: at sentence #74980000, processed 877861526 words, keeping 5655366 word types
INFO : PROGRESS: at sentence #74990000, processed 877974755 words, keeping 5655902 word types
INFO : PROGRESS: at sentence #75000000, processed 878088230 words, keeping 5656461 word types
INFO : PROGRESS: at sentence #75010000, processed 878202351 words, keeping 5656979 word types
INFO : PROGRESS: at sentence #75020000, processed 878316229 words, keeping 5657507 word types
INFO : PROGRESS: at sentence #75030000, processed 878431677 words, keeping 5658109 word types
INFO : PROGRESS: at sentence #75040000, processed 878545863 words, keeping 5658697 word types
INFO : PROGRESS: at sentence #75050000, processed 878662340 words, keeping 5659182 word types
INFO : PROGRESS: at sentence #75060000, processed 878769629 words, keeping 5659752 word types
INFO : PROGRESS: at sentence #75070000, processed 878880129 words, keeping 5660281 word types
INFO : PROGRESS: at sentence #75080000, processed 878987137 

INFO : PROGRESS: at sentence #75850000, processed 887733485 words, keeping 5699392 word types
INFO : PROGRESS: at sentence #75860000, processed 887852787 words, keeping 5699835 word types
INFO : PROGRESS: at sentence #75870000, processed 887966414 words, keeping 5700261 word types
INFO : PROGRESS: at sentence #75880000, processed 888083664 words, keeping 5701024 word types
INFO : PROGRESS: at sentence #75890000, processed 888198144 words, keeping 5701862 word types
INFO : PROGRESS: at sentence #75900000, processed 888314729 words, keeping 5702560 word types
INFO : PROGRESS: at sentence #75910000, processed 888433124 words, keeping 5703025 word types
INFO : PROGRESS: at sentence #75920000, processed 888554370 words, keeping 5703481 word types
INFO : PROGRESS: at sentence #75930000, processed 888668365 words, keeping 5704138 word types
INFO : PROGRESS: at sentence #75940000, processed 888790328 words, keeping 5704653 word types
INFO : PROGRESS: at sentence #75950000, processed 888909356 

INFO : PROGRESS: at sentence #76720000, processed 897814215 words, keeping 5746958 word types
INFO : PROGRESS: at sentence #76730000, processed 897934105 words, keeping 5747449 word types
INFO : PROGRESS: at sentence #76740000, processed 898054925 words, keeping 5747930 word types
INFO : PROGRESS: at sentence #76750000, processed 898172063 words, keeping 5748340 word types
INFO : PROGRESS: at sentence #76760000, processed 898287513 words, keeping 5748890 word types
INFO : PROGRESS: at sentence #76770000, processed 898401892 words, keeping 5749437 word types
INFO : PROGRESS: at sentence #76780000, processed 898516318 words, keeping 5749915 word types
INFO : PROGRESS: at sentence #76790000, processed 898633779 words, keeping 5750480 word types
INFO : PROGRESS: at sentence #76800000, processed 898749550 words, keeping 5750892 word types
INFO : PROGRESS: at sentence #76810000, processed 898865039 words, keeping 5751514 word types
INFO : PROGRESS: at sentence #76820000, processed 898979463 

INFO : PROGRESS: at sentence #77600000, processed 907803362 words, keeping 5796868 word types
INFO : PROGRESS: at sentence #77610000, processed 907911498 words, keeping 5797420 word types
INFO : PROGRESS: at sentence #77620000, processed 908027391 words, keeping 5797917 word types
INFO : PROGRESS: at sentence #77630000, processed 908138635 words, keeping 5798430 word types
INFO : PROGRESS: at sentence #77640000, processed 908251008 words, keeping 5798953 word types
INFO : PROGRESS: at sentence #77650000, processed 908366024 words, keeping 5799470 word types
INFO : PROGRESS: at sentence #77660000, processed 908478464 words, keeping 5799999 word types
INFO : PROGRESS: at sentence #77670000, processed 908592995 words, keeping 5800496 word types
INFO : PROGRESS: at sentence #77680000, processed 908708045 words, keeping 5801095 word types
INFO : PROGRESS: at sentence #77690000, processed 908822751 words, keeping 5801685 word types
INFO : PROGRESS: at sentence #77700000, processed 908938069 

INFO : PROGRESS: at sentence #78480000, processed 917893360 words, keeping 5841110 word types
INFO : PROGRESS: at sentence #78490000, processed 918008728 words, keeping 5841736 word types
INFO : PROGRESS: at sentence #78500000, processed 918121781 words, keeping 5842389 word types
INFO : PROGRESS: at sentence #78510000, processed 918234347 words, keeping 5842932 word types
INFO : PROGRESS: at sentence #78520000, processed 918346077 words, keeping 5843391 word types
INFO : PROGRESS: at sentence #78530000, processed 918460979 words, keeping 5843891 word types
INFO : PROGRESS: at sentence #78540000, processed 918574580 words, keeping 5844426 word types
INFO : PROGRESS: at sentence #78550000, processed 918691092 words, keeping 5844925 word types
INFO : PROGRESS: at sentence #78560000, processed 918804074 words, keeping 5845516 word types
INFO : PROGRESS: at sentence #78570000, processed 918921066 words, keeping 5845989 word types
INFO : PROGRESS: at sentence #78580000, processed 919041452 

INFO : PROGRESS: at sentence #79360000, processed 928050722 words, keeping 5883402 word types
INFO : PROGRESS: at sentence #79370000, processed 928165850 words, keeping 5883886 word types
INFO : PROGRESS: at sentence #79380000, processed 928280775 words, keeping 5884331 word types
INFO : PROGRESS: at sentence #79390000, processed 928394454 words, keeping 5884924 word types
INFO : PROGRESS: at sentence #79400000, processed 928512661 words, keeping 5885365 word types
INFO : PROGRESS: at sentence #79410000, processed 928623055 words, keeping 5885828 word types
INFO : PROGRESS: at sentence #79420000, processed 928733224 words, keeping 5886368 word types
INFO : PROGRESS: at sentence #79430000, processed 928845211 words, keeping 5886933 word types
INFO : PROGRESS: at sentence #79440000, processed 928960203 words, keeping 5887448 word types
INFO : PROGRESS: at sentence #79450000, processed 929075293 words, keeping 5887943 word types
INFO : PROGRESS: at sentence #79460000, processed 929189157 

INFO : PROGRESS: at sentence #80240000, processed 938103849 words, keeping 5928003 word types
INFO : PROGRESS: at sentence #80250000, processed 938216953 words, keeping 5928544 word types
INFO : PROGRESS: at sentence #80260000, processed 938331426 words, keeping 5929238 word types
INFO : PROGRESS: at sentence #80270000, processed 938446058 words, keeping 5929661 word types
INFO : PROGRESS: at sentence #80280000, processed 938557424 words, keeping 5930156 word types
INFO : PROGRESS: at sentence #80290000, processed 938669094 words, keeping 5930655 word types
INFO : PROGRESS: at sentence #80300000, processed 938783962 words, keeping 5931110 word types
INFO : PROGRESS: at sentence #80310000, processed 938898292 words, keeping 5931594 word types
INFO : PROGRESS: at sentence #80320000, processed 939016198 words, keeping 5932137 word types
INFO : PROGRESS: at sentence #80330000, processed 939130594 words, keeping 5932577 word types
INFO : PROGRESS: at sentence #80340000, processed 939240605 

INFO : PROGRESS: at sentence #81120000, processed 948108682 words, keeping 5970655 word types
INFO : PROGRESS: at sentence #81130000, processed 948220944 words, keeping 5971103 word types
INFO : PROGRESS: at sentence #81140000, processed 948336397 words, keeping 5971490 word types
INFO : PROGRESS: at sentence #81150000, processed 948439598 words, keeping 5972006 word types
INFO : PROGRESS: at sentence #81160000, processed 948544394 words, keeping 5972458 word types
INFO : PROGRESS: at sentence #81170000, processed 948662333 words, keeping 5972788 word types
INFO : PROGRESS: at sentence #81180000, processed 948769979 words, keeping 5973286 word types
INFO : PROGRESS: at sentence #81190000, processed 948882399 words, keeping 5973777 word types
INFO : PROGRESS: at sentence #81200000, processed 948991810 words, keeping 5974393 word types
INFO : PROGRESS: at sentence #81210000, processed 949102494 words, keeping 5975072 word types
INFO : PROGRESS: at sentence #81220000, processed 949217098 

INFO : PROGRESS: at sentence #82000000, processed 958206246 words, keeping 6010869 word types
INFO : PROGRESS: at sentence #82010000, processed 958317968 words, keeping 6011388 word types
INFO : PROGRESS: at sentence #82020000, processed 958433405 words, keeping 6011834 word types
INFO : PROGRESS: at sentence #82030000, processed 958543647 words, keeping 6012449 word types
INFO : PROGRESS: at sentence #82040000, processed 958654485 words, keeping 6012836 word types
INFO : PROGRESS: at sentence #82050000, processed 958761750 words, keeping 6013379 word types
INFO : PROGRESS: at sentence #82060000, processed 958873230 words, keeping 6013905 word types
INFO : PROGRESS: at sentence #82070000, processed 958984514 words, keeping 6014501 word types
INFO : PROGRESS: at sentence #82080000, processed 959097029 words, keeping 6015086 word types
INFO : PROGRESS: at sentence #82090000, processed 959209589 words, keeping 6015584 word types
INFO : PROGRESS: at sentence #82100000, processed 959320671 

INFO : PROGRESS: at sentence #82880000, processed 968284712 words, keeping 6051623 word types
INFO : PROGRESS: at sentence #82890000, processed 968401530 words, keeping 6052044 word types
INFO : PROGRESS: at sentence #82900000, processed 968518278 words, keeping 6052461 word types
INFO : PROGRESS: at sentence #82910000, processed 968633900 words, keeping 6052852 word types
INFO : PROGRESS: at sentence #82920000, processed 968749787 words, keeping 6053217 word types
INFO : PROGRESS: at sentence #82930000, processed 968867944 words, keeping 6053580 word types
INFO : PROGRESS: at sentence #82940000, processed 968985480 words, keeping 6053966 word types
INFO : PROGRESS: at sentence #82950000, processed 969101348 words, keeping 6054352 word types
INFO : PROGRESS: at sentence #82960000, processed 969222619 words, keeping 6054701 word types
INFO : PROGRESS: at sentence #82970000, processed 969339086 words, keeping 6055137 word types
INFO : PROGRESS: at sentence #82980000, processed 969458385 

INFO : PROGRESS: at sentence #83760000, processed 978614894 words, keeping 6087205 word types
INFO : PROGRESS: at sentence #83770000, processed 978733050 words, keeping 6087644 word types
INFO : PROGRESS: at sentence #83780000, processed 978851909 words, keeping 6088048 word types
INFO : PROGRESS: at sentence #83790000, processed 978971345 words, keeping 6088416 word types
INFO : PROGRESS: at sentence #83800000, processed 979090010 words, keeping 6088787 word types
INFO : collected 6089101 word types from a corpus of 979182445 raw words and 83807921 sentences
INFO : max_final_vocab=3000000 and min_count=10 resulted in calc_min_count=3, effective_min_count=10
INFO : Loading a fresh vocabulary
INFO : effective_min_count=10 retains 971985 unique words (15% of original 6089101, drops 5117116)
INFO : effective_min_count=10 leaves 968571088 word corpus (98% of original 979182445, drops 10611357)
INFO : deleting the raw counts dictionary of 6089101 items
INFO : sample=0 downsamples 0 most-com

In [5]:
LOG.info('Saving word2vec for English wikipedia corpus')
# Persist the full Word2Vec model under a name tagged with the corpus
# variant and today's date (YYYY.MM.DD).
model_date_stamp = datetime.now().strftime('%Y.%m.%d')
model_filename = 'eng_wikipedia.{}.{}.vec'.format(corpus_characteristics, model_date_stamp)
eng_wikipedia.save(model_filename)

INFO : Saving word2vec for English wikipedia corpus
INFO : saving Word2Vec object under eng_wikipedia.processed.2019.03.17.vec, separately None
INFO : storing np array 'vectors' to eng_wikipedia.processed.2019.03.17.vec.wv.vectors.npy
INFO : not storing attribute vectors_norm
INFO : storing np array 'syn1neg' to eng_wikipedia.processed.2019.03.17.vec.trainables.syn1neg.npy
INFO : not storing attribute cum_table
INFO : saved eng_wikipedia.processed.2019.03.17.vec


In [6]:
# Write the training hyperparameters as JSON to a date-stamped file.
params_filename = 'eng_wikipedia.vec.{}.{}.params'.format(
    corpus_characteristics,
    datetime.now().strftime('%Y.%m.%d'),
)
with open(params_filename, mode='wt') as writer:
    writer.write(json.dumps(keyword_params))

### Persist the word vectors to disk
The goal is for the saved vectors to be loadable across platforms and from other languages.

In [7]:
# Build the date-stamped output name, then save only the model's word
# vectors (`.wv`) rather than the whole trainable model.
the_filename = 'eng_wikipedia.{}.{}.kv'.format(
    corpus_characteristics,
    datetime.now().strftime('%Y.%m.%d'),
)
word_vectors = eng_wikipedia.wv
# NOTE(review): save() appears to write gensim's own on-disk format; if
# non-gensim consumers must read these vectors, an export such as
# save_word2vec_format may be needed -- confirm.
word_vectors.save(the_filename)

INFO : saving Word2VecKeyedVectors object under eng_wikipedia.processed.2019.03.17.kv, separately None
INFO : storing np array 'vectors' to eng_wikipedia.processed.2019.03.17.kv.vectors.npy
INFO : not storing attribute vectors_norm
INFO : saved eng_wikipedia.processed.2019.03.17.kv


## Some QA

In [8]:
# QA spot check on the in-memory model: ten nearest neighbours of 'girl'.
eng_wikipedia.wv.most_similar(positive=['girl'], topn=10)

INFO : precomputing L2-norms of word weight vectors


[('boy', 0.7223787307739258),
 ('sweet-faced', 0.6542738676071167),
 ('kid', 0.64897620677948),
 ('woman', 0.6388858556747437),
 ('schoolgirl', 0.638203501701355),
 ('spinsterish', 0.6342185735702515),
 ('Hugheses', 0.6278829574584961),
 ('Sun-yi', 0.6258794665336609),
 ('Hye-shin', 0.6245971322059631),
 ('boyfriend', 0.6231043338775635)]

In [9]:
# QA spot check on the in-memory model: ten nearest neighbours of 'man'.
eng_wikipedia.wv.similar_by_word(word='man', topn=10)

[('woman', 0.6794941425323486),
 ('stranger', 0.6384934186935425),
 ('fornicator', 0.6273878216743469),
 ('men', 0.6252682209014893),
 ('Berlifitzing', 0.616712212562561),
 ('quiet-spoken', 0.6153212189674377),
 ('noble-minded', 0.6135879755020142),
 ('boy', 0.6049578189849854),
 ('woman-hater', 0.6043761968612671),
 ('true-hearted', 0.6017013788223267)]

In [10]:
# Reload the vectors saved by the "Persist the word vectors" cell to check
# the save/load round trip.
#
# Reuse `the_filename` from that cell instead of re-deriving it from
# datetime.now(): a re-derived name points at a non-existent file if the
# date has rolled over since the vectors were saved.
#
# mmap='r' asks gensim to memory-map the stored array read-only.
#
# NOTE: this rebinds `eng_wikipedia` from the trained Word2Vec model to the
# loaded KeyedVectors, so the cells below call it directly (no `.wv`).
eng_wikipedia = KeyedVectors.load(the_filename, mmap='r')

INFO : loading Word2VecKeyedVectors object from eng_wikipedia.processed.2019.03.17.kv
INFO : loading vectors from eng_wikipedia.processed.2019.03.17.kv.vectors.npy with mmap=r
INFO : setting ignored attribute vectors_norm to None
INFO : loaded eng_wikipedia.processed.2019.03.17.kv


In [11]:
# Repeat the 'man' query against the reloaded vectors; the neighbours
# should match the result obtained before saving.
eng_wikipedia.most_similar(positive=['man'], topn=10)

INFO : precomputing L2-norms of word weight vectors


[('woman', 0.6794941425323486),
 ('stranger', 0.6384934186935425),
 ('fornicator', 0.6273878216743469),
 ('men', 0.6252682209014893),
 ('Berlifitzing', 0.616712212562561),
 ('quiet-spoken', 0.6153212189674377),
 ('noble-minded', 0.6135879755020142),
 ('boy', 0.6049578189849854),
 ('woman-hater', 0.6043761968612671),
 ('true-hearted', 0.6017013788223267)]

In [12]:
# QA spot check (verb): ten nearest neighbours of 'sing'.
eng_wikipedia.most_similar(positive=['sing'], topn=10)

[('singing', 0.7671092748641968),
 ('sang', 0.6991115808486938),
 ('sings', 0.6704493165016174),
 ('Menfolk', 0.6676279902458191),
 ('sung', 0.665298581123352),
 ('mezzos', 0.6444470882415771),
 ('chorus', 0.6287607550621033),
 ('Evviva', 0.6282480955123901),
 ('Libiamo', 0.6190140247344971),
 ('sopranist', 0.6165405511856079)]

In [13]:
# QA spot check (adjective): ten nearest neighbours of 'poor'.
eng_wikipedia.most_similar(positive=['poor'], topn=10)

[('impoverished', 0.6119289398193359),
 ('poorer', 0.5838309526443481),
 ('inadequate', 0.5576444268226624),
 ('poverty-stricken', 0.5569161176681519),
 ('substandard', 0.555551290512085),
 ('abysmal', 0.5481146574020386),
 ('disadvantaged', 0.5454990863800049),
 ('indifferent', 0.5431267023086548),
 ('mediocre', 0.5398264527320862),
 ('wretched', 0.535747766494751)]

In [14]:
# QA spot check: ten nearest neighbours of 'cook'.
eng_wikipedia.most_similar(positive=['cook'], topn=10)

[('cooks', 0.7281091213226318),
 ('cooking', 0.6575371623039246),
 ('cooked', 0.6321240663528442),
 ('chef', 0.6286415457725525),
 ('dishwasher', 0.6188214421272278),
 ('short-order', 0.613112211227417),
 ('Elmé', 0.6110155582427979),
 ('Ranhofer', 0.5976795554161072),
 ('broil', 0.5925281643867493),
 ('meal', 0.5914445519447327)]

In [15]:
# QA spot check: ten nearest neighbours of 'awkward'.
eng_wikipedia.similar_by_word(word='awkward', topn=10)

[('clumsy', 0.7073782086372375),
 ('awkwardness', 0.6625166535377502),
 ('uncomfortable', 0.6459647417068481),
 ('scatter-brained', 0.6426798105239868),
 ('hard-to-follow', 0.6413736343383789),
 ('annoying', 0.6355262994766235),
 ('overfamiliar', 0.6307566165924072),
 ('likeably', 0.6297885775566101),
 ('irritatingly', 0.6256672143936157),
 ('annoyingly', 0.6219601631164551)]

## History of feedback into the Corpus Processing step
One run produced:

eng_wikipedia.similar_by_word('citizen')

[('citizenship', 0.6755009889602661),
 ('citizens', 0.6596301198005676),
 ('non-citizen', 0.6446812152862549),
 ('Steinkoanler', 0.639006495475769),
 ('BOTC', 0.6191369295120239),
 ('CUKC', 0.6105862259864807),
 ('Non-citizens', 0.6045684814453125),
 ('non-citizens', 0.6042179465293884),
 ('noncitizen', 0.5995326042175293),
 ('citizenships', 0.5991703271865845)]
 
 Here you can see that the capitalized 'Non-citizens' should not have appeared as a separate entry, since the lower-case form 'non-citizens' is also prevalent in the corpus.