In [1]:
import os
import nltk
import numpy as np
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Mangarakov
[nltk_data]     Alexandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
assets_dir = os.path.realpath("../assets/annotated-corpus")
train_dir = os.path.join(assets_dir, "test")

In [4]:
topics = os.listdir(train_dir)

In [5]:
sentences = []
for t in topics:
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        with open(os.path.join(workdir, filename)) as f:
            lines = "".join(f.readlines())
            sentences_raw = lines.split("\n\n")
            for s in sentences_raw:
                words = s.split("\n")
                if len(words) == 0 or words[0] == "":
                    continue
                stems_raw = list(map(lambda x: x.split("\t")[1], words))
                lemmas = list(map(lambda x: x.split("\t")[2], words))
                stems = []
                for i in range(len(stems_raw)):
                    if lemmas[i] not in stopwords.words("english"):
                        stems.append(stems_raw[i])
                sentences.append(stems)


In [6]:
len(sentences)

16277

In [7]:
ngram_length = 3

In [8]:
ngrams = []
word_count = {}
ngrams_count = {}
for s in sentences:
    counter = 0
    for w in s:
        if w not in word_count.keys():
            word_count[w] = 0
        word_count[w] += 1
        counter += 1
    if counter < ngram_length:
        continue
    for i in range(len(s) - ngram_length + 1):
        ngram = tuple(s[i:i+ngram_length])
        if ngram not in ngrams_count.keys():
            ngrams_count[ngram] = 0
        ngrams_count[ngram] += 1
        ngrams.append(ngram)


In [9]:
len(ngrams)

170813

In [10]:
sorted(word_count.items(), key=lambda x: -x[1])[:30]

[('39', 2802),
 ('new', 1349),
 ('said', 1312),
 ('has', 1219),
 ('reuter', 1040),
 ('ap', 1038),
 ('year', 948),
 ('was', 855),
 ('us', 848),
 ('quot', 700),
 ('two', 672),
 ('compani', 660),
 ('first', 657),
 ('say', 554),
 ('one', 549),
 ('world', 536),
 ('report', 532),
 ('u', 511),
 ('monday', 510),
 ('game', 509),
 ('tuesday', 505),
 ('thursday', 466),
 ('state', 466),
 ('1', 464),
 ('win', 461),
 ('wednesday', 453),
 ('inc', 449),
 ('plan', 446),
 ('week', 445),
 ('time', 441)]

In [11]:
sorted(ngrams_count.items(), key=lambda x: -x[1])[:30]

[(('new', 'york', 'reuter'), 141),
 (('quot', 'profil', 'research'), 71),
 (('n', 'quot', 'profil'), 40),
 (('boston', 'red', 'sox'), 39),
 (('york', 'reuter', 'u'), 33),
 (('presid', 'vladimir', 'putin'), 25),
 (('world', '39', 'largest'), 23),
 (('georg', 'w', 'bush'), 22),
 (('new', 'york', 'yanke'), 22),
 (('prime', 'minist', 'ariel'), 21),
 (('minist', 'ariel', 'sharon'), 21),
 (('intern', 'space', 'station'), 20),
 (('reuter', 'u', 'stock'), 19),
 (('reuter', 'oil', 'price'), 19),
 (('st', 'loui', 'cardin'), 18),
 (('secur', 'exchang', 'commiss'), 18),
 (('russian', 'presid', 'vladimir'), 17),
 (('crude', 'oil', 'price'), 17),
 (('delta', 'air', 'line'), 17),
 (('third', 'quarter', 'earn'), 17),
 (('high', 'oil', 'price'), 16),
 (('major', 'leagu', 'basebal'), 16),
 (('secretari', 'state', 'colin'), 16),
 (('prime', 'minist', 'toni'), 15),
 (('minist', 'toni', 'blair'), 15),
 (('world', 'cup', 'qualifi'), 15),
 (('washington', 'reuter', 'u'), 15),
 (('world', '39', 'biggest'), 15

In [12]:
total_words = sum(word_count.values())
total_words


203230

### f(n, c) - ngrams_count[ngram], частота встречаемости ключевого слова n в паре с коллокатом c;
### N - total_words, общее число словоупотреблений в корпусе (тексте);
### П_i f(u_i) - count_mul_result, Произведение абсолютных частот i-й униграммы в 3-грамме

In [13]:
ngram_score = {}
for ngram in set(ngrams):
    count_mul_result = 1
    for word in ngram:
        count_mul_result *= word_count[word]
    ngram_score[ngram] = np.log2(ngrams_count[ngram] * (total_words**(ngram_length-1)) / count_mul_result)
sorted(ngram_score.items(), key=lambda x: -x[1])[0:30]


[(('tung', 'chee', 'hwa'), 35.26550771428075),
 (('kandanski', '781', '442'), 35.26550771428075),
 (('shadi', 'nook', 'cranni'), 35.26550771428075),
 (('klien', 'vitantonio', 'liuzzi'), 35.26550771428075),
 (('macaca', 'munzala', 'arunach'), 35.26550771428075),
 (('unsign', 'adewal', 'ogunley'), 35.26550771428075),
 (('intravascular', 'coagul', 'dic'), 35.26550771428075),
 (('ku', 'klux', 'klan'), 35.26550771428075),
 (('thelma', 'drake', 'norfolk'), 35.26550771428075),
 (('gino', 'guidug', 'guh'), 35.26550771428075),
 (('petroliam', 'nasion', 'bhd'), 35.26550771428075),
 (('pickoff', 'cutoff', 'bunt'), 35.26550771428075),
 (('palett', 'pastel', 'hue'), 35.26550771428075),
 (('bink', 'lookalik', 'gungan'), 35.26550771428075),
 (('humidor', 'darth', 'vader'), 35.26550771428075),
 (('exot', 'melaleuca', 'iguana'), 35.26550771428075),
 (('sarwan', 'shivnarin', 'chanderpaul'), 35.26550771428075),
 (('munzala', 'arunach', 'macaqu'), 35.26550771428075),
 (('olympiqu', 'marseill', '1993'), 35

In [14]:
ngram_score = {}
for ngram in set(ngrams):
    count_sum_log_result = 0
    for word in ngram:
        count_sum_log_result += np.log2(word_count[word])
    ngram_score[ngram] = np.log2(ngrams_count[ngram]) + (ngram_length-1)*np.log2(total_words) - count_sum_log_result
sorted(ngram_score.items(), key=lambda x: -x[1])[30:60]

[(('jane', 'westborough', 'woke'), 35.26550771428075),
 (('nr', 'narayana', 'murthi'), 35.26550771428075),
 (('sher', 'bahadur', 'deuba'), 35.26550771428075),
 (('yu', 'shyi', 'kun'), 35.26550771428075),
 (('guidug', 'guh', 'doo'), 35.26550771428075),
 (('troi', 'rivier', 'que'), 35.26550771428075),
 (('netinfomanag', 'postfix', 'serveradmin'), 35.26550771428075),
 (('lonesom', 'dove', 'aggi'), 35.26550771428075),
 (('bb', 'lob', 'avg'), 35.26550771428075),
 (('mcteer', 'lonesom', 'dove'), 35.26550771428075),
 (('laserjet', '4345mfp', 'multifunct'), 35.26550771428075),
 (('folger', 'espresso', 'dunkin'), 35.26550771428075),
 (('middleborough', 'middleboro', 'cobra'), 35.26550771428075),
 (('781', '442', '0750'), 35.26550771428075),
 (('nesn', 'weei', 'lhp'), 35.26550771428075),
 (('drool', 'alt', 'rocker'), 35.26550771428075),
 (('cna', 'academia', 'sinica'), 35.26550771428075),
 (('fsb', 'fud', 'foi'), 35.26550771428075),
 (('fourier', 'spectromet', 'pfs'), 35.26550771428075),
 (('bla

In [15]:
from nltk.collocations import  *
from nltk import Text
import nltk
text = []
for s in sentences:
    text += s
finder = TrigramCollocationFinder.from_words(Text(text))
finder.nbest(nltk.collocations.TrigramAssocMeasures().mi_like, 30)

[('1913', 'doesnt', 'clearcut'),
 ('4345mfp', 'multifunct', 'copier'),
 ('781', '442', '0750'),
 ('axel', 'bugg', 'brasilia'),
 ('azahari', 'noordin', 'moh'),
 ('bb', 'lob', 'avg'),
 ('bink', 'lookalik', 'gungan'),
 ('binti', 'pengiran', 'salleh'),
 ('blatant', 'disregard', 'hilari'),
 ('cna', 'academia', 'sinica'),
 ('drool', 'alt', 'rocker'),
 ('ellen', 'zane', 'oversaw'),
 ('exot', 'melaleuca', 'iguana'),
 ('folger', 'espresso', 'dunkin'),
 ('fourier', 'spectromet', 'pfs'),
 ('fsb', 'fud', 'foi'),
 ('gino', 'guidug', 'guh'),
 ('guidug', 'guh', 'doo'),
 ('humidor', 'darth', 'vader'),
 ('intravascular', 'coagul', 'dic'),
 ('inver', 'caledonian', 'thistl'),
 ('jane', 'westborough', 'woke'),
 ('jarkko', 'nieminen', 'overpow'),
 ('jo', 'wilfri', 'tsonga'),
 ('kandanski', '781', '442'),
 ('klien', 'vitantonio', 'liuzzi'),
 ('kristen', 'philipkoski', 'headshak'),
 ('ku', 'klux', 'klan'),
 ('laserjet', '4345mfp', 'multifunct'),
 ('lonesom', 'dove', 'aggi')]