In [1]:
import os
import numpy as np
from nltk.corpus import stopwords
from nltk.collocations import  *
from nltk import Text
import nltk

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/maxim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
assets_dir = os.path.realpath("../assets/annotated-corpus")
train_dir = os.path.join(assets_dir, "test")

In [4]:
topics = os.listdir(train_dir)

In [5]:
sentences = []
for t in topics:
    workdir = os.path.join(train_dir, t)
    for filename in os.listdir(workdir):
        with open(os.path.join(workdir, filename)) as f:
            lines = "".join(f.readlines())
            sentences_raw = lines.split("\n\n")
            for sentence in sentences_raw:
                words = sentence.split("\n")
                if len(words) == 0 or words[0] == "":
                    continue
                stems_raw = list(map(lambda x: x.split("\t")[1], words))
                lemmas = list(map(lambda x: x.split("\t")[2], words))
                stems = []
                for i in range(len(stems_raw)):
                    if lemmas[i] not in stopwords.words("english"):
                        stems.append(stems_raw[i])
                sentences.append(stems)

In [6]:
len(sentences)

16277

In [7]:
ngram_length = 3

In [8]:
ngrams = []
word_count = {}
ngrams_count = {}
for sentence in sentences:
    counter = 0
    for word in sentence:
        if word not in word_count.keys():
            word_count[word] = 0
        word_count[word] += 1
        counter += 1
    if counter < ngram_length:
        continue
    for i in range(len(sentence) - ngram_length + 1):
        ngram = tuple(sentence[i:i + ngram_length])
        if ngram not in ngrams_count.keys():
            ngrams_count[ngram] = 0
        ngrams_count[ngram] += 1
        ngrams.append(ngram)

In [9]:
len(ngrams)

174818

In [10]:
sorted(word_count.items(), key=lambda x: -x[1])[:30]

[('39', 2802),
 ('new', 1349),
 ('said', 1312),
 ('has', 1219),
 ('reuter', 1148),
 ('ap', 1038),
 ('year', 948),
 ('was', 855),
 ('us', 850),
 ('gt', 760),
 ('lt', 753),
 ('quot', 700),
 ('two', 672),
 ('compani', 660),
 ('first', 657),
 ('say', 554),
 ('one', 549),
 ('world', 536),
 ('report', 532),
 ('u', 511),
 ('monday', 510),
 ('game', 509),
 ('tuesday', 505),
 ('1', 472),
 ('state', 466),
 ('thursday', 466),
 ('win', 461),
 ('wednesday', 453),
 ('inc', 449),
 ('2', 448)]

In [11]:
sorted(ngrams_count.items(), key=lambda x: -x[1])[:30]

[(('lt', 'b', 'gt'), 348),
 (('b', 'gt', 'lt'), 174),
 (('gt', 'lt', 'b'), 174),
 (('new', 'york', 'reuter'), 141),
 (('lt', 'href', 'http'), 114),
 (('href', 'http', 'www'), 110),
 (('http', 'www', 'investor'), 96),
 (('www', 'investor', 'reuter'), 96),
 (('investor', 'reuter', 'com'), 96),
 (('reuter', 'com', 'fullquot'), 96),
 (('com', 'fullquot', 'aspx'), 96),
 (('fullquot', 'aspx', 'ticker'), 96),
 (('target', 'stock', 'quickinfo'), 96),
 (('stock', 'quickinfo', 'fullquot'), 96),
 (('quickinfo', 'fullquot', 'gt'), 96),
 (('n', 'lt', 'gt'), 72),
 (('quot', 'profil', 'research'), 71),
 (('n', 'target', 'stock'), 70),
 (('lt', 'p', 'gt'), 66),
 (('inc', 'lt', 'href'), 58),
 (('n', 'quot', 'profil'), 40),
 (('boston', 'red', 'sox'), 39),
 (('gt', 'lt', 'font'), 35),
 (('p', 'gt', 'lt'), 33),
 (('york', 'reuter', 'u'), 33),
 (('gt', 'lt', 'p'), 31),
 (('presid', 'vladimir', 'putin'), 25),
 (('lt', 'font', 'face'), 23),
 (('font', 'face', 'verdana'), 23),
 (('face', 'verdana', 'san'), 2

In [12]:
total_words = sum(word_count.values())
total_words

207236

### f(n, c) - ngrams_count[ngram], частота встречаемости ключевого слова n в паре с коллокатом c;
### N - total_words, общее число словоупотреблений в корпусе (тексте);
### П_i f(u_i) - count_mul_result, Произведение абсолютных частот i-й униграммы в 3-грамме

In [13]:
ngram_score = {}
for ngram in set(ngrams):
    count_mul_result = 1
    for word in ngram:
        count_mul_result *= word_count[word]
    ngram_score[ngram] = np.log2(ngrams_count[ngram] * (total_words**(ngram_length-1)) / count_mul_result)
sorted(ngram_score.items(), key=lambda x: -x[1])[0:60]

[(('mcteer', 'lonesom', 'dove'), 35.321830233977494),
 (('exot', 'melaleuca', 'iguana'), 35.321830233977494),
 (('lonesom', 'dove', 'aggi'), 35.321830233977494),
 (('terin', 'humphrey', 'annia'), 35.321830233977494),
 (('thelma', 'drake', 'norfolk'), 35.321830233977494),
 (('nr', 'narayana', 'murthi'), 35.321830233977494),
 (('unsign', 'adewal', 'ogunley'), 35.321830233977494),
 (('ellen', 'zane', 'oversaw'), 35.321830233977494),
 (('mou', 'tamanthi', 'hydroelectr'), 35.321830233977494),
 (('laserjet', '4345mfp', 'multifunct'), 35.321830233977494),
 (('sarwan', 'shivnarin', 'chanderpaul'), 35.321830233977494),
 (('781', '442', '0750'), 35.321830233977494),
 (('drool', 'alt', 'rocker'), 35.321830233977494),
 (('binti', 'pengiran', 'salleh'), 35.321830233977494),
 (('suitor', 'foodland', 'foa'), 35.321830233977494),
 (('bb', 'lob', 'avg'), 35.321830233977494),
 (('nokiajoinssecuredigitalindustrygroup', '2100', '1039_3'),
  35.321830233977494),
 (('troi', 'rivier', 'que'), 35.321830233977

In [14]:
text = []
for sentence in sentences:
    text += sentence
finder = TrigramCollocationFinder.from_words(Text(text))
finder.nbest(nltk.collocations.TrigramAssocMeasures().mi_like, 30)

[('1913', 'doesnt', 'clearcut'),
 ('1x1', 'ord', '200301151450'),
 ('2100', '1039_3', '5365922'),
 ('4345mfp', 'multifunct', 'copier'),
 ('563', 'kph', 'vampir'),
 ('781', '442', '0750'),
 ('azahari', 'noordin', 'moh'),
 ('bb', 'lob', 'avg'),
 ('bink', 'lookalik', 'gungan'),
 ('binti', 'pengiran', 'salleh'),
 ('blatant', 'disregard', 'hilari'),
 ('cna', 'academia', 'sinica'),
 ('drool', 'alt', 'rocker'),
 ('ellen', 'zane', 'oversaw'),
 ('exot', 'melaleuca', 'iguana'),
 ('folger', 'espresso', 'dunkin'),
 ('fourier', 'spectromet', 'pfs'),
 ('fsb', 'fud', 'foi'),
 ('gino', 'guidug', 'guh'),
 ('guidug', 'guh', 'doo'),
 ('humidor', 'darth', 'vader'),
 ('intravascular', 'coagul', 'dic'),
 ('inver', 'caledonian', 'thistl'),
 ('jane', 'westborough', 'woke'),
 ('jarkko', 'nieminen', 'overpow'),
 ('jo', 'wilfri', 'tsonga'),
 ('kandanski', '781', '442'),
 ('klien', 'vitantonio', 'liuzzi'),
 ('ku', 'klux', 'klan'),
 ('laserjet', '4345mfp', 'multifunct')]