In [None]:
# Shell escape: print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.9.16


In [None]:
import regex as re
import os
import nltk
from nltk.corpus import stopwords
from google.colab import drive

# Mount Google Drive into the Colab VM; the corpus is read from there.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

# Stop-word lists used later when filtering lemmas.
nltk.download('stopwords')

# Root folder of the annotated corpus. It is built from the mount point so the
# path stays valid when the working directory is not /content: os.walk()
# ignores listing errors by default, so a relative path that no longer
# resolves would silently produce an empty corpus instead of raising.
assets_url = os.path.join(DRIVE_MOUNT_POINT, 'My Drive/Colab Notebooks/nlp-2023/assets/')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# English stop words as a set, so membership checks are constant-time on average.
stops = {*stopwords.words('english')}

In [None]:
from IPython.display import clear_output

# Matches runs of punctuation other than '-' (read: "not (non-punctuation or hyphen)").
# Raw string: '\P' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning. The \P{P} Unicode property class is a
# feature of the third-party `regex` module, which this notebook imports as `re`.
punctuation_re = re.compile(r'[^\P{P}-]+')

trigrams = []
trigram_frequences_dict = dict()
word_frequences_dict = dict()
S = 0  # number of lemma tokens kept after filtering

# Cleaned sentences are collected here and joined once after the loop;
# repeated `text += ...` keeps re-copying an ever-growing string.
sentence_lines = []

file_no = 0
for address, dirs, files in os.walk(assets_url):
  for name in files:
    file_no += 1
    # Explicit encoding: the default depends on the platform locale.
    with open(os.path.join(address, name), mode='r', encoding='utf-8') as annotated_document_file:
      # Sentences are separated by a blank line; each line inside a sentence is
      # one tab-separated annotation row.
      for sentence in annotated_document_file.read().split('\n\n'):
        lemms = []
        for annotation in sentence.split('\n'):
          # Presumably (word, stem, lemma) - only the third column is used.
          word_stem_lem = annotation.split('\t')
          if len(word_stem_lem) != 3:
            continue
          lemma = word_stem_lem[2]
          # Skip punctuation tokens.
          # NOTE(review): match() anchors at the start only, so a lemma that
          # merely begins with punctuation is discarded whole rather than
          # cleaned - confirm that this is intended.
          if punctuation_re.match(lemma):
            continue
          # Remove embedded punctuation, trim stray whitespace (otherwise keys
          # such as '20744 ' and '20744' are counted separately and padded stop
          # words slip through), and convert to lower case.
          lemma = punctuation_re.sub('', lemma).strip().lower()
          # Drop lemmas that ended up empty, and stop words.
          if not lemma or lemma in stops:
            continue
          lemms.append(lemma)
          # fill word frequencies dictionary
          word_frequences_dict[lemma] = word_frequences_dict.get(lemma, 0) + 1
          S += 1
        # Trigrams are built per sentence, so none spans a sentence boundary.
        for trigram in zip(lemms, lemms[1:], lemms[2:]):
          trigrams.append(trigram)
          # fill trigram frequencies dictionary
          trigram_frequences_dict[trigram] = trigram_frequences_dict.get(trigram, 0) + 1
        sentence_lines.append(' '.join(lemms) + '.\n')

# One cleaned sentence per line, each terminated by '.'.
text = ''.join(sentence_lines)
print(len(trigrams))

2832572


In [None]:
import math

# Mutual information of every observed trigram (w1, w2, w3):
#   MI = log2( f(w1 w2 w3) * S^2 / (f(w1) * f(w2) * f(w3)) )
# with f(.) the raw counts gathered while reading the corpus and S the total
# number of counted lemma tokens.
trigram_mi_scores_dict = {
  (first, second, third): math.log2(
    count * (S**2)
    / (word_frequences_dict[first] * word_frequences_dict[second] * word_frequences_dict[third])
  )
  for (first, second, third), count in trigram_frequences_dict.items()
}

In [None]:
# Rank all trigrams by the hand-computed MI score, highest first, and list the top 30.
# Sorting on the negated score is a stable descending sort, so ties keep
# their dictionary insertion order.
sorted_mi_scores = sorted(
  trigram_mi_scores_dict.items(),
  key=lambda scored_trigram: -scored_trigram[1],
)
print('top 1-30 trigrams by mine MI')
top_thirty = sorted_mi_scores[:30]
for trigram, mi_score in top_thirty:
  print(f"{trigram} - MI = {mi_score}")

top 1-30 trigrams by mine MI
('secondsfor', 'shiva', 'shankari') - MI = 43.4142033696431
('desa', 'rueng', 'bakjok') - MI = 43.4142033696431
('ratu', 'jope', 'seniloli') - MI = 43.4142033696431
('fadhil', 'muhsen', 'salom') - MI = 43.4142033696431
('prithviraj', 'chauhan', 'palam') - MI = 43.4142033696431
('champa', 'devi', 'shukla') - MI = 43.4142033696431
('prakash', 'sharan', 'mahat') - MI = 43.4142033696431
('mihd', 'kahr', 'zeye') - MI = 43.4142033696431
('realists', 'fantasist', 'rafat') - MI = 43.4142033696431
('abdellah', 'hawari', 'believedto') - MI = 43.4142033696431
('ewen', 'macaskill', 'inlondon') - MI = 43.4142033696431
('floridasaying', 'vais', 'aider') - MI = 43.4142033696431
('educationsuperintendent', 'inez', 'tenenbaum') - MI = 43.4142033696431
('20744 ', 'toshinari', 'takaoka') - MI = 43.4142033696431
('tunde', 'sanni', 'ilorin') - MI = 43.4142033696431
('verissimo', 'correia', 'seabra') - MI = 43.4142033696431
('madame', 'edmey', 'cimeus') - MI = 43.4142033696431
(

In [None]:
import nltk
from nltk.collocations import *
from nltk.corpus import PlaintextCorpusReader

trigram_measures = nltk.collocations.TrigramAssocMeasures()

# `text` is rebound to an nltk.Text further down (kept for any later cell that
# relies on it), so the raw string is remembered under its own name; re-running
# this cell then still has the string to work from instead of failing.
if isinstance(text, str):
  lemma_text = text

# `lemma_text` holds one cleaned sentence per line, each terminated by '.'.
# Tokenising the whole string with preserve_line=True treats it as a single
# sentence, so only the very last period is split off and every other
# sentence-final lemma keeps its dot glued on (e.g. 'press.'). Tokenising line
# by line detaches each terminator, which is then dropped.
sentence_tokens = [
  [token for token in nltk.word_tokenize(line, 'english', True) if token != '.']
  for line in lemma_text.splitlines()
]
tokens = [token for sentence in sentence_tokens for token in sentence]
print(tokens[:10])

text = nltk.Text(tokens)

#http://www.nltk.org/_modules/nltk/collocations.html
# from_documents pads between sentences, so no trigram spans a sentence
# boundary; from_words over the flat token stream would create such trigrams.
finder_thr = TrigramCollocationFinder.from_documents(sentence_tokens)

print('top 1-30 trigrams by nltk PMI')
# nbest returns the 30 highest-scoring trigrams under the PMI measure.
thirty_best_trigrams_nltk = finder_thr.nbest(trigram_measures.pmi, 30)

['alberta', 'say', 'public', 'inquiry', 'calgary', 'voting', 'scandal', 'press.', 'canadian', 'press']
top 1-30 trigrams by nltk PMI


In [None]:
# Display the 30 trigrams selected by NLTK's PMI ranking.
thirty_best_trigrams_nltk

[('0005173', 'miami050', '0004287'),
 ('0009875', 'buffalo040', '0005173'),
 ('20744', 'toshinari', 'takaoka'),
 ('318245', '2819', 'yearling.'),
 ('46645', 'spellsquot', 'googl'),
 ('7roger', 'sooley', 'compstar'),
 ('a=maw', 's=mobile20and20wireless20technology', 'o=fptgt'),
 ('abdellah', 'hawari', 'believedto'),
 ('abend', 'einem', 'offiziellen'),
 ('abolishes', 'preallocation', 'tyranny.'),
 ('accountsalmost', 'identically', 'devoting'),
 ('adatta', 'agli', 'ultimi'),
 ('adda', 'addb', 'addc'),
 ('ampex.', 'sonyand', 'ampexhave'),
 ('andkawasaki', 'kisen', 'kaisha'),
 ('andrival', 'kalle', 'palander'),
 ('anieres', 'cologny', 'carouge'),
 ('annuncia', 'nuova', 'tecnologiaquot'),
 ('anpac', 'unione', 'piloti.'),
 ('arepair', 'keycache', 'strate.'),
 ('arraylist3', 'adda', 'addb'),
 ('aspwiki', 'snipsnap', 'aswiki'),
 ('aswiki', 'egroupwarewiki', 'hiki'),
 ('bainum', 'outspent', 'mufi'),
 ('balsamic', 'viniggas', 'trollaxors'),
 ('bassiana', 'metarhizium', 'anisopliae.'),
 ('bayerisc