In [50]:
from collections import Counter

import nltk
import regex as re
from nltk.corpus import stopwords

nltk.download('stopwords')

# Read the corpus inside a context manager so the file handle is closed as
# soon as the lines are in memory rather than being left to the garbage collector.
with open("alt.atheism.tsv") as tsv_file:
    tsv = tsv_file.readlines()

# The token is the first tab-separated column of each row, lower-cased.
items = [row.replace("\n", '').split('\t')[0].lower() for row in tsv]

print('total:', len(items))
print(items[:5])



# Drop every token that starts with a punctuation character other than '-'.
# The pattern is a raw string because '\P' is not a valid Python string escape;
# the regex itself is unchanged ([^\P{P}-] == "punctuation except hyphen").
items = [x for x in items if not re.match(r'[^\P{P}-]+', x)]
print('\nafter regex:', len(items))
print(items[:5])


stop_words = set(stopwords.words('english'))
# Tokens were already lower-cased when they were extracted above, so they can
# be tested against the stop-word set directly.
items = [w for w in items if w not in stop_words]
print('\nafter removing stop words:', len(items))
print(items[:5])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


total: 176171
['archive-name', ':', 'atheism', '/', 'resources']

after regex: 155026
['archive-name', 'atheism', 'resources', 'alt-atheism-archive-name', 'resources']

after removing stop words: 77569
['archive-name', 'atheism', 'resources', 'alt-atheism-archive-name', 'resources']


In [51]:
def make_3_gram(items):
    """Split `items` into consecutive, non-overlapping triples.

    The elements are consumed three at a time, in order; a trailing remainder
    of one or two elements is discarded.

    Returns:
        list of 3-tuples.
    """
    # Passing the same iterator three times makes zip pull three successive
    # elements for every tuple it builds.
    cursor = iter(items)
    return list(zip(cursor, cursor, cursor))

# Demo on the first six tokens: chunking from start offsets 0, 1 and 2 and
# concatenating the results covers every overlapping trigram of that window.
_3_grams = [gram for start in range(3) for gram in make_3_gram(items[start:6])]
_3_grams




[('archive-name', 'atheism', 'resources'),
 ('alt-atheism-archive-name', 'resources', 'last-modified'),
 ('atheism', 'resources', 'alt-atheism-archive-name'),
 ('resources', 'alt-atheism-archive-name', 'resources')]

In [82]:
from operator import itemgetter

# Memo tables for get_frequency: one keyed by token, one keyed by trigram tuple.
frequency_cache, frequency_cache_n_grams = {}, {}

def get_frequency(frequency_cache, item, items):
    """Return how many times `item` occurs in `items`, memoised in `frequency_cache`.

    Args:
        frequency_cache: dict mapping an item to its previously computed count;
            updated in place on a miss.
        item: the value to count.
        items: sequence searched with ``items.count(item)`` on a cache miss.

    Returns:
        The cached count if `item` is already a key of `frequency_cache`,
        otherwise the freshly computed (and now cached) count.
    """
    if item in frequency_cache:
        return frequency_cache[item]

    occurrences = items.count(item)
    frequency_cache[item] = occurrences
    return occurrences

def t_score(items):
    """Compute the t-score of every overlapping trigram in `items`.

    For a trigram (w1, w2, w3) the score is

        (f - f1 * f2 * f3 / N**2) / sqrt(f)

    where f is the number of occurrences of the trigram, f1..f3 are the
    occurrence counts of the individual tokens in `items`, and N is
    ``len(items)``.

    All counts are derived from `items` on every call (no state is shared
    between calls), so scoring a different token list can never pick up
    stale frequencies.

    Args:
        items: indexable sequence of hashable tokens.

    Returns:
        list of ``(score, w1, w2, w3)`` tuples, one per trigram occurrence
        (repeated trigrams appear repeatedly). Empty when `items` has fewer
        than three tokens.
    """
    total = len(items)

    # Emit the trigrams in three strided passes (start offsets 0, 1, 2, step 3).
    # This covers every overlapping trigram and keeps the ordering that callers
    # see when equal scores are stably sorted.
    trigrams = [
        (items[i], items[i + 1], items[i + 2])
        for offset in range(3)
        for i in range(offset, total - 2, 3)
    ]

    # One linear pass each instead of a list.count() scan per distinct item.
    trigram_counts = Counter(trigrams)
    token_counts = Counter(items)

    result = []
    for _1, _2, _3 in trigrams:
        frequency = trigram_counts[(_1, _2, _3)]
        expected = (token_counts[_1] * token_counts[_2] * token_counts[_3]) / total ** 2
        score = (frequency - expected) / (frequency ** 0.5)
        result.append((score, _1, _2, _3))

    return result
        
# Rank all trigram rows by score (highest first), then drop repeated rows
# while keeping the first occurrence of each.
t_scored = list(
    dict.fromkeys(
        sorted(t_score(items[:]), key=itemgetter(0), reverse=True)
    )
)

In [81]:
# Display the first 30 rows of the ranked, de-duplicated t-score list.
t_scored[:30]

[(7.141360941903107, 'jon', 'livesey', 'writes'),
 (6.999981874537044, 'livesey@solntze.wpd.sgi', 'com', 'jon'),
 (6.999979655092601, 'com', 'jon', 'livesey'),
 (5.916076147094015, 'keith', 'allan', 'schneider'),
 (5.830948806305854, 'keith@cco.caltech.edu', 'keith', 'allan'),
 (5.656829709291775, 'allan', 'schneider', 'writes'),
 (5.376675682356701, 'writes', '|>', '|>'),
 (5.291501001083373, 'jaeger@buphy.bu.edu', 'gregg', 'jaeger'),
 (4.999971147507143, 'gregg', 'jaeger', 'writes'),
 (4.898970085158797, 'darice@yoyo.cc.monash', 'edu', 'au'),
 (4.898969504331362, 'bobbe@vice.ico.tek', 'com', 'robert'),
 (4.898969504331362, 'com', 'robert', 'beauchaine'),
 (4.690413735444997, 'bob', 'beauchaine', 'bobbe@vice.ico.tek'),
 (4.6904130899910035, 'au', 'fred', 'rice'),
 (4.690403977736855, 'edu', 'au', 'fred'),
 (4.690403463598875, 'beauchaine', 'bobbe@vice.ico.tek', 'com'),
 (4.582575295110016, 'sank', 'manhattan', 'sea'),
 (4.582575116893019, 'stay', 'blew', 'bronx'),
 (4.5825749881807445

In [93]:
import math 

def mi_score(items):
    """Compute the mutual-information score of every overlapping trigram in `items`.

    For a trigram (w1, w2, w3) the score is

        log2(f * N**2 / (f1 * f2 * f3))

    where f is the number of occurrences of the trigram, f1..f3 are the
    occurrence counts of the individual tokens in `items`, and N is
    ``len(items)``.

    All counts are derived from `items` on every call (no state is shared
    between calls), so scoring a different token list can never pick up
    stale frequencies.

    Args:
        items: indexable sequence of hashable tokens.

    Returns:
        list of ``(score, w1, w2, w3)`` tuples, one per trigram occurrence
        (repeated trigrams appear repeatedly). Empty when `items` has fewer
        than three tokens.
    """
    total = len(items)

    # Emit the trigrams in three strided passes (start offsets 0, 1, 2, step 3).
    # This covers every overlapping trigram and keeps the ordering that callers
    # see when equal scores are stably sorted.
    trigrams = [
        (items[i], items[i + 1], items[i + 2])
        for offset in range(3)
        for i in range(offset, total - 2, 3)
    ]

    # One linear pass each instead of a list.count() scan per distinct item.
    trigram_counts = Counter(trigrams)
    token_counts = Counter(items)

    result = []
    for _1, _2, _3 in trigrams:
        frequency = trigram_counts[(_1, _2, _3)]
        independent = token_counts[_1] * token_counts[_2] * token_counts[_3]
        score = math.log2((frequency * (total ** 2)) / independent)
        result.append((score, _1, _2, _3))

    return result
        
# Rank all trigram rows by score (highest first), then drop repeated rows
# while keeping the first occurrence of each.
mi_scored = list(
    dict.fromkeys(
        sorted(mi_score(items[:]), key=itemgetter(0), reverse=True)
    )
)

In [94]:
# Display the first 30 rows of the ranked, de-duplicated MI-score list.
mi_scored[:30]

[(32.48638516490747, 'deluxe', 'moulded', '3d'),
 (32.48638516490747, 'cathedral', 'beneath', 'oceans'),
 (32.48638516490747, 'dunkle', 'seite', 'des'),
 (32.48638516490747, 'astrology', 'graphology', 'pseudo-sciences'),
 (32.48638516490747,
  '7ltvtmvtu66nz6sbbpw9qkbjarby',
  's2sz9nf5htdii0r6sseypl0r6',
  '9bv9oke'),
 (32.48638516490747, 'mellish', 'fog', 'ic'),
 (32.48638516490747, '87', 'allergic', 'mollusks'),
 (32.48638516490747, 'cis', 'id#', '71611,365'),
 (32.48638516490747, 'buddism', 'instant', 'hislife'),
 (32.48638516490747, 'straight(', 'arrow', '17-year'),
 (32.48638516490747, 'sky', 'volcanoes', 'earthquakes'),
 (32.48638516490747, 'acme', 'bbs', '+64'),
 (32.48638516490747, 'replication', 'saves', 'replicators'),
 (32.48638516490747, 'al-qanawi', 'ref', 'bouhdiba'),
 (32.48638516490747, 'jaca', 'negra', 'luna'),
 (32.48638516490747, 'aceitunas', 'en', 'mi'),
 (32.48638516490747, 'aunque', 'sepa', 'los'),
 (32.48638516490747, 'muddleheaded', 'fourth-', 'reich-sophistiqu

In [87]:
import nltk
from nltk.collocations import *
from nltk.corpus import PlaintextCorpusReader

# Cross-check with NLTK: take the lower-cased first column of every row
# (no punctuation or stop-word filtering here) and list the 30 best trigrams by PMI.
tokens = [line.replace("\n", '').split('\t')[0].lower() for line in tsv]
text = nltk.Text(tokens)

trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder_thr = nltk.collocations.TrigramCollocationFinder.from_words(text)

finder_thr.nbest(trigram_measures.pmi, 30)

[('#what', 'mediated', 'thair'),
 (')",', 'clarendon', 'paperbacks'),
 ('1938', 'websters', 'nonetheless'),
 ('402', 'notre', 'dame'),
 ('7119', 'laurel', 'canyon'),
 ('844-7298', 'telex', '5560074'),
 ('93apr18124333@solan10.solan.unit', 'no>', 'cindy@solan10.solan.unit'),
 ('<1993apr16.130430', '1@ccsua.ctstateu.edu>', 'kellyb@ccsua.ctstateu.edu'),
 ('<cindy', '93apr18124333@solan10.solan.unit', 'no>'),
 ('<ednclark', '734054731@kraken>', 'ednclark@kraken.itc.gu'),
 ('<monack', '733980580@helium>', 'monack@helium.gas.uug'),
 ('[...numerous', 'ctrl-ls', 'deleted...hehehe...]'),
 ('[judges', '13.5]', '"...conceive'),
 ('_by_defending_myself_against_the_jew',
  ',_i_am_',
  'fighting_for_the_work_of_the_lord'),
 ('_the_wholly_babble', ':_the_users_guide_to_invisible_', '_pink_unicorns_'),
 ('abnormal', 'psych', 'ward'),
 ('aceitunas', 'en', 'mi'),
 ('acme', 'bbs', '+64'),
 ('anon', 'pts@ecl.psu.edu', 'ob'),
 ('aunque', 'sepa', 'los'),
 ('bury', 'strangers', 'in...'),
 ('cis', 'id#', '71