In [22]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import numpy as np
import nltk
import spacy
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
import pandas as pd
from tqdm import tqdm_notebook, tqdm_pandas
from collections import Counter
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.max_rows', 300)

In [12]:
tqdm_notebook().pandas()







In [9]:
nlp = en_core_web_lg.load()

In [16]:
df = pd.read_table("../../../../../tasks/02-structural-linguistics/blog2008.txt", names=['text'])[:]

In [17]:
df = df.assign(doc=df.text.progress_apply(nlp))




In [19]:
say_syns = set([
    "say", "tell", "speak", "claim", "communicate", "talk",
    "explain", "verbalize", "chat", "communicate", "describe",
    "express", "advertise", "broadcast", "declare", "name"])
def find_collocations(doc, say_syns=say_syns):
    collocations = []
    for tok in doc:
        if tok.lemma_ in say_syns and tok.pos == ss.VERB:
            for verb_child in tok.children:
                if verb_child.pos == ss.ADV and verb_child.text.endswith('ly'):
                    collocations.append((tok.lemma_, verb_child.lemma_))
                    for adv_child in verb_child.children:
                        if adv_child.dep == ss.conj:
                            collocations.append((tok.lemma_, adv_child.lemma_))
    return collocations

In [20]:
df = df.assign(collocations = df['doc'].progress_apply(find_collocations))




In [21]:
df[df['collocations'].progress_apply(lambda x: len(x) != 0)]




Unnamed: 0,text,doc,collocations
40,The electoral commission initially claimed that roughly a quarter of returning officers disappeared for 36 hours without announcing results and had switched off their mobile phones .,"(The, electoral, commission, initially, claimed, that, roughly, a, quarter, of, returning, officers, disappeared, for, 36, hours, without, announcing, results, and, had, switched, off, their, mobile, phones, .)","[(claim, initially)]"
176,The government still is n't saying definitively but it would appear that they are giving credence to reports that Bhutto was shot to death .,"(The, government, still, is, n't, saying, definitively, but, it, would, appear, that, they, are, giving, credence, to, reports, that, Bhutto, was, shot, to, death, .)","[(say, definitively)]"
264,These pages recently said goodbye to global warming .,"(These, pages, recently, said, goodbye, to, global, warming, .)","[(say, recently)]"
725,"Not overtly and not directly , but she will speak in code saying that Obama ca n't win .","(Not, overtly, and, not, directly, ,, but, she, will, speak, in, code, saying, that, Obama, ca, n't, win, .)","[(speak, overtly), (speak, directly)]"
1147,By the media continually telling the public that they want change it soon became a self-fulfilling prophecy .,"(By, the, media, continually, telling, the, public, that, they, want, change, it, soon, became, a, self, -, fulfilling, prophecy, .)","[(tell, continually)]"
1547,"In the name of academic freedom of speech , Ahmadinejad was invited and subsequently spoke to an appreciative audience at Columbia University in September .","(In, the, name, of, academic, freedom, of, speech, ,, Ahmadinejad, was, invited, and, subsequently, spoke, to, an, appreciative, audience, at, Columbia, University, in, September, .)","[(speak, subsequently)]"
1555,"When you bring this question up , you 're really saying , ' You 're a racist ' or ' Are you a racist ? '\nAnd the answer is , ' No , I 'm not a racist , ' he said .","( , When, you, bring, this, question, up, ,, you, 're, really, saying, ,, ', You, 're, a, racist, ', or, ', Are, you, a, racist, ?, ', \n, And, the, answer, is, ,, ', No, ,, I, ', m, not, a, racist, ,, ', , he, said, .)","[(say, really)]"
1782,"John McCain had one of his better debates , especially when talking about foreign policy and his record as a fiscal conservative .","(John, McCain, had, one, of, his, better, debates, ,, especially, when, talking, about, foreign, policy, and, his, record, as, a, fiscal, conservative, .)","[(talk, especially)]"
1795,"He cites a footnote in an essay by by Dr. Gerd Puin on p . 743 of Ibn Warraq 's 2002 book What The Koran Really Says : Angelika Neuwirth has given the impression the photographs taken in order to build up the "" Koran Archiv "" in Munich ... were destroyed at the end of World War II .","(He, cites, a, footnote, in, an, essay, by, by, Dr., Gerd, Puin, on, p, ., 743, of, Ibn, Warraq, 's, 2002, book, What, The, Koran, Really, Says, :, Angelika, Neuwirth, has, given, the, impression, the, photographs, taken, in, order, to, build, up, the, "", Koran, Archiv, "", in, Munich, ..., were,...","[(say, really)]"
2298,"Mrs. Clinton quickly said she had meant no slight , and on Monday she issued a statement proposing a truce .","(Mrs., Clinton, quickly, said, she, had, meant, no, slight, ,, and, on, Monday, she, issued, a, statement, proposing, a, truce, .)","[(say, quickly)]"


In [44]:
counters = {say_syn: Counter() for say_syn in say_syns}
for pairs in df['collocations']:
    for (say_syn, adv) in pairs:
        counters[say_syn].update([adv])

In [46]:
result = pd.DataFrame(
    [(say_syn, ",".join([adv for (adv, _) in counters[say_syn].most_common(10)])) for say_syn in say_syns],
    columns=['verb', 'most_used_adv'])
result

Unnamed: 0,verb,most_used_adv
0,broadcast,"widely,inadvertently,publicly,allegedly,clearly,nationally,presently,repeatedly"
1,declare,"publicly,falsely,suddenly,recently,officially,unequivocally,openly,proudly,triumphantly,ominously"
2,explain,"clearly,fully,patiently,probably,certainly,really,easily,exactly,recently,necessarily"
3,describe,"accurately,only,previously,specifically,adequately,thusly,really,perfectly,correctly,unfortunately"
4,talk,"directly,really,only,openly,actually,publicly,clearly,repeatedly,specifically,personally"
5,speak,"directly,publicly,only,fiercely,recently,generally,forcefully,openly,politically,loudly"
6,verbalize,
7,chat,"directly,coincidentally"
8,advertise,"heavily,aggressively,originally,openly,proudly,largely,previously"
9,claim,"falsely,previously,repeatedly,recently,initially,actually,absurdly,credibly,publicly,ludicrously"


In [45]:
result.to_csv('collocation_result.csv', index=False)