# Collocation & POS filtering
https://medium.com/@nicharuch/collocations-identifying-phrases-that-act-like-individual-words-in-nlp-f58a93a2f84a

In [4]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

import nltk
from nltk.collocations import *

import pandas as pd

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [2]:
df = pd.read_pickle("./news_df.pkl")
print(df['news_category'].value_counts())
politics_df = (df[df['news_category'] == 'sports'])

sports        874
national      791
politics      712
technology    628
world         441
business      377
Name: news_category, dtype: int64


In [6]:
# Maybe convert it to set
formatted_sports_data = []
for news in politics_df['news_article']:
    news = nlp(news)
    for word in news:
        term = word.text
        formatted_sports_data.append(term)

In [10]:
# change this to read in your data
finderBigram = BigramCollocationFinder.from_words(formatted_sports_data)
finderTrigram = TrigramCollocationFinder.from_words(formatted_sports_data)

In [18]:
bigramFreqTable = pd.DataFrame(list(finderBigram.ngram_fd.items()), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [24]:
bigramFreqTable[:10]

AttributeError: 'LazyConfigValue' object has no attribute 'lower'

           bigram  freq
215        (,, ")   531
210        (., ")   464
5       (in, the)   368
34       (., The)   268
139     (of, the)   220
406  (World, Cup)   204
64       (,, who)   198
221       (", he)   185
218    (added, .)   171
59       (-, old)   162

In [1]:
# Handling desieredd POS types
def rightTypes(ngram):
    
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [20]:
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [21]:

filtered_bi[:10]

AttributeError: 'LazyConfigValue' object has no attribute 'lower'

                 bigram  freq
406        (World, Cup)   204
62          (MS, Dhoni)    77
1859  (Mumbai, Indians)    68
461   (Premier, League)    65
849              (", ")    62
346      (Virat, Kohli)    59
1431  (Delhi, Capitals)    58
61        (captain, MS)    45
1705   (Knight, Riders)    42
1704  (Kolkata, Knight)    42

In [28]:
bigramPMITable = pd.DataFrame(list(finderBigram.score_ngrams(bigram_measures.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [29]:
bigramPMITable[:10]

AttributeError: 'LazyConfigValue' object has no attribute 'lower'

                 bigram        PMI
0       (Ashish, Nehra)  14.313828
4     (Maurizio, Sarri)  14.313828
5       (Old, Trafford)  14.313828
1          (Hong, Kong)  14.313828
3       (Manoj, Tiwary)  14.313828
2     (MA, Chidambaram)  14.313828
6        (Ballon, d'Or)  13.898790
7       (Sultan, Azlan)  13.898790
8        (Anil, Kumble)  13.898790
9  (Carlos, Brathwaite)  13.898790

In [33]:
bigramChiTable = pd.DataFrame(list(finderBigram.score_ngrams(bigram_measures.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [34]:

bigramChiTable.head(20)

AttributeError: 'LazyConfigValue' object has no attribute 'lower'

                      bigram        chi-sq
0            (Ashish, Nehra)  61096.000000
7            (Manoj, Tiwary)  61096.000000
11           (Sultan, Azlan)  61096.000000
10             (Ole, Gunnar)  61096.000000
9            (Old, Trafford)  61096.000000
8          (Maurizio, Sarri)  61096.000000
1             (Ballon, d'Or)  61096.000000
6          (MA, Chidambaram)  61096.000000
3               (Hong, Kong)  61096.000000
2          (Ethics, Officer)  61096.000000
5               (Lok, Sabha)  61096.000000
4           (Knight, Riders)  61096.000000
12  (Challengers, Bangalore)  58832.221812
13      (Royal, Challengers)  58832.221812
14                (La, Liga)  58040.249704
15       (Rajasthan, Royals)  56659.935327
16            (New, Zealand)  55131.802691
17              (Sri, Lanka)  53904.704148
18                    (_, _)  53219.537336
19                  (AB, de)  52948.132964