# Collocations

In [1]:

#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string

In [2]:
#load tweets data
tweets = pd.read_csv('tweets.csv')

In [3]:
tweets.head(2)

Unnamed: 0,tweet,Column2,Column3,Column4,Column5,Column6,TEXT
0,i should really update my avi soon.,,,,,,i should really update my avi soon.
1,i just rather for the truth to come out,,,,,,i just rather for the truth to come out


Extract only the tweet text (the `TEXT` column)...

In [4]:
comments = tweets['TEXT']
comments.describe()

count                        999972
unique                       902352
top       #jackandjackdoingitright 
freq                           2584
Name: TEXT, dtype: object

## Preprocessing

In [5]:
#function to remove non-ascii characters
def _removeNonAscii(s):
    """Return ``s`` with every character whose code point is 128 or higher removed."""
    ascii_only = filter(lambda ch: ord(ch) < 128, s)
    return "".join(ascii_only)

In [6]:
# Force every entry to str so the per-character cleaning in the next cell can iterate over it.
# NOTE(review): astype('str') turns missing values into the literal text 'nan'
# instead of dropping them — confirm that is acceptable for the analysis.
comments = comments.astype('str')

In [7]:
#remove non-ascii characters
# The helper already takes a single string, so it is handed to map directly.
comments = comments.map(_removeNonAscii)

In [8]:
#get stop words of all languages
# Maps each language file id in the NLTK stopwords corpus to the set of its stop words.
STOPWORDS_DICT = dict(
    (language, set(stopwords.words(language)))
    for language in stopwords.fileids()
)

In [9]:
#function to detect language based on # of stop words for particular language
def get_language(text):
    """Tell whether ``text`` looks like English, judged by stop-word overlap.

    The lower-cased text is tokenized with ``nltk.wordpunct_tokenize`` and its
    distinct tokens are intersected with each language's stop-word set in
    ``STOPWORDS_DICT``.

    Returns
    -------
    bool
        ``True`` when the language with the largest overlap is ``'english'``,
        ``False`` otherwise. On a tie, ``max`` keeps the first language in
        ``STOPWORDS_DICT`` iteration order.
    """
    tokens = set(nltk.wordpunct_tokenize(text.lower()))
    overlap_by_language = {
        language: len(tokens & vocabulary)
        for language, vocabulary in STOPWORDS_DICT.items()
    }
    best_language = max(overlap_by_language, key=overlap_by_language.get)
    return best_language == 'english'

In [10]:
#filter for only english comments
eng_comments=comments[comments.apply(get_language)]

In [11]:
eng_comments.head(10)

0                   i should really update my avi soon.
1               i just rather for the truth to come out
2                          i'm shooting charles barkley
3                         @kaylieaichner you're perfect
6     first 30seconds and i hate it! i got a ear inf...
7     but do not sleep late often because i will mak...
8                people criticize chris bosh too much..
9     up your followers here real aktif asli bukan o...
13                  its amazing. go get doing it right!
14    we had foods together and i wish we were bette...
Name: TEXT, dtype: object

In [12]:
#drop duplicates
# Rebind to the de-duplicated result rather than using inplace=True:
# eng_comments was derived from `comments` by boolean indexing, in-place
# mutation brings no performance benefit, and explicit rebinding keeps the
# lineage of the variable visible when the cell is re-run.
eng_comments = eng_comments.drop_duplicates()
eng_comments.describe()

count                                                481780
unique                                               481780
top       i use to think i was such a good judge of char...
freq                                                      1
Name: TEXT, dtype: object

In [13]:
#load spacy
# Only token lemmas are read from the pipeline later on, so the parser and NER
# components are disabled here.
# NOTE(review): 'en' is a spaCy v2-era shortcut name; spaCy v3 dropped shortcut
# links and expects a full package name such as 'en_core_web_sm'. Later cells
# also filter on the v2-style '-pron-' lemma — confirm the installed version.
nlp = spacy.load('en', disable=['parser','ner'])


In [14]:
#function to clean and lemmatize comments
# Compiled once when the cell runs instead of on every call: the pattern never
# changes, and clean_comments is mapped over every comment in the corpus.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')


def clean_comments(text):
    """Strip punctuation from ``text`` and return its spaCy lemmas.

    Parameters
    ----------
    text : object
        The comment to clean; it is passed through ``str()`` first.

    Returns
    -------
    list of str
        ``token.lemma_`` for every token spaCy produces, in document order.
        Lemmas are returned exactly as spaCy emits them: nothing is
        lower-cased here and whitespace tokens are kept.
    """
    #remove punctuations
    # Every punctuation mark, carriage return, tab and newline becomes one space.
    nopunct = _PUNCT_RE.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct)
    return [token.lemma_ for token in doc]

In [15]:
#apply function to clean and lemmatize comments
lemmatized = eng_comments.map(clean_comments)

In [16]:
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])

In [17]:
lemmatized.head()

0       [i, should, really, update, -pron-, avi, soon]
1    [i, just, rather, for, the, truth, to, come, out]
2                       [i, m, shoot, charle, barkley]
3              [ , kaylieaichner, -pron-, re, perfect]
6    [first, 30second, and, i, hate, -pron-,  , i, ...
Name: TEXT, dtype: object

In [18]:
#turn all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]

In [47]:
l=len(unlist_comments)
l

5679407

## Initialize NLTK's Bigrams Finder

In [19]:
bigrams = nltk.collocations.BigramAssocMeasures()


In [20]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)


## 1. Counting Frequencies of Adjacent Words
- Main idea: simply order by frequency
- Issues: the ranking is dominated by very frequent pairs, so pronouns, articles and prepositions come up often
- Solution: keep only bigrams whose words have certain part-of-speech (POS) tags

In [21]:
bigram_freq = bigramFinder.ngram_fd.items()

In [22]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [24]:
bigramFreqTable.head(100).reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(i, m)",39307
1,"(-pron-, be)",31231
2,"(don, t)",24904
3,"(-pron-, )",24219
4,"(-pron-, s)",24111
5,"( , i)",18766
6,"( , -pron-)",17031
7,"(be, -pron-)",13806
8,"(i, be)",12936
9,"(-pron-, re)",12282


In [25]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
16,"(i, m)",39307
86,"(-pron-, be)",31231
136,"(don, t)",24904
31,"(-pron-, )",24219
734,"(-pron-, s)",24111
32,"( , i)",18766
300,"( , -pron-)",17031
99,"(be, -pron-)",13806
1636,"(i, be)",12936
23,"(-pron-, re)",12282


In [42]:
#get english stopwords
en_stopwords = set(stopwords.words('english'))

In [66]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Decide whether ``ngram`` passes the token and part-of-speech filter.

    The n-gram is rejected when it contains any of the literal tokens
    ``'-pron-'``, ``''``, ``' '``, ``'t'``, ``'ta'`` or ``'  '``, or any word
    found in ``en_stopwords``. Otherwise it is tagged with ``nltk.pos_tag``
    and accepted only when the first tag is an adjective, noun, verb, adverb
    or ``PRP$`` tag and the second tag is a noun, verb, adverb or ``PRP$``
    tag.

    Returns
    -------
    bool
        ``True`` if the n-gram should be kept, ``False`` otherwise.
    """
    junk_tokens = ('-pron-', '', ' ', 't', 'ta', '  ')
    if any(junk in ngram for junk in junk_tokens):
        return False
    if any(word in en_stopwords for word in ngram):
        return False
    # Tags allowed in second position; the first position additionally allows adjectives.
    second_slot_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS', 'PRP$')
    first_slot_tags = ('JJ', 'JJR', 'JJS') + second_slot_tags
    tagged = nltk.pos_tag(ngram)
    # The second tag is only looked at when the first one is acceptable.
    return tagged[0][1] in first_slot_tags and tagged[1][1] in second_slot_tags

In [67]:
#filter bigrams
# rightTypes takes one n-gram and returns a bool, so it serves directly as the mask function.
keep_mask = bigramFreqTable['bigram'].map(rightTypes)
filtered_bi = bigramFreqTable[keep_mask]

In [68]:
filtered_bi[:100]

Unnamed: 0,bigram,freq
8224,"(please, follow)",1536
2575,"(fall, asleep)",1062
2696,"(go, back)",1030
859,"(come, back)",864
88,"(good, friend)",831
253,"(happy, birthday)",763
11959,"(even, know)",752
12465,"(wanna, go)",746
15482,"(good, thing)",692
2205,"(really, want)",635


In [79]:
# NOTE: only a cell's last expression is echoed, so this len() result is not displayed.
len(filtered_bi)
# First 100 rows of the filtered table, which was sorted by descending frequency.
top100=filtered_bi[:100]
top100.bigram.values

array([('please', 'follow'), ('fall', 'asleep'), ('go', 'back'),
       ('come', 'back'), ('good', 'friend'), ('happy', 'birthday'),
       ('even', 'know'), ('wanna', 'go'), ('good', 'thing'),
       ('really', 'want'), ('free', 'agency'), ('last', 'night'),
       ('first', 'time'), ('lgg3ph', 'simply'), ('canada', 'day'),
       ('get', 'back'), ('never', 'get'), ('luke', 'hemming'),
       ('love', 'u'), ('late', 'night'), ('go', 'get'), ('teen', 'wolf'),
       ('someone', 'else'), ('never', 'know'), ('good', 'morning'),
       ('look', 'forward'), ('long', 'time'), ('good', 'day'),
       ('good', 'night'), ('high', 'school'), ('ice', 'cream'),
       ('last', 'time'), ('u', 'get'), ('come', 'home'), ('go', 'buy'),
       ('new', 'song'), ('go', 'home'), ('holy', 'shit'), ('let', 'go'),
       ('thank', 'god'), ('world', 'cup'), ('new', 'month'),
       ('get', 'home'), ('many', 'people'), ('good', 'time'), ('na', 'go'),
       ('always', 'get'), ('na', 'get'), ('next', 'time'),


In [78]:
freq_bi = filtered_bi[:100].bigram.values
freq_bi[1][0]


'fall'

## 2. PMI
    

In [80]:
# Drop candidate bigrams that occur fewer than 100 times. This mutates
# bigramFinder itself, so the PMI and chi-square scoring done on the same
# finder in later cells only sees the surviving bigrams.
bigramFinder.apply_freq_filter(100)

In [81]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [83]:
bigramPMITable[:100]

Unnamed: 0,bigram,PMI
0,"(peanut, butter)",14.586649
1,"(hobby, lobby)",14.145231
2,"(taco, bell)",13.804351
3,"(nash, nash)",12.980102
4,"(michael, clifford)",12.823400
5,"(teen, wolf)",12.764447
6,"(ice, cream)",12.699349
7,"(tornado, warning)",12.631726
8,"(social, medium)",12.503249
9,"(social, network)",12.334934


In [85]:
pmi_bi = bigramPMITable[:100].bigram.values
pmi_bi

array([('peanut', 'butter'), ('hobby', 'lobby'), ('taco', 'bell'),
       ('nash', 'nash'), ('michael', 'clifford'), ('teen', 'wolf'),
       ('social', 'network'), ('lgg3ph', 'simply'), ('luke', 'hemming'),
       ('birth', 'control'), ('wide', 'awake'), ('difference', 'between'),
       ('free', 'agency'), ('my', 'gosh'), ('free', 'agent'),
       ('fall', 'asleep'), ('ha', 'ha'), ('awkward', 'moment'),
       ('nba', 'free'), ('oh', 'my'), ('new', 'york'), ('each', 'other'),
       ('my', 'god'), ('24', 'hour'), ('world', 'cup'), ('gon', 'na'),
       ('pay', 'attention'), ('july', '1st'), ('simply', 'because'),
       ('luke5sos', 'luke'), ('calm', 'down'), ('scary', 'movie'),
       ('look', 'forward'), ('high', 'school'), ('middle', 'school'),
       ('little', 'bit'), ('god', 'bless'), ('d', 'rather'),
       ('real', 'quick'), ('o', 'o'), ('every', 'single'), ('year', 'ago'),
       ('happy', 'birthday'), ('pretty', 'sure'), ('an', 'idiot'),
       ('anyone', 'else'), ('canada'

## 3. Chi-Square

In [94]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [98]:
# Keep only the chi-square rows whose bigram passes the rightTypes filter.
bigramChiTable = bigramChiTable[bigramChiTable['bigram'].map(rightTypes)]

In [99]:
bigramChiTable.head(100)

Unnamed: 0,bigram,chi-sq
1,"(hobby, lobby)",3.080174e+06
2,"(teen, wolf)",2.956892e+06
3,"(taco, bell)",2.803896e+06
4,"(peanut, butter)",2.681851e+06
5,"(ice, cream)",2.374153e+06
6,"(lgg3ph, simply)",2.359048e+06
8,"(luke, hemming)",2.109776e+06
9,"(fall, asleep)",1.425335e+06
10,"(free, agency)",1.396259e+06
11,"(social, medium)",1.393127e+06


In [100]:
chi_bi = bigramChiTable[:100].bigram.values
chi_bi

array([('hobby', 'lobby'), ('teen', 'wolf'), ('taco', 'bell'),
       ('peanut', 'butter'), ('ice', 'cream'), ('lgg3ph', 'simply'),
       ('luke', 'hemming'), ('fall', 'asleep'), ('free', 'agency'),
       ('social', 'medium'), ('nash', 'nash'), ('michael', 'clifford'),
       ('social', 'network'), ('free', 'agent'), ('world', 'cup'),
       ('happy', 'birthday'), ('awkward', 'moment'), ('please', 'follow'),
       ('look', 'forward'), ('high', 'school'), ('ha', 'ha'),
       ('new', 'york'), ('canada', 'day'), ('luke5sos', 'luke'),
       ('pay', 'attention'), ('year', 'ago'), ('late', 'night'),
       ('anyone', 'else'), ('little', 'bit'), ('holy', 'shit'),
       ('scary', 'movie'), ('someone', 'else'), ('last', 'night'),
       ('god', 'bless'), ('real', 'quick'), ('happy', 'canada'),
       ('next', 'week'), ('come', 'back'), ('make', 'sense'),
       ('middle', 'school'), ('good', 'luck'), ('new', 'month'),
       ('far', 'away'), ('wait', 'till'), ('new', 'song'),
       ('eve

## Bigram Comparison

In [111]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, chi_bi]).T

In [112]:
bigramsCompare.columns = ['Frequency With Filter', 'PMI',  'Chi-Sq Test']

In [115]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,Chi-Sq Test
0,"(please, follow)","(peanut, butter)","(hobby, lobby)"
1,"(fall, asleep)","(hobby, lobby)","(teen, wolf)"
2,"(go, back)","(taco, bell)","(taco, bell)"
3,"(come, back)","(nash, nash)","(peanut, butter)"
4,"(good, friend)","(michael, clifford)","(ice, cream)"
5,"(happy, birthday)","(teen, wolf)","(lgg3ph, simply)"
6,"(even, know)","(ice, cream)","(luke, hemming)"
7,"(wanna, go)","(tornado, warning)","(fall, asleep)"
8,"(good, thing)","(social, medium)","(free, agency)"
9,"(really, want)","(social, network)","(social, medium)"


In [118]:
# Write the side-by-side comparison table to an Excel workbook.
# NOTE(review): the destination is an absolute, machine-specific path — confirm
# C:/data/690 exists wherever this notebook runs, or make the output directory configurable.
bigramsCompare.to_excel("C:/data/690/zhaohui_690_assignment1.xlsx")