In [57]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import string
import math
import re
from collections import Counter

In [58]:
data = pd.read_json('gg2013.json')

In [59]:
data_df = pd.DataFrame(data)

In [60]:
print(data.head(50))

                    id                                               text  \
0   290620657987887104             JLo's dress! #eredcarpet #GoldenGlobes   
1   290620657887219713  What's making Sofia Vergara's boobs stay like ...   
2   290620657828524032  RT @FabSugar: Kerry Washington is EVERYTHING. ...   
3   290620657799159809     Anne Hathaway has got me living. #GoldenGlobes   
4   290620657778188288  Jennifer Lopez's lace dress? Thoughts? #Golden...   
5   290620657719455745  Podrán criticar a #Adele de su moda y su maniq...   
6   290620657715273728                           US, Weakly #GoldenGlobes   
7   290620657715253248  RT @BillMc7: "Wait. What's that smell?!" (ever...   
8   290620657706872832        Hugh Jackman is so awesome!!! #goldenglobes   
9   290620657560084480  It was awkward. RT @hollywoodhwife: They cut t...   
10  290620657551671297                    hellen mirren *O* #goldenglobes   
11  290620657526505475  Jennifer Lopez's dress is jaw droppingly amazi...   

In [61]:
# return cleaned Tweet as string
# remove stopwords, user handles, punctuation, urls

def cleanTweets(tweet):
    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

    punctuation = list(string.punctuation)
    
    # strip stopwords, punctuation, url components 
    stop = stopwords.words('english') + punctuation + ['t.co', 'http', 'https', '...', '..', ':\\', 'RT', '#']

    strip_nums = re.sub("\d+", "", tweet)
    tokenized = tt.tokenize(strip_nums)
    terms_stop = [term for term in tokenized if term not in stop]
    cleaned = [term for term in terms_stop]
    cleaned = ' '.join(cleaned)
    
    return cleaned

In [64]:
data['text'] = data['text'].map(cleanTweets)

In [65]:
print(data['text'].head())

0                JLo's dress #eredcarpet #GoldenGlobes
1    What's making Sofia Vergara's boobs stay like ...
2    Kerry Washington EVERYTHING Dying Miu Miu gown...
3               Anne Hathaway got living #GoldenGlobes
4    Jennifer Lopez's lace dress Thoughts #GoldenGl...
Name: text, dtype: object


In [73]:
# filter only tweets that include terms in include_terms, do not include terms in remove_terms

include_terms = ['host', 'hosted', 'hosting', 'hosts']
remove_terms = ['next year']
host = [];

for tweet in data['text']:
    if any(term in tweet for term in include_terms) and any(term not in tweet for term in remove_terms):
        host.append(tweet)
    

In [74]:
print(host)

['Looking forward watching Tina Fey Amy Poehler host #GoldenGlobes', "It's hosts Tina Fey Amy Poehler #goldenglobes #redcarpet http://t.co/lqCocQ", "Tonight's dual hosting duties represent culmination decade Amy Tina partnerships http://t.co/scSThrn #GoldenGlobes", 'My green suede tuxedo pinching bit Velvet Rope Awards honoring best crowd control Topo Gigio I hosting #GoldenGlobes', 'Best choice host ever Nice job GG people', '“ We ’ going keep things loose ” said Amy Poehler co-host Tina Fey ’ plan evening #GoldenGlobes http://t.co/ogLGq', '#GoldenGlobes hosts Tina Fey Amy Poehler show matching husband wife outfits red carpet http://t.co/XFCjRj', '#GoldenGlobes hosts Tina Fey Amy Poehler show matching husband wife outfits red carpet http://t.co/XFCjRj', "Tonight's dual hosting duties represent culmination decade Amy Tina partnerships http://t.co/scSThrn #GoldenGlobes", "If Red Carpet hosts asked one woman they're wearing i'm going lose mind #goldenglobes", "Tonight's dual hosting duti

In [75]:
# find bigrams 

bgrams = [];

for tweet in host:
    bgrams += list(nltk.bigrams(tweet.split()))

In [76]:
print(bgrams)

[('Looking', 'forward'), ('forward', 'watching'), ('watching', 'Tina'), ('Tina', 'Fey'), ('Fey', 'Amy'), ('Amy', 'Poehler'), ('Poehler', 'host'), ('host', '#GoldenGlobes'), ("It's", 'hosts'), ('hosts', 'Tina'), ('Tina', 'Fey'), ('Fey', 'Amy'), ('Amy', 'Poehler'), ('Poehler', '#goldenglobes'), ('#goldenglobes', '#redcarpet'), ('#redcarpet', 'http://t.co/lqCocQ'), ("Tonight's", 'dual'), ('dual', 'hosting'), ('hosting', 'duties'), ('duties', 'represent'), ('represent', 'culmination'), ('culmination', 'decade'), ('decade', 'Amy'), ('Amy', 'Tina'), ('Tina', 'partnerships'), ('partnerships', 'http://t.co/scSThrn'), ('http://t.co/scSThrn', '#GoldenGlobes'), ('My', 'green'), ('green', 'suede'), ('suede', 'tuxedo'), ('tuxedo', 'pinching'), ('pinching', 'bit'), ('bit', 'Velvet'), ('Velvet', 'Rope'), ('Rope', 'Awards'), ('Awards', 'honoring'), ('honoring', 'best'), ('best', 'crowd'), ('crowd', 'control'), ('control', 'Topo'), ('Topo', 'Gigio'), ('Gigio', 'I'), ('I', 'hosting'), ('hosting', '#Gold

In [77]:
# find frequency distribution of bigrams

fdist = nltk.FreqDist(bgrams)

In [79]:
print(fdist.most_common(10))

[(('Tina', 'Fey'), 585), (('Amy', 'Poehler'), 568), (('Fey', 'Amy'), 421), (('Tina', 'Amy'), 360), (('Golden', 'Globes'), 355), (('host', 'everything'), 167), (('Poehler', 'host'), 141), (('Poehler', 'Tina'), 132), (('Amy', 'Tina'), 128), (('Amy', 'host'), 120)]


In [None]:
upper_words = []
for tweet in host:
    for word in tweet:
        hold = word.lower()
        if hold != word and word[0] != '#':
            upper_words.append(word)

In [None]:
print(upper_words)