In [65]:
import json 
import pandas as pd
import re
import nltk
from nltk.util import ngrams
from collections import Counter

df = pd.read_json("./gg2013.json")

In [66]:
#Analyze common phrase occurences
'''
Input: A list of strings after parsing
Output: The most common grams containing 'best'
Remarks:
    1. Longest award phrase is 12 words
'''
def common_phrases(keywords):
    bigrams = []
    trigrams = []
    quadgrams = []
    multigrams = []
    megagrams = []
    for item in keywords:
        clean_string = re.sub('[^A-Za-z0-9]+', ' ', item)
        tokens = nltk.word_tokenize(clean_string)
        grams2 = ngrams(tokens, 2)
        grams3 = ngrams(tokens, 3)
        grams4 = ngrams(tokens, 4)
        grams9 = ngrams(tokens, 9)
        grams12 = ngrams(tokens, 12)
        for gram in grams2:
            bigrams.append(gram)
        for gram in grams3:
            trigrams.append(gram)
        for gram in grams4:
            quadgrams.append(gram)
        for gram in grams9:
            multigrams.append(gram)
        for gram in grams12:
            megagrams.append(gram)

    common2 = Counter(bigrams).most_common()
    common3 = Counter(trigrams).most_common()
    common4 = Counter(quadgrams).most_common()
    common9 = Counter(multigrams).most_common()
    common12 = Counter(megagrams).most_common()

    common2 = filter(lambda x: x[0][0] == "best", common2)
    common3 = filter(lambda x: x[0][0] == "best", common3)
    common4 = filter(lambda x: x[0][0] == "best", common4)
    common9 = filter(lambda x: x[0][0] == "best", common9)
    common12 = filter(lambda x: x[0][0] == "best", common12)

    common2 = list(common2)
    common3 = list(common3)
    common4 = list(common4)
    common9 = list(common9)
    common12 = list(common12)

    combined = common2 + common3 + common4 + common9 + common12
    return combined

In [67]:
'''
Input: List of strings
Output: Strings with its frequency
Remarks:
    1. Can also count common strings in strings for strategy2
'''
def most_common_beststring(strings):
    best_string = filter(lambda x: "best" in x, strings)
    best_string = list(best_string)
    return Counter(best_string).most_common()

In [68]:
#Strategy 1 scan after the word won:
df2 = df[df['text'].str.contains(" won ") | df['text'].str.contains(" Won ")]
tweets = df2['text'].tolist()
keywords = []
#regexp = re.compile(r'[!?.;#]+(?=$|\s)')
regexp = re.compile(r'[!?.;#,@]')

stop_words = ['for','at','in','and','because','but','tonight','before','lol']

#Parse Keywords. All phrase after 'won best' will be captured.
for tweet in tweets:
    words = tweet.lower().split()
    index_won = words.index("won")
    if words[index_won + 1] == "best":
        keyword = "best "
        curr_index = index_won + 2
        while curr_index != len(words):
            if regexp.search(words[curr_index]) or words[curr_index] in stop_words:
                #keyword += words[curr_index]
                keyword = keyword.strip()
                break
            keyword += words[curr_index] + " "
            curr_index += 1
        keywords.append(keyword)

strategy1 = common_phrases(keywords)
print(strategy1)

[(('best', 'director'), 41), (('best', 'supporting'), 37), (('best', 'original'), 37), (('best', 'actress'), 28), (('best', 'actor'), 22), (('best', 'picture'), 14), (('best', 'screenplay'), 12), (('best', 'motion'), 11), (('best', 'tv'), 9), (('best', 'performance'), 8), (('best', 'animated'), 8), (('best', 'song'), 7), (('best', 'foreign'), 7), (('best', 'movie'), 5), (('best', 'drama'), 4), (('best', 'comedy'), 4), (('best', 'dressed'), 3), (('best', 'skin'), 1), (('best', 'blurry'), 1), (('best', 'life'), 1), (('best', 'anal'), 1), (('best', 'husband'), 1), (('best', 'film'), 1), (('best', 'pic'), 1), (('best', 'original', 'song'), 23), (('best', 'supporting', 'actor'), 11), (('best', 'supporting', 'actress'), 11), (('best', 'motion', 'picture'), 10), (('best', 'performance', 'by'), 8), (('best', 'animated', 'feature'), 5), (('best', 'foreign', 'film'), 3), (('best', 'tv', 'series'), 3), (('best', 'comedy', 'or'), 2), (('best', 'actress', 'on'), 1), (('best', 'actor', 'on'), 1), ((

In [69]:
#Strategy 2 scan before the word won:
df3 = df[df['text'].str.contains(" goes to ")]
tweets = df3['text'].tolist()
keywords2 = []
#Parse Backwards, stop at the word best
for tweet in tweets:
    words = tweet.lower().split()
    index = words.index("to")
    if words[index - 1] == "goes":
        keyword = ""
        curr_index = index - 2
        while curr_index >= 0:
            if curr_index == 0 or words[curr_index] == "best":
                keyword = words[curr_index] + keyword
                break
            keyword = " " + words[curr_index] + keyword
            curr_index -= 1
        keywords2.append(keyword)

#print(keywords2)
strategy2 = common_phrases(keywords2)
print(strategy2)

most_common_beststring(keywords)

[(('best', 'actress'), 148), (('best', 'supporting'), 142), (('best', 'motion'), 115), (('best', 'actor'), 96), (('best', 'original'), 39), (('best', 'screenplay'), 36), (('best', 'director'), 36), (('best', 'tv'), 33), (('best', 'animated'), 17), (('best', 'foreign'), 15), (('best', 'speech'), 13), (('best', 'comedy'), 9), (('best', 'drama'), 8), (('best', 'performance'), 8), (('best', 'picture'), 8), (('best', 'miniseries'), 7), (('best', 'mini'), 5), (('best', 'movie'), 4), (('best', 'television'), 4), (('best', 'dressed'), 3), (('best', 'goldenglobes'), 3), (('best', 'editing'), 3), (('best', 'sound'), 2), (('best', 'use'), 2), (('best', 'musical'), 2), (('best', 'gay'), 2), (('best', 'presenters'), 2), (('best', 'tan'), 2), (('best', 'hair'), 2), (('best', 'humans'), 1), (('best', 'imitation'), 1), (('best', 'political'), 1), (('best', 'awkward'), 1), (('best', 'originalscore'), 1), (('best', 'painted'), 1), (('best', 'song'), 1), (('best', 'swallowed'), 1), (('best', 'acceptance'

[('best', 90),
 ('best director', 40),
 ('best actress', 26),
 ('best original song', 22),
 ('best actor', 20),
 ('best supporting', 14),
 ('best original', 12),
 ('best supporting actor', 11),
 ('best supporting actress', 10),
 ('best screenplay', 10),
 ('best picture', 10),
 ('best song', 7),
 ('best performance by an actress', 6),
 ('best motion picture', 5),
 ('best drama', 4),
 ('best animated feature', 4),
 ('best movie', 4),
 ('best tv', 3),
 ('best foreign film', 3),
 ('best foreign', 3),
 ('best dressed', 2),
 ('best performance by an actor', 2),
 ('best comedy or musical', 2),
 ('best motion picture -', 2),
 ('best actress on mini', 1),
 ('best actor on a drama', 1),
 ('best tv drama', 1),
 ('best original score', 1),
 ('best skin tight', 1),
 ('best motion picture drama', 1),
 ('best supporting actress😁✊', 1),
 ('best original song motion picture', 1),
 ('best supporting performance', 1),
 ('best screenplay -', 1),
 ('best screenplay/motion picture', 1),
 ('best original scr

In [70]:
most_common_beststring(keywords2)


[('best motion picture drama', 93),
 ('best supporting actor, motion picture', 48),
 ('best actress in a tv comedy or musical', 29),
 ('best screenplay', 26),
 ('best supporting actress tv series, miniseries, or tv movie', 22),
 ('best original song', 18),
 ('best actor in a miniseries/tv movie', 17),
 ('best actress', 16),
 ('best director', 15),
 ('best actor tv series - comedy or musical', 14),
 ('best actress in a mini-series/tv movie', 13),
 ('best actress in a motion picture drama', 13),
 ('best supporting actress in a motion picture', 12),
 ('best actor in a motion picture drama', 12),
 ('best supporting actress', 10),
 ('best director for motion picture', 10),
 ('best foreign film', 9),
 ('best animated feature film', 9),
 ('best supporting actor', 8),
 ('best director - motion picture', 8),
 ('best original score', 7),
 ('best supporting actor in a tv show, miniseries or tv movie', 7),
 ('best actress in a tv series, drama,', 7),
 ('best tv comedy/musical', 7),
 ('best miniser