In [1]:
import json 
import pandas as pd
import re
import nltk
from nltk.util import ngrams
from collections import Counter
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_json("./gg2015.json")

In [2]:
#Analyze common phrase occurences
'''
Input: A list of strings after parsing
Output: The most common grams containing 'best'
Remarks:
    1. Longest award phrase is 12 words
'''
def common_phrases(keywords):
    bigrams = []
    trigrams = []
    quadgrams = []
    multigrams = []
    megagrams = []
    for item in keywords:
        clean_string = re.sub('[^A-Za-z0-9]+', ' ', item)
        tokens = nltk.word_tokenize(clean_string)
        grams2 = ngrams(tokens, 2)
        grams3 = ngrams(tokens, 3)
        grams4 = ngrams(tokens, 4)
        grams9 = ngrams(tokens, 9)
        grams12 = ngrams(tokens, 12)
        for gram in grams2:
            bigrams.append(gram)
        for gram in grams3:
            trigrams.append(gram)
        for gram in grams4:
            quadgrams.append(gram)
        for gram in grams9:
            multigrams.append(gram)
        for gram in grams12:
            megagrams.append(gram)

    common2 = Counter(bigrams).most_common()
    common3 = Counter(trigrams).most_common()
    common4 = Counter(quadgrams).most_common()
    common9 = Counter(multigrams).most_common()
    common12 = Counter(megagrams).most_common()

    common2 = filter(lambda x: x[0][0] == "best", common2)
    common3 = filter(lambda x: x[0][0] == "best", common3)
    common4 = filter(lambda x: x[0][0] == "best", common4)
    common9 = filter(lambda x: x[0][0] == "best", common9)
    common12 = filter(lambda x: x[0][0] == "best", common12)

    common2 = list(common2)
    common3 = list(common3)
    common4 = list(common4)
    common9 = list(common9)
    common12 = list(common12)

    combined = common2 + common3 + common4 + common9 + common12
    return combined

In [3]:
'''
Input: List of strings
Output: Strings with its frequency
Remarks:
    1. Can also count common strings in strings for strategy2
'''
def most_common_beststring(strings):
    best_string = filter(lambda x: "best" in x, strings)
    best_string = list(best_string)
    return Counter(best_string).most_common()

In [4]:
#Strategy 1 scan after the word won:
df2 = df[df['text'].str.contains(" won ") | df['text'].str.contains(" Won ")]
tweets = df2['text'].tolist()
keywords = []
#regexp = re.compile(r'[!?.;#]+(?=$|\s)')
regexp = re.compile(r'[!?.;#,@:]')

stop_words = ['at','and','on','because','but','tonight','before','lol','since','i','I']

#Parse Keywords. All phrase after 'won best' will be captured.
for tweet in tweets:
    words = tweet.lower().split()
    index_won = words.index("won")
    if words[index_won + 1] == "best":
        keyword = "best "
        curr_index = index_won + 2
        while curr_index != len(words):
            if regexp.search(words[curr_index]) or words[curr_index] in stop_words:
                #keyword += words[curr_index]
                keyword = keyword.strip()
                break
            keyword += words[curr_index] + " "
            curr_index += 1
        keywords.append(keyword)

strategy1 = common_phrases(keywords)
print(strategy1)

[(('best', 'actress'), 243), (('best', 'actor'), 223), (('best', 'foreign'), 200), (('best', 'picture'), 174), (('best', 'drama'), 124), (('best', 'supporting'), 98), (('best', 'animated'), 95), (('best', 'tv'), 49), (('best', 'original'), 49), (('best', 'screenplay'), 45), (('best', 'motion'), 42), (('best', 'comedy'), 38), (('best', 'director'), 30), (('best', 'song'), 21), (('best', 'film'), 20), (('best', 'mini'), 13), (('best', 'score'), 11), (('best', 'movie'), 9), (('best', 'musical'), 7), (('best', 'miniseries'), 6), (('best', 'dressed'), 5), (('best', 'pic'), 5), (('best', 'dramatic'), 4), (('best', 'news'), 4), (('best', 'acceptance'), 3), (('best', 'animation'), 3), (('best', 'television'), 3), (('best', 'visible'), 3), (('best', 'comic'), 3), (('best', 'hipster'), 3), (('best', 'new'), 2), (('best', 'outfit'), 2), (('best', 'ass'), 2), (('best', 'prosthetic'), 2), (('best', 'best'), 2), (('best', 'series'), 2), (('best', 'look'), 1), (('best', 'supportive'), 1), (('best', '

In [5]:
#Strategy 2 scan before the word won:
df3 = df[df['text'].str.contains(" goes to ")]
tweets = df3['text'].tolist()
keywords2 = []
#Parse Backwards, stop at the word best
for tweet in tweets:
    words = nltk.word_tokenize(tweet.lower())
    index = words.index("to")
    if words[index - 1] == "goes":
        keyword = ""
        curr_index = index - 2
        while curr_index >= 0:
            if curr_index == 0 or words[curr_index] == "best":
                keyword = words[curr_index] + keyword
                break
            keyword = " " + words[curr_index] + keyword
            curr_index -= 1
        keywords2.append(keyword)

#print(keywords2)
strategy2 = common_phrases(keywords2)
print(strategy2)

most_common_beststring(keywords)

[(('best', 'actor'), 5058), (('best', 'actress'), 2502), (('best', 'supporting'), 1778), (('best', 'motion'), 1627), (('best', 'tv'), 1123), (('best', 'original'), 974), (('best', 'animated'), 752), (('best', 'director'), 671), (('best', 'screenplay'), 551), (('best', 'dressed'), 489), (('best', 'foreign'), 352), (('best', 'eyebrow'), 171), (('best', 'picture'), 159), (('best', 'mini'), 134), (('best', 'nickname'), 133), (('best', 'miniseries'), 129), (('best', 'speech'), 90), (('best', 'golden'), 76), (('best', 'comedy'), 72), (('best', 'drama'), 72), (('best', 'dance'), 70), (('best', 'television'), 33), (('best', 'husband'), 29), (('best', 'ignorance'), 29), (('best', 'acceptance'), 26), (('best', 'reason'), 22), (('best', 'performance'), 18), (('best', 'nomination'), 17), (('best', 'couple'), 14), (('best', 'cry'), 14), (('best', 'tbt'), 14), (('best', 'creator'), 13), (('best', 'necklace'), 13), (('best', 'purple'), 13), (('best', 'hugger'), 13), (('best', 'listener'), 13), (('bes

[('best', 298),
 ('best foreign film', 189),
 ('best drama', 117),
 ('best actor', 74),
 ('best actress in a comedy', 69),
 ('best picture', 45),
 ('best picture in a category that included', 44),
 ('best screenplay', 38),
 ('best actress in a drama for her role in "still alice"', 37),
 ('best picture - drama', 35),
 ('best animated', 34),
 ('best picture drama', 33),
 ('best actress', 29),
 ('best actor in a drama for', 27),
 ('best tv', 26),
 ('best actress in a tv series – comedy or', 23),
 ('best supporting actress', 22),
 ('best original', 22),
 ('best supporting', 21),
 ('best animated feature', 21),
 ('best animated film', 17),
 ('best director for', 17),
 ('best comedy', 16),
 ('best motion', 13),
 ('best film', 11),
 ('best mini series', 10),
 ('best score', 10),
 ('best actor for his role as educated', 10),
 ('best song', 9),
 ('best actor in a drama tv series', 9),
 ('best motion picture', 9),
 ('best original song', 8),
 ('best animated feature film', 7),
 ('best actor for'

In [6]:
common_strings = most_common_beststring(keywords2)

In [7]:
'''
Input: gram sets
Output: phrases with frequency
'''
def accumulate_votes(grams1, grams2, common_strings):
    awards = {}
    '''
    for gram in grams1:
        untokenize = ' '.join(gram[0])
        awards[untokenize] = len(gram[0]) * gram[1]
    for gram in grams2:
        untokenize = ' '.join(gram[0])
        if untokenize in awards:
            awards[untokenize] = awards[untokenize] * 2.5
        else:
            awards[untokenize] = len(gram[0]) * gram[1]
    '''
    for string in common_strings:
        first_two = ' '.join(nltk.word_tokenize(string[0])[:2])
        if string[0] in awards:
            awards[string[0]] = awards[string[0]] * 2.5
        elif first_two in awards:
            awards[string[0]] = awards[first_two] * string[1]
        else:
            awards[string[0]] = len(string[0]) * string[1]
    return sorted(awards.items(), key = lambda x: x[1], reverse = True)
votes = accumulate_votes(strategy1, strategy2, common_strings)
#Need a way to combine similar categories
print(len(votes))

1148


In [8]:
collocation_words = {
    #"tv":"television",
    "pic":"picture",
    "for":"-",
    "in":"-",
    'or':'/',
    'of':'-'
}

skip_words = ['a']

paraphrase = [',','@','(',')','#']

# Find a good format for award names.
def gram_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        for word in gram[0]:
            if word in collocation_words:
                word = collocation_words[word]
            if word in skip_words:
                break
            if word not in paraphrase:
                word_list.append(word)
        word_tuple = tuple(word_list)
        gram_tuple = (word_tuple,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

# Separate 'A/B' type of words into 'A / B' to add more information to resolute.
def sticky_word_string(phrase):
    phrase_list = nltk.word_tokenize(phrase)
    token_list = []
    for word in phrase_list:
        flag = 0
        for i,character in enumerate(word):
            if character == '/' and i != 0:
                token_list.append(word[0:i])
                token_list.append(word[i])
                flag = i
                if i != len(word)-1:
                    token_list.append(word[i+1:len(word)])
                break
        if flag == 0 and word not in skip_words:
            token_list.append(word)
    clean_string = ' '.join(token_list)
    return clean_string

# Replacing some collocation words.
def string_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        clean_gram = sticky_word_string(gram[0])
        temp_list = nltk.word_tokenize(clean_gram)
        for word in temp_list:
            if word in collocation_words:
                word = collocation_words[word]
            if word not in paraphrase:
                word = word.strip()
                word_list.append(word)
        phrase = ' '.join(word_list)
        gram_tuple = (phrase,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

#for vote in votes:
#print(strategy1)
new1 = gram_cleaning(strategy1)
new2 = gram_cleaning(strategy2)
new3 = string_cleaning(common_strings)
votes = accumulate_votes(new1, new2, new3)
#Need a way to combine similar categories
print(len(votes))
#votes

957


[('best director - motion picture', 36471093.75),
 ('best actor - motion picture drama', 20393371.58203125),
 ('best screenplay - motion picture', 14549414.0625),
 ('best director motion picture', 9904781.25),
 ('best actress - motion picture drama', 3417968.75),
 ('best actress - motion picture comedy / musical', 2246093.75),
 ('best actress - tv series drama', 1742343.75),
 ('best motion picture drama', 893554.6875),
 ('best screenplay motion picture', 790987.5),
 ('best supporting actor - motion picture', 612304.6875),
 ('best actor comedy / musical', 580078.125),
 ('best actor - motion picture comedy / musical', 326562.5),
 ('best actress - comedy', 274050.0),
 ('best original score - motion picture', 221484.375),
 ('best motion picture comedy / musical', 206718.75),
 ('best original song - motion picture', 175546.875),
 ('best actress - comedy / musical', 165375.0),
 ('best actress - tv series musical / comedy', 133890.625),
 ('best actor tv series drama', 116187.5),
 ('best actor

In [9]:
# Ignore paraphrases and merge the similar results. Get new votes.
words_pattern = '[a-zA-Z]+'
resolution_list = []
awards_dict = {}
for i,vote in enumerate(votes):
    words = re.findall(words_pattern, vote[0], flags=re.IGNORECASE)
    words = ' '.join(words)
    if words in awards_dict:
        awards_dict[words].append(i)
    else:
        awards_dict[words] = [i]
#pprint(awards_dict)

new_list = []
for key, val in awards_dict.items():
    if len(val) == 1:
        new_list.append(list(votes[val[0]]))
    else:
        sum = 0
        for i in val:
            sum+=votes[i][1]
        new_list.append([votes[val[0]][0],sum])
#new_list

[['best director - motion picture', 46380789.0],
 ['best actor - motion picture drama', 20444554.08203125],
 ['best screenplay - motion picture', 15340431.5625],
 ['best actress - motion picture drama', 3456713.75],
 ['best actress - motion picture comedy / musical', 2328686.75],
 ['best actress - tv series drama', 1754104.75],
 ['best motion picture drama', 898215.1875],
 ['best supporting actor - motion picture', 612304.6875],
 ['best actor comedy / musical', 618580.625],
 ['best actor - motion picture comedy / musical', 332764.5],
 ['best actress - comedy', 274050.0],
 ['best original score - motion picture', 221773.375],
 ['best motion picture comedy / musical', 207516.75],
 ['best original song - motion picture', 177136.875],
 ['best actress - comedy / musical', 209223.0],
 ['best actress - tv series musical / comedy', 136158.625],
 ['best actor tv series drama', 240073.0],
 ['best screenplay award', 108135],
 ['best actor drama', 109914.0],
 ['best actress drama', 105084.0],
 ['b

In [10]:
'''
TF-IDF computation. Get a weighted word vector representation.
'''
pd. set_option('display.max_columns', None)
pd. set_option('display.max_rows', None)

corpus = []
for item in new_list:
    corpus.append(item[0][1:])

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)

# Add more importance to some key words (can get from user inputs) which most distinguish different award names.
keyword_list = ['actress','supporting','actor','director','drama','musical','television','comedy','tv']
for keyword in keyword_list:
    tfidf[keyword] = tfidf[keyword]*2
#tfidf.iloc[0]

In [11]:
# Compute pair similarity between each word vectors.
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

# Larger threshold means merge more strictly.
threshold = 0.9
final_list = []
cs = cosine_similarity(tfidf,tfidf)
#ans = pairwise_distances(tfidf,tfidf,'chebyshev')
index_list = []
for v,score_row in enumerate(cs):
    similarity = []
    merge_index = [v]
    for i, score in enumerate(score_row):
        if score > threshold and v!=i:
            similarity.append([new_list[v],score,new_list[i],v,i])
            merge_index.append(i)
    final_list.append(similarity)
    index_list.append(merge_index)
print('The merging result:')
#pprint(final_list)
#index_list

The merging result:


In [12]:
'''
Similarity Graph merging
Find the similar clusters by recursively merging the similar sentences.
Use the highest score sentence as its cluster name.
'''


def merge_index(curr_index,original_list,curr_cluster):
    for index in original_list[curr_index]:
        if index not in curr_cluster:
            curr_cluster.append(index)
            merge_index(index,original_list,curr_cluster)
    return curr_cluster

cluster_list = []
access = []
for v,il in enumerate(index_list):
    if v not in access:
        cl = merge_index(v,index_list,[])
        for i in cl:
            access.append(i)
        cluster_list.append(cl)

final_result = []
for cluster in cluster_list:
    sum_s = 0
    for item in cluster:
        sum_s += new_list[item][1]
    item_list = [new_list[cluster[0]][0],sum_s]
    final_result.append(item_list)


def sort_score(e):
    return e[1]
final_result.sort(reverse=True,key=sort_score)

#pprint(final_result)
print(len(final_result))




568


In [13]:
# Currently use 40 as its possible awards number. Can let user input awards number and double it.
final_awards_name = final_result[:40]
pprint(final_awards_name)

[['best director - motion picture', 46444699.0],
 ['best actor - motion picture drama', 20584448.08203125],
 ['best screenplay - motion picture', 15340521.5625],
 ['best actress - motion picture drama', 3579185.75],
 ['best actress - motion picture comedy / musical', 2802760.375],
 ['best actress - tv series drama', 1797820.25],
 ['best actor comedy / musical', 1073631.5],
 ['best motion picture drama', 930911.4375],
 ['best supporting actor - motion picture', 615999.6875],
 ['best motion picture comedy / musical', 314756.75],
 ['best actress - comedy', 308826.0],
 ['best actor tv series drama', 300044.0],
 ['best original score - motion picture', 227244.875],
 ['best original song - motion picture', 177294.875],
 ['best screenplay award', 108135],
 ['best supporting actress - tv movie series / miniseries', 85489.25],
 ['best tv series comedy / musical', 74368.0625],
 ['best tv series drama', 51964.5],
 ['best actress - miniseries / tv movie', 41670.0],
 ['best actor - tv comedy', 3861

In [14]:
awards_token_dict = {}
for i, awards_name in enumerate(final_awards_name):
    #clean_string = re.sub('[^A-Za-z0-9]+', ' ', awards_name[0])
    clean_string = awards_name[0]
    tokens = nltk.word_tokenize(clean_string)
    awards_token_dict[awards_name[0]] = [tokens]
    if '/' in awards_name[0]:
        dirty_tokens = nltk.word_tokenize(awards_name[0])
        i = dirty_tokens.index('/')
        stickytoken = dirty_tokens[i-1] + '/' + dirty_tokens[i+1]
        tokens1 = tokens
        tokens1.remove(dirty_tokens[i-1])
        tokens1.remove(dirty_tokens[i+1])
        tokens1.append(stickytoken)
        awards_token_dict[awards_name[0]].append(tokens1)

#print(awards_token_dict)

In [19]:
def extract_people_names(store,text_list,awards_name,awards_token):
    for tweet in text_list:
        for award_token in awards_token:
            if all(token in tweet.lower() for token in award_token):
                if awards_name in store:
                    store[awards_name].append(tweet)
                else:
                    store[awards_name] = [tweet]
        if  awards_name in store.keys() and len(store[awards_name]) > 500:
            break
    return store

def construct_regex(num):
    final = ''
    for i in range(num):
        substr = '[A-Z][a-z]* ?'
        final += substr
    final1 = '"'+final+'"'
    return final1

def extract_movie_song(store,text_list,awards_name,awards_token,num):
    for tweet in text_list:
        for award_token in awards_token:
            if all(token in tweet.lower() for token in award_token):
                for i in range(1,num):
                    regexp = construct_regex(i)
                    x = re.findall(regexp,tweet)
                    if len(x) != 0:
                        if awards_name in store:
                            store[awards_name]+=x
                        else:
                            store[awards_name] = x
        if  awards_name in store.keys() and len(store[awards_name]) > 500:
            break
    return store





In [20]:
store = dict()
df = pd.read_json("./gg2015.json")['text']
occupation_words = ["actor","director","actress","singer","scientist"]
for awards_name, awards_token in awards_token_dict.items():
    if any(occupation in awards_name for occupation in occupation_words):
        store = extract_people_names(store,df,awards_name,awards_token)
        # Award for movie/songs
    else:
        store = extract_movie_song(store,df,awards_name,awards_token,5)


    #print(store)
pprint(store)




{'best actor - comedy series': ['Best Actor in a TV Series, Musical or Comedy '
                                "- Guessing this'll be William H. Macy. Kinda "
                                "wish it'd go to Don Cheadle though. "
                                '#GoldenGlobes',
                                'Jane Fonda &amp; Lily Tomlin present the '
                                'nominees for Best Actor in a TV Series, '
                                'Comedy or Musical. :-) #GoldenGlobes',
                                'Best Actor in a Comedy or Mini-Series: Jeffry '
                                'Tambor for Transparent!  #GoldenGlobes',
                                'Best Actor in a Television Series - Comedy or '
                                'Musical - Jeffrey Tambor (@jeffreytambor) - '
                                'Transparent (@transparent_tv) - #GoldenGlobes',
                                'BEST ACTOR IN A TV SERIES, COMEDY OR MUSICAL '
                     

In [21]:
from nltk.tag.stanford import StanfordNERTagger
import string

#st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz','stanford-ner/stanford-ner.jar')
punc = string.punctuation
names = nltk.corpus.names.words()

results = {}
for awards_name,awards_tweets in store.items():
    result = []
    if any(occupation in awards_name for occupation in occupation_words):
        for awards_tweet in awards_tweets:
            tokens = nltk.word_tokenize(awards_tweet)
            tl = len(tokens)
            for i in range(0,tl-1):
                if tokens[i] in names:
                    result.append(tokens[i]+' '+tokens[i+1])
    else:
        result = store[awards_name]
    results[awards_name] = (Counter(result).most_common()[:3])
    #print(result)
#pprint(results)

In [22]:
for award in final_awards_name:
    if award[0] in results.keys():
        print(award[0],results[award[0]])



best director - motion picture [('Richard Linklater', 472), ('Harrison Ford', 1), ('Ford presents', 1)]
best actor - motion picture drama [('Eddie Redmanye', 401), ('Eddie Redmayne', 79), ('Michael Keaton', 2)]
best screenplay - motion picture [('"Birdman"', 5), ('"The Grand Budapest Hotel"', 2)]
best actress - motion picture drama [('Julianne Moore', 475), ('Moore (', 440), ('Alice -', 428)]
best actress - motion picture comedy / musical [('Amy Adams', 22), ('Adams for', 6), ('Adams wins', 6)]
best actress - tv series drama [('Ruth Wilson', 129), ('Wilson for', 50), ('Wilson wins', 26)]
best actor comedy / musical [('Jeffrey Tambor', 244), ('Michael Keaton', 186), ('Amy Adams', 8)]
best motion picture drama [('"Boyhood"', 300), ('"Still Alice"', 110), ('"Selma"', 5)]
best supporting actor - motion picture [('Robert Duvall', 1)]
best motion picture comedy / musical [('"Big Eyes"', 16), ('"Birdman"', 4), ('"The Grand Budapest Hotel"', 2)]
best actress - comedy [('Gina Rodriguez', 457), 