In [314]:
import json 
import pandas as pd
import re
import nltk
from nltk.util import ngrams
from collections import Counter
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer

data_path = "./gg2015.json"
df = pd.read_json(data_path)

In [315]:
#Analyze common phrase occurences
'''
Input: A list of strings after parsing
Output: The most common grams containing 'best'
Remarks:
    1. Longest award phrase is 12 words
'''
def common_phrases(keywords):
    bigrams = []
    trigrams = []
    quadgrams = []
    multigrams = []
    megagrams = []
    for item in keywords:
        clean_string = re.sub('[^A-Za-z0-9]+', ' ', item)
        tokens = nltk.word_tokenize(clean_string)
        grams2 = ngrams(tokens, 2)
        grams3 = ngrams(tokens, 3)
        grams4 = ngrams(tokens, 4)
        grams9 = ngrams(tokens, 9)
        grams12 = ngrams(tokens, 12)
        for gram in grams2:
            bigrams.append(gram)
        for gram in grams3:
            trigrams.append(gram)
        for gram in grams4:
            quadgrams.append(gram)
        for gram in grams9:
            multigrams.append(gram)
        for gram in grams12:
            megagrams.append(gram)

    common2 = Counter(bigrams).most_common()
    common3 = Counter(trigrams).most_common()
    common4 = Counter(quadgrams).most_common()
    common9 = Counter(multigrams).most_common()
    common12 = Counter(megagrams).most_common()

    common2 = filter(lambda x: x[0][0] == "best", common2)
    common3 = filter(lambda x: x[0][0] == "best", common3)
    common4 = filter(lambda x: x[0][0] == "best", common4)
    common9 = filter(lambda x: x[0][0] == "best", common9)
    common12 = filter(lambda x: x[0][0] == "best", common12)

    common2 = list(common2)
    common3 = list(common3)
    common4 = list(common4)
    common9 = list(common9)
    common12 = list(common12)

    combined = common2 + common3 + common4 + common9 + common12
    return combined

In [316]:
'''
Input: List of strings
Output: Strings with its frequency
Remarks:
    1. Can also count common strings in strings for strategy2
'''
def most_common_beststring(strings):
    best_string = filter(lambda x: "best" in x, strings)
    best_string = list(best_string)
    return Counter(best_string).most_common()

In [317]:
#Strategy 1 scan after the word won:
df2 = df[df['text'].str.contains(" won ") | df['text'].str.contains(" Won ")]
tweets = df2['text'].tolist()
keywords = []
#regexp = re.compile(r'[!?.;#]+(?=$|\s)')
regexp = re.compile(r'[!?.;#,@:]')

stop_words = ['at','and','on','because','but','tonight','before','lol','since','i','I']

#Parse Keywords. All phrase after 'won best' will be captured.
for tweet in tweets:
    words = tweet.lower().split()
    index_won = words.index("won")
    if words[index_won + 1] == "best":
        keyword = "best "
        curr_index = index_won + 2
        while curr_index != len(words):
            if regexp.search(words[curr_index]) or words[curr_index] in stop_words:
                #keyword += words[curr_index]
                keyword = keyword.strip()
                break
            keyword += words[curr_index] + " "
            curr_index += 1
        keywords.append(keyword)

strategy1 = common_phrases(keywords)
#print(strategy1)

In [318]:
#Strategy 2 scan before the word won:
df3 = df[df['text'].str.contains(" goes to ")]
tweets = df3['text'].tolist()
keywords2 = []
#Parse Backwards, stop at the word best
for tweet in tweets:
    words = nltk.word_tokenize(tweet.lower())
    index = words.index("to")
    if words[index - 1] == "goes":
        keyword = ""
        curr_index = index - 2
        while curr_index >= 0:
            if curr_index == 0 or words[curr_index] == "best":
                keyword = words[curr_index] + keyword
                break
            keyword = " " + words[curr_index] + keyword
            curr_index -= 1
        keywords2.append(keyword)

#print(keywords2)
strategy2 = common_phrases(keywords2)
#print(strategy2)

#most_common_beststring(keywords)

In [319]:
common_strings = most_common_beststring(keywords2)

In [320]:
'''
Input: gram sets
Output: phrases with frequency
'''
def accumulate_votes(grams1, grams2, common_strings):
    awards = {}
    '''
    for gram in grams1:
        untokenize = ' '.join(gram[0])
        awards[untokenize] = len(gram[0]) * gram[1]
    for gram in grams2:
        untokenize = ' '.join(gram[0])
        if untokenize in awards:
            awards[untokenize] = awards[untokenize] * 2.5
        else:
            awards[untokenize] = len(gram[0]) * gram[1]
    '''
    for string in common_strings:
        first_two = ' '.join(nltk.word_tokenize(string[0])[:2])
        if string[0] in awards:
            awards[string[0]] = awards[string[0]] * 2.5
        elif first_two in awards:
            awards[string[0]] = awards[first_two] * string[1]
        else:
            awards[string[0]] = len(string[0]) * string[1]
    return sorted(awards.items(), key = lambda x: x[1], reverse = True)
votes = accumulate_votes(strategy1, strategy2, common_strings)
#Need a way to combine similar categories
print(len(votes))

1148


In [321]:
collocation_words = {
    #"tv":"television",
    "pic":"picture",
    "for":"-",
    "in":"-",
    'or':'/',
    'of':'-'
}

skip_words = ['a']

paraphrase = [',','@','(',')','#']

# Find a good format for award names.
def gram_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        for word in gram[0]:
            if word in collocation_words:
                word = collocation_words[word]
            if word in skip_words:
                break
            if word not in paraphrase:
                word_list.append(word)
        word_tuple = tuple(word_list)
        gram_tuple = (word_tuple,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

# Separate 'A/B' type of words into 'A / B' to add more information to resolute.
def sticky_word_string(phrase):
    phrase_list = nltk.word_tokenize(phrase)
    token_list = []
    for word in phrase_list:
        flag = 0
        for i,character in enumerate(word):
            if character == '/' and i != 0:
                token_list.append(word[0:i])
                token_list.append(word[i])
                flag = i
                if i != len(word)-1:
                    token_list.append(word[i+1:len(word)])
                break
        if flag == 0 and word not in skip_words:
            token_list.append(word)
    clean_string = ' '.join(token_list)
    return clean_string

# Replacing some collocation words.
def string_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        clean_gram = sticky_word_string(gram[0])
        temp_list = nltk.word_tokenize(clean_gram)
        for word in temp_list:
            if word in collocation_words:
                word = collocation_words[word]
            if word not in paraphrase:
                word = word.strip()
                word_list.append(word)
        phrase = ' '.join(word_list)
        gram_tuple = (phrase,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

#for vote in votes:
#print(strategy1)
new1 = gram_cleaning(strategy1)
new2 = gram_cleaning(strategy2)
new3 = string_cleaning(common_strings)
votes = accumulate_votes(new1, new2, new3)
#Need a way to combine similar categories
print(len(votes))
#votes

957


In [322]:
# Ignore paraphrases and merge the similar results. Get new votes.
words_pattern = '[a-zA-Z]+'
resolution_list = []
awards_dict = {}
for i,vote in enumerate(votes):
    words = re.findall(words_pattern, vote[0], flags=re.IGNORECASE)
    words = ' '.join(words)
    if words in awards_dict:
        awards_dict[words].append(i)
    else:
        awards_dict[words] = [i]
#pprint(awards_dict)

new_list = []
for key, val in awards_dict.items():
    if len(val) == 1:
        new_list.append(list(votes[val[0]]))
    else:
        sum = 0
        for i in val:
            sum+=votes[i][1]
        new_list.append([votes[val[0]][0],sum])
#new_list

In [323]:
'''
TF-IDF computation. Get a weighted word vector representation.
'''
pd. set_option('display.max_columns', None)
pd. set_option('display.max_rows', None)

corpus = []
for item in new_list:
    corpus.append(item[0][1:])

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)

# Add more importance to some key words (can get from user inputs) which most distinguish different award names.
keyword_list = ['actress','supporting','actor','director','drama','musical','television','comedy','tv']
for keyword in keyword_list:
    tfidf[keyword] = tfidf[keyword]*2
#tfidf.iloc[0]

In [324]:
# Compute pair similarity between each word vectors.
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

# Larger threshold means merge more strictly.
threshold = 0.9
final_list = []
cs = cosine_similarity(tfidf,tfidf)
#ans = pairwise_distances(tfidf,tfidf,'chebyshev')
index_list = []
for v,score_row in enumerate(cs):
    similarity = []
    merge_index = [v]
    for i, score in enumerate(score_row):
        if score > threshold and v!=i:
            similarity.append([new_list[v],score,new_list[i],v,i])
            merge_index.append(i)
    final_list.append(similarity)
    index_list.append(merge_index)
print('The merging result:')
#pprint(final_list)
#index_list

The merging result:


In [325]:
'''
Similarity Graph merging
Find the similar clusters by recursively merging the similar sentences.
Use the highest score sentence as its cluster name.
'''


def merge_index(curr_index,original_list,curr_cluster):
    for index in original_list[curr_index]:
        if index not in curr_cluster:
            curr_cluster.append(index)
            merge_index(index,original_list,curr_cluster)
    return curr_cluster

cluster_list = []
access = []
for v,il in enumerate(index_list):
    if v not in access:
        cl = merge_index(v,index_list,[])
        for i in cl:
            access.append(i)
        cluster_list.append(cl)

final_result = []
for cluster in cluster_list:
    sum_s = 0
    for item in cluster:
        sum_s += new_list[item][1]
    item_list = [new_list[cluster[0]][0],sum_s]
    final_result.append(item_list)


def sort_score(e):
    return e[1]
final_result.sort(reverse=True,key=sort_score)

#pprint(final_result)
print(len(final_result))




568


In [326]:
# Currently use 40 as its possible awards number. Can let user input awards number and double it.
final_awards_name = final_result[:40]
#pprint(final_awards_name)

In [327]:
awards_token_dict = {}
removable_tokens = {'mini','-','/','feature'}
for i, awards_name in enumerate(final_awards_name):
    #clean_string = re.sub('[^A-Za-z0-9]+', ' ', awards_name[0])
    clean_string = awards_name[0]
    tokens = nltk.word_tokenize(clean_string)
    for rt in removable_tokens:
        if rt in tokens:
            tokens.remove(rt)
    awards_token_dict[awards_name[0]] = [tokens]
    #print(tokens)
    if ' / ' in awards_name[0]:
        dirty_tokens = nltk.word_tokenize(awards_name[0])
        i = dirty_tokens.index('/')
        stickytoken = dirty_tokens[i-1] + '/' + dirty_tokens[i+1]
        tokens1 = tokens[:]
        #tokens1.remove('/')
        s1 = dirty_tokens[i-1]
        s2 = dirty_tokens[i+1]
        if s1 in tokens1:
            tokens1.remove(s1)
        if s2 in tokens1:
            tokens1.remove(s2)
        tokens1.append(stickytoken)
        awards_token_dict[awards_name[0]].append(tokens1)
        

pprint(awards_token_dict)

{'best actor - comedy series': [['best', 'actor', 'comedy', 'series']],
 'best actor - mini-series / tv movie': [['best',
                                          'actor',
                                          'mini-series',
                                          'tv',
                                          'movie'],
                                         ['best',
                                          'actor',
                                          'movie',
                                          'mini-series/tv']],
 'best actor - miniseries / tv movie': [['best',
                                         'actor',
                                         'miniseries',
                                         'tv',
                                         'movie'],
                                        ['best',
                                         'actor',
                                         'movie',
                                         'miniseries/tv

In [328]:
def extract_people_names(store,text_list,awards_name,awards_token,tweet_store):
    for tid, tweet in enumerate(text_list):
        for award_token in awards_token:
            if all(token in tweet.lower() for token in award_token):
                if awards_name in store:
                    store[awards_name].append(tweet)
                    tweet_store[awards_name].append([tid,tweet])
                else:
                    store[awards_name] = [tweet]
                    tweet_store[awards_name]=[[tid,tweet]]
        if  awards_name in store.keys() and len(store[awards_name]) > 500:
            break
    return [store, tweet_store]

def construct_regex(num):
    final = ''
    for i in range(num):
        substr = '[A-Z][a-z]* ?'
        final += substr
    final1 = '"'+final+'"'
    return final1

def extract_movie_song(store,text_list,awards_name,awards_token,num,tweet_store):
    occupation_words = ["actor","director","actress","singer","scientist"]
    for tid, tweet in enumerate(text_list):
        for award_token in awards_token:
            if all(token in tweet.lower() for token in award_token) and not any(occupation in tweet.lower() for occupation in occupation_words):
                for i in range(1,num):
                    regexp = construct_regex(i)
                    x = re.findall(regexp,tweet)
                    if len(x) != 0:
                        if awards_name in store:
                            store[awards_name]+=x
                        else:
                            store[awards_name] = x
                if awards_name in tweet_store:
                    tweet_store[awards_name].append([tid,tweet])
                else:
                    tweet_store[awards_name] = [[tid,tweet]]
        if  awards_name in store.keys() and len(store[awards_name]) > 1000:
            break
    return [store, tweet_store]





In [329]:
store = dict()
tweet_store = dict()
df = pd.read_json(data_path)['text']
occupation_words = ["actor","director","actress","singer","scientist"]
for awards_name, awards_token in awards_token_dict.items():
    if any(occupation in awards_name for occupation in occupation_words):
        [store, tweet_store] = extract_people_names(store,df,awards_name,awards_token,tweet_store)
        # Award for movie/songs
    else:
        [store, tweet_store] = extract_movie_song(store,df,awards_name,awards_token,5,tweet_store)


    #print(store)
#pprint(store)
#pprint(tweet_store)



In [330]:

import string

#st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz','stanford-ner/stanford-ner.jar')
punc = string.punctuation
names = nltk.corpus.names.words()

results = {}
rexp = '[A-Z][a-z]* [A-Z][a-z]*'
for awards_name,awards_tweets in store.items():
    temp_result = {}
    result = []
    flag = 0
    if any(occupation in awards_name for occupation in occupation_words):
        for awards_tweet in awards_tweets:
            x = re.findall(rexp,awards_tweet)
            if len(x)!= 0:
                for sx in x:
                    #print(sx)
                    tokens = nltk.word_tokenize(sx)
                    #print(tokens)
                    if tokens[0] in names:
                        #print(dataframe[extraid])
                        result.append(sx)
    else:
        result = store[awards_name]
    temp_result = (Counter(result).most_common())
    if len(temp_result)> 1 and temp_result[0][1] > 50:
        for pre_name, pre_result in results.items():
            if temp_result[0][0] == pre_result[0][0] and temp_result[1][0] == pre_result[1][0]:
                #print(temp_result)
                #print(results[pre_name])
                if temp_result[0][1] > pre_result[0][1]:
                    results[pre_name] = temp_result
                    flag = 1
    
        if flag == 0:
            results[awards_name] = temp_result

    #print(result)
pprint(len(results))

24


In [331]:
for award in store.keys():
    if award in results.keys():
        #print(award[0],results[award[0]])
        print(award)
    #else:
     #   del tweet_store[award]
        

best director - motion picture
best actor - motion picture drama
best actress - motion picture drama
best actress - motion picture comedy / musical
best actress - tv series drama
best actor comedy / musical
best motion picture drama
best motion picture comedy / musical
best actress - comedy
best actor tv series drama
best original song - motion picture
best screenplay award
best supporting actress - tv movie series / miniseries
best tv series drama
best actress - miniseries / tv movie
best actor - tv comedy
best actor - miniseries / tv movie
best supporting actress - motion picture
best actor - mini-series / tv movie
best supporting actor - series mini-series / tv movie
best actress - motion picture
best actor - series
best screenplay
best screenplay golden globe


In [332]:
def find_presenters(awards_store,dataframe):
    presenter_store = {}
    scope = 20
    names = nltk.corpus.names.words()
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    for award_name, tweets in awards_store.items():
        present_list = []
        for tid, tweet in tweets:
            if scope < tid < len(dataframe)-scope:
                for extraid in range(tid-scope,tid+scope):
                    if 'present' in dataframe[extraid].lower() or 'introduc' in dataframe[extraid].lower():
                        x = re.findall(rexp,dataframe[extraid])
                        if len(x)!= 0:
                            for sx in x:
                                #print(sx)
                                tokens = nltk.word_tokenize(sx)
                                #print(tokens)
                                if tokens[0] in names:
                                    #print(dataframe[extraid])
                                    present_list.append(sx)
        presenter_store[award_name] = Counter(present_list).most_common()[:3]
    return presenter_store

present_store = find_presenters(tweet_store,df)
#pprint(present_store)

    

In [333]:
def find_related_nominees(awards_store,dataframe):
    nominees_store = {}
    scope = 20
    names = nltk.corpus.names.words()
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    for award_name, tweets in awards_store.items():
        nominee_list = []
        if any(occupation in award_name for occupation in occupation_words):
            for tid, tweet in tweets:
                if scope < tid < len(dataframe)-scope:
                    for extraid in range(tid-scope,tid+scope):
                        if 'win' in dataframe[extraid].lower():
                            x = re.findall(rexp,dataframe[extraid])
                            if len(x)!= 0:
                                for sx in x:
                                    #print(sx)
                                    tokens = nltk.word_tokenize(sx)
                                    #print(tokens)
                                    if tokens[0] in names:
                                        print(dataframe[extraid])
                                        nominee_list.append(sx)
        else:
            if award_name in results.keys():
                nominee_list = results[award_name][:5]
        nominees_store[award_name] = Counter(nominee_list).most_common()[:5]
    return nominees_store

#nominees_store1 = find_related_nominees(tweet_store,df)

In [373]:
def is_similar(name1, name2):
    nt1 = nltk.word_tokenize(name1)
    nt2 = nltk.word_tokenize(name2)
    if any(tk in nt1 for tk in nt2):
        return True
    if nltk.edit_distance(name1,name2) < 3:
        return True
    return False
print (is_similar('Richard Linklater','Ava Du'))

False


In [374]:
def find_nominees(results):
    nominees_store = {}
    occupation_words = ["actor","director","actress","singer","scientist"]
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    for awards_name, result in results.items():
        nominee = []
         # Merge People
        if any(occupation in awards_name for occupation in occupation_words):
            for name in result:
                x = re.findall(rexp,name[0])
                if len(x)!=0:
                    if len(nominee) == 0:
                        nominee.append(name)
                    else:
                        if not any(is_similar(prev_name[0],name[0]) for prev_name in nominee):
                            nominee.append(name)

        else:
            nominee = results[awards_name][:5]
        nominees_store[awards_name] = nominee
    return nominees_store

nominees_store2 = find_nominees(results)
pprint(nominees_store2)

[('Richard Linklater', 404)]
Ava Du
AAAA
[('Richard Linklater', 404), ('Ava Du', 47)]
Wes Anderson
AAAA
[('Richard Linklater', 404), ('Ava Du', 47), ('Wes Anderson', 1)]
Ava Duvernay
[('Richard Linklater', 404), ('Ava Du', 47), ('Wes Anderson', 1)]
David Fincher
AAAA
[('Richard Linklater', 404), ('Ava Du', 47), ('Wes Anderson', 1), ('David Fincher', 1)]
Alejandro Gonz
AAAA
[('Richard Linklater', 404), ('Ava Du', 47), ('Wes Anderson', 1), ('David Fincher', 1), ('Alejandro Gonz', 1)]
Harrison Ford
AAAA
[('Eddie Redmanye', 222)]
Eddie Redmayne
[('Eddie Redmanye', 222)]
David Oyelowo
AAAA
[('Eddie Redmanye', 222), ('David Oyelowo', 11)]
Jake Gyllenhaal
AAAA
[('Eddie Redmanye', 222), ('David Oyelowo', 11), ('Jake Gyllenhaal', 6)]
Eddie Radmayne
[('Eddie Redmanye', 222), ('David Oyelowo', 11), ('Jake Gyllenhaal', 6)]
Benedict Cumberbatch
AAAA
[('Eddie Redmanye', 222), ('David Oyelowo', 11), ('Jake Gyllenhaal', 6), ('Benedict Cumberbatch', 5)]
Steve Carell
AAAA
[('Eddie Redmanye', 222), ('Dav

In [365]:
def resolute_results():
    pass

In [375]:
for award_name, award_result in results.items():
    print("Award Name: ")
    print(award_name.capitalize())
    print("Presenter: ")
    print(present_store[award_name][0])
    print("Nominess: ")
    #print(nominees_store1[award_name])
    print(nominees_store2[award_name])
    print("Winner: ")
    print(award_result[0])
    print("\n")


Award Name: 
Best director - motion picture
Presenter: 
('Harrison Ford', 35)
Nominess: 
[('Richard Linklater', 404), ('Ava Du', 47), ('Wes Anderson', 1), ('David Fincher', 1), ('Alejandro Gonz', 1), ('Harrison Ford', 1)]
Winner: 
('Richard Linklater', 404)


Award Name: 
Best actor - motion picture drama
Presenter: 
('Gwyneth Paltrow', 26)
Nominess: 
[('Eddie Redmanye', 222), ('David Oyelowo', 11), ('Jake Gyllenhaal', 6), ('Benedict Cumberbatch', 5), ('Steve Carell', 2), ('Michael Keaton', 2), ('Julianne Moore', 1), ('Gwyneth Paltrow', 1)]
Winner: 
('Eddie Redmanye', 222)


Award Name: 
Best actress - motion picture drama
Presenter: 
('Jamie Dornan', 19)
Nominess: 
[('Julianne Moore', 398), ('Jennifer Aniston', 9), ('Patricia Arquette', 9), ('Rosamund Pike', 8), ('Felicity Jones', 5), ('Reese Witherspoon', 4), ('Amy Adams', 3), ('Oscar R', 1)]
Winner: 
('Julianne Moore', 398)


Award Name: 
Best actress - motion picture comedy / musical
Presenter: 
('Ricky Gervais', 77)
Nominess: 
[('