In [81]:
import json 
import pandas as pd
import re
import nltk
import string
from nltk.util import ngrams
from collections import Counter
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

# Datapath
data_path = "./gg2015.json"
# List some words which help you most in distinguishing different award names.
keyword_list = ['actress','supporting','actor','director','drama','musical','television','comedy','tv']
# Specific words (eg. person name, movie name) you want to review its attitude.
sentiment_word = 'Tina Fey'
# Enter a rough number on how many awards you want to look for.
awards_num = 20


df = pd.read_json(data_path)

In [82]:
#Analyze common phrase occurences
'''
Input: A list of strings after parsing
Output: The most common grams containing 'best'
Remarks:
    1. Longest award phrase is 12 words
'''
def common_phrases(keywords):
    bigrams = []
    trigrams = []
    quadgrams = []
    multigrams = []
    megagrams = []
    for item in keywords:
        clean_string = re.sub('[^A-Za-z0-9]+', ' ', item)
        tokens = nltk.word_tokenize(clean_string)
        grams2 = ngrams(tokens, 2)
        grams3 = ngrams(tokens, 3)
        grams4 = ngrams(tokens, 4)
        grams9 = ngrams(tokens, 9)
        grams12 = ngrams(tokens, 12)
        for gram in grams2:
            bigrams.append(gram)
        for gram in grams3:
            trigrams.append(gram)
        for gram in grams4:
            quadgrams.append(gram)
        for gram in grams9:
            multigrams.append(gram)
        for gram in grams12:
            megagrams.append(gram)

    common2 = Counter(bigrams).most_common()
    common3 = Counter(trigrams).most_common()
    common4 = Counter(quadgrams).most_common()
    common9 = Counter(multigrams).most_common()
    common12 = Counter(megagrams).most_common()

    common2 = filter(lambda x: x[0][0] == "best", common2)
    common3 = filter(lambda x: x[0][0] == "best", common3)
    common4 = filter(lambda x: x[0][0] == "best", common4)
    common9 = filter(lambda x: x[0][0] == "best", common9)
    common12 = filter(lambda x: x[0][0] == "best", common12)

    common2 = list(common2)
    common3 = list(common3)
    common4 = list(common4)
    common9 = list(common9)
    common12 = list(common12)

    combined = common2 + common3 + common4 + common9 + common12
    return combined

In [83]:
'''
Input: List of strings
Output: Strings with its frequency
Remarks:
    1. Can also count common strings in strings for strategy2
'''
def most_common_beststring(strings):
    best_string = filter(lambda x: "best" in x, strings)
    best_string = list(best_string)
    return Counter(best_string).most_common()

In [84]:
#Strategy 1 scan after the word won:
df2 = df[df['text'].str.contains(" won ") | df['text'].str.contains(" Won ")]
tweets = df2['text'].tolist()
keywords = []
#regexp = re.compile(r'[!?.;#]+(?=$|\s)')
regexp = re.compile(r'[!?.;#,@:]')

stop_words = ['at','and','on','because','but','tonight','before','lol','since','i','I']

#Parse Keywords. All phrase after 'won best' will be captured.
for tweet in tweets:
    words = tweet.lower().split()
    index_won = words.index("won")
    if words[index_won + 1] == "best":
        keyword = "best "
        curr_index = index_won + 2
        while curr_index != len(words):
            if regexp.search(words[curr_index]) or words[curr_index] in stop_words:
                #keyword += words[curr_index]
                keyword = keyword.strip()
                break
            keyword += words[curr_index] + " "
            curr_index += 1
        keywords.append(keyword)

strategy1 = common_phrases(keywords)
#print(strategy1)

In [85]:
#Strategy 2 scan before the word won:
df3 = df[df['text'].str.contains(" goes to ")]
tweets = df3['text'].tolist()
keywords2 = []
#Parse Backwards, stop at the word best
for tweet in tweets:
    words = nltk.word_tokenize(tweet.lower())
    index = words.index("to")
    if words[index - 1] == "goes":
        keyword = ""
        curr_index = index - 2
        while curr_index >= 0:
            if curr_index == 0 or words[curr_index] == "best":
                keyword = words[curr_index] + keyword
                break
            keyword = " " + words[curr_index] + keyword
            curr_index -= 1
        keywords2.append(keyword)

#print(keywords2)
strategy2 = common_phrases(keywords2)
#print(strategy2)

#most_common_beststring(keywords)

In [86]:
common_strings = most_common_beststring(keywords2)

In [87]:
'''
Input: gram sets
Output: phrases with frequency
'''
def accumulate_votes(grams1, grams2, common_strings):
    awards = {}
    '''
    for gram in grams1:
        untokenize = ' '.join(gram[0])
        awards[untokenize] = len(gram[0]) * gram[1]
    for gram in grams2:
        untokenize = ' '.join(gram[0])
        if untokenize in awards:
            awards[untokenize] = awards[untokenize] * 2.5
        else:
            awards[untokenize] = len(gram[0]) * gram[1]
    '''
    for string in common_strings:
        first_two = ' '.join(nltk.word_tokenize(string[0])[:2])
        if string[0] in awards:
            awards[string[0]] = awards[string[0]] * 2.5
        elif first_two in awards:
            awards[string[0]] = awards[first_two] * string[1]
        else:
            awards[string[0]] = len(string[0]) * string[1]
    return sorted(awards.items(), key = lambda x: x[1], reverse = True)
votes = accumulate_votes(strategy1, strategy2, common_strings)
#Need a way to combine similar categories
print(len(votes))

1148


In [88]:
collocation_words = {
    #"tv":"television",
    "pic":"picture",
    "for":"-",
    "in":"-",
    'or':'/',
    'of':'-'
}

skip_words = ['a']

paraphrase = [',','@','(',')','#']

# Find a good format for award names.
def gram_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        for word in gram[0]:
            if word in collocation_words:
                word = collocation_words[word]
            if word in skip_words:
                break
            if word not in paraphrase:
                word_list.append(word)
        word_tuple = tuple(word_list)
        gram_tuple = (word_tuple,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

# Separate 'A/B' type of words into 'A / B' to add more information to resolute.
def sticky_word_string(phrase):
    phrase_list = nltk.word_tokenize(phrase)
    token_list = []
    for word in phrase_list:
        flag = 0
        for i,character in enumerate(word):
            if character == '/' and i != 0:
                token_list.append(word[0:i])
                token_list.append(word[i])
                flag = i
                if i != len(word)-1:
                    token_list.append(word[i+1:len(word)])
                break
        if flag == 0 and word not in skip_words:
            token_list.append(word)
    clean_string = ' '.join(token_list)
    return clean_string

# Replacing some collocation words.
def string_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        clean_gram = sticky_word_string(gram[0])
        temp_list = nltk.word_tokenize(clean_gram)
        for word in temp_list:
            if word in collocation_words:
                word = collocation_words[word]
            if word not in paraphrase:
                word = word.strip()
                word_list.append(word)
        phrase = ' '.join(word_list)
        gram_tuple = (phrase,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

#for vote in votes:
#print(strategy1)
new1 = gram_cleaning(strategy1)
new2 = gram_cleaning(strategy2)
new3 = string_cleaning(common_strings)
votes = accumulate_votes(new1, new2, new3)
#Need a way to combine similar categories
print(len(votes))
#votes

957


In [89]:
# Ignore paraphrases and merge the similar results. Get new votes.
words_pattern = '[a-zA-Z]+'
resolution_list = []
awards_dict = {}
for i,vote in enumerate(votes):
    words = re.findall(words_pattern, vote[0], flags=re.IGNORECASE)
    words = ' '.join(words)
    if words in awards_dict:
        awards_dict[words].append(i)
    else:
        awards_dict[words] = [i]
#pprint(awards_dict)

new_list = []
for key, val in awards_dict.items():
    if len(val) == 1:
        new_list.append(list(votes[val[0]]))
    else:
        sum = 0
        for i in val:
            sum+=votes[i][1]
        new_list.append([votes[val[0]][0],sum])
#new_list

In [90]:
'''
TF-IDF computation. Get a weighted word vector representation.
'''
pd. set_option('display.max_columns', None)
pd. set_option('display.max_rows', None)

corpus = []
for item in new_list:
    corpus.append(item[0][1:])

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)

# Add more importance to some key words (can get from user inputs) which most distinguish different award names.
#keyword_list = ['actress','supporting','actor','director','drama','musical','television','comedy','tv']
for keyword in keyword_list:
    tfidf[keyword] = tfidf[keyword]*2
#tfidf.iloc[0]

In [91]:
# Compute pair similarity between each word vectors.

# Larger threshold means merge more strictly.
threshold = 0.9
final_list = []
cs = cosine_similarity(tfidf,tfidf)
#ans = pairwise_distances(tfidf,tfidf,'chebyshev')
index_list = []
for v,score_row in enumerate(cs):
    similarity = []
    merge_index = [v]
    for i, score in enumerate(score_row):
        if score > threshold and v!=i:
            similarity.append([new_list[v],score,new_list[i],v,i])
            merge_index.append(i)
    final_list.append(similarity)
    index_list.append(merge_index)
print('The merging result:')
#pprint(final_list)
#index_list

The merging result:


In [92]:
'''
Similarity Graph merging
Find the similar clusters by recursively merging the similar sentences.
Use the highest score sentence as its cluster name.
'''


def merge_index(curr_index,original_list,curr_cluster):
    for index in original_list[curr_index]:
        if index not in curr_cluster:
            curr_cluster.append(index)
            merge_index(index,original_list,curr_cluster)
    return curr_cluster

cluster_list = []
access = []
for v,il in enumerate(index_list):
    if v not in access:
        cl = merge_index(v,index_list,[])
        for i in cl:
            access.append(i)
        cluster_list.append(cl)

final_result = []
for cluster in cluster_list:
    sum_s = 0
    for item in cluster:
        sum_s += new_list[item][1]
    item_list = [new_list[cluster[0]][0],sum_s]
    final_result.append(item_list)


def sort_score(e):
    return e[1]
final_result.sort(reverse=True,key=sort_score)

#pprint(final_result)
print(len(final_result))




568


In [93]:
# Currently use 40 as its possible awards number. Can let user input awards number and double it.
final_awards_name = final_result[:awards_num*2]
pprint(final_awards_name)

[['best director - motion picture', 46444699.0],
 ['best actor - motion picture drama', 20584448.08203125],
 ['best screenplay - motion picture', 15340521.5625],
 ['best actress - motion picture drama', 3579185.75],
 ['best actress - motion picture comedy / musical', 2802760.375],
 ['best actress - tv series drama', 1797820.25],
 ['best actor comedy / musical', 1073631.5],
 ['best motion picture drama', 930911.4375],
 ['best supporting actor - motion picture', 615999.6875],
 ['best motion picture comedy / musical', 314756.75],
 ['best actress - comedy', 308826.0],
 ['best actor tv series drama', 300044.0],
 ['best original score - motion picture', 227244.875],
 ['best original song - motion picture', 177294.875],
 ['best screenplay award', 108135],
 ['best supporting actress - tv movie series / miniseries', 85489.25],
 ['best tv series comedy / musical', 74368.0625],
 ['best tv series drama', 51964.5],
 ['best actress - miniseries / tv movie', 41670.0],
 ['best actor - tv comedy', 3861

In [94]:
OFFICIAL_AWARDS_1315 = ['cecil b. demille award', 'best motion picture - drama', 'best performance by an actress in a motion picture - drama', 'best performance by an actor in a motion picture - drama', 'best motion picture - comedy or musical', 'best performance by an actress in a motion picture - comedy or musical', 'best performance by an actor in a motion picture - comedy or musical', 'best animated feature film', 'best foreign language film', 'best performance by an actress in a supporting role in a motion picture', 'best performance by an actor in a supporting role in a motion picture', 'best director - motion picture', 'best screenplay - motion picture', 'best original score - motion picture', 'best original song - motion picture', 'best television series - drama', 'best performance by an actress in a television series - drama', 'best performance by an actor in a television series - drama', 'best television series - comedy or musical', 'best performance by an actress in a television series - comedy or musical', 'best performance by an actor in a television series - comedy or musical', 'best mini-series or motion picture made for television', 'best performance by an actress in a mini-series or motion picture made for television', 'best performance by an actor in a mini-series or motion picture made for television', 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']
OFFICIAL_AWARDS_1819 = ['best motion picture - drama', 'best motion picture - musical or comedy', 'best performance by an actress in a motion picture - drama', 'best performance by an actor in a motion picture - drama', 'best performance by an actress in a motion picture - musical or comedy', 'best performance by an actor in a motion picture - musical or comedy', 'best performance by an actress in a supporting role in any motion picture', 'best performance by an actor in a supporting role in any motion picture', 'best director - motion picture', 'best screenplay - motion picture', 'best motion picture - animated', 'best motion picture - foreign language', 'best original score - motion picture', 'best original song - motion picture', 'best television series - drama', 'best television series - musical or comedy', 'best television limited series or motion picture made for television', 'best performance by an actress in a limited series or a motion picture made for television', 'best performance by an actor in a limited series or a motion picture made for television', 'best performance by an actress in a television series - drama', 'best performance by an actor in a television series - drama', 'best performance by an actress in a television series - musical or comedy', 'best performance by an actor in a television series - musical or comedy', 'best performance by an actress in a supporting role in a series, limited series or motion picture made for television', 'best performance by an actor in a supporting role in a series, limited series or motion picture made for television', 'cecil b. demille award']

In [95]:
awards_token_dict = {}


final_awards_name = OFFICIAL_AWARDS_1315
removable_tokens = {'an','-','/','by','for','or','in','a','series','role','made','performance',',','feature','film'}
for award_name in final_awards_name:
    an1 = award_name.replace(' or motion picture made for television','')
    an1.strip()
    print(an1)
    an2 = award_name.replace('','')
    award_token_list = []
    tokens = nltk.word_tokenize(award_name.lower())
    for token in tokens:
        if all(token != rt for rt in removable_tokens):
            award_token_list.append(token)
    awards_token_dict[award_name] = [award_token_list]

    award_token_list1 = []
    tokens1 = nltk.word_tokenize(an1.lower())
    for token in tokens1:
        if all(token != rt for rt in removable_tokens):
            award_token_list1.append(token)
    awards_token_dict[award_name].append(award_token_list1)

# Aliasing:
alias = {
    'television':['tv'],
    'mini-series':['mini','series'],
    'film':[]
}

multiwords = {
    'motion picture':'movie'
}
for k, v in awards_token_dict.items():
    for val in v[0]:
        if val in alias.keys():
            new_tk_l = v[0][:]
            new_tk_l.remove(val)
            new_tk_l += alias[val]
            awards_token_dict[k].append(new_tk_l)
'''
for i, awards_name in enumerate(new_awards_list):
    #clean_string = re.sub('[^A-Za-z0-9]+', ' ', awards_name[0])
    clean_string = awards_name
    tokens = nltk.word_tokenize(clean_string)
    for rt in removable_tokens:
        if rt in tokens:
            tokens.remove(rt)
    awards_token_dict[awards_name] = [tokens]
    pprint(tokens)
    if ' / ' in awards_name:
        dirty_tokens = nltk.word_tokenize(awards_name)
        i = dirty_tokens.index('/')
        stickytoken = dirty_tokens[i-1] + '/' + dirty_tokens[i+1]
        tokens1 = tokens[:]
        #tokens1.remove('/')
        s1 = dirty_tokens[i-1]
        s2 = dirty_tokens[i+1]
        if s1 in tokens1:
            tokens1.remove(s1)
        if s2 in tokens1:
            tokens1.remove(s2)
        tokens1.append(stickytoken)
        awards_token_dict[awards_name].append(tokens1)
        
'''
pprint(awards_token_dict)

cecil b. demille award
best motion picture - drama
best performance by an actress in a motion picture - drama
best performance by an actor in a motion picture - drama
best motion picture - comedy or musical
best performance by an actress in a motion picture - comedy or musical
best performance by an actor in a motion picture - comedy or musical
best animated feature film
best foreign language film
best performance by an actress in a supporting role in a motion picture
best performance by an actor in a supporting role in a motion picture
best director - motion picture
best screenplay - motion picture
best original score - motion picture
best original song - motion picture
best television series - drama
best performance by an actress in a television series - drama
best performance by an actor in a television series - drama
best television series - comedy or musical
best performance by an actress in a television series - comedy or musical
best performance by an actor in a television serie

In [155]:
def extract_people_names(store,text_list,awards_name,awards_token,tweet_store):
    for tid, tweet in enumerate(text_list):
        for awards_tk_l in awards_token:
            #print(awards_tk_l)
            if all(token in tweet.lower() for token in awards_tk_l):
                if awards_name in store:
                    store[awards_name].append(tweet)
                    tweet_store[awards_name].append([tid,tweet])
                else:
                    store[awards_name] = [tweet]
                    tweet_store[awards_name]=[[tid,tweet]]
        if  awards_name in store.keys() and len(store[awards_name]) > 1000:
            break
    return [store, tweet_store]

def construct_regex(num):
    final = ''
    for i in range(num):
        substr = '[A-Z][a-z]* ?'
        final += substr
    final1 = '\"'+final+'\"'
    return final1

def construct_regex1(num):
    final = ''
    for i in range(num):
        substr = '[A-Z][a-z]* ?'
        final += substr
    final1 = "\'"+final+"\'"
    return final1

def extract_movie_song(store,text_list,awards_name,awards_token,num,tweet_store):
    occupation_words = ["actor","director","actress","singer","scientist","cecil b. demille"]
    for tid, tweet in enumerate(text_list):
        for award_tk_l in awards_token:
            if all(token in tweet.lower() for token in award_tk_l) and not any(occupation in tweet.lower() for occupation in occupation_words):
                for i in range(1,num):
                    regexp = construct_regex(i)
                    x = re.findall(regexp,tweet)
                    if len(x) != 0:
                        for i in range(len(x)):
                            x[i] = x[i].strip('\"')
                        if awards_name in store:
                            store[awards_name]+=x
                        else:
                            store[awards_name] = x
                    regexp1 = construct_regex1(i)
                    x = re.findall(regexp1,tweet)
                    if len(x) != 0:
                        for i in range(len(x)):
                            x[i] = x[i].strip('\'')
                        if awards_name in store:
                            store[awards_name]+=x
                        else:
                            store[awards_name] = x
                if awards_name in tweet_store:
                    tweet_store[awards_name].append([tid,tweet])
                else:
                    tweet_store[awards_name] = [[tid,tweet]]
        if  awards_name in store.keys() and len(store[awards_name]) > 1000:
            break
    return [store, tweet_store]





In [156]:
store = dict()
tweet_store = dict()
df = pd.read_json(data_path)['text']
occupation_words = ["actor","director","actress","singer","scientist","cecil b. demille"]
for awards_name, awards_token in awards_token_dict.items():
    if any(occupation in awards_name.lower() for occupation in occupation_words):
        [store, tweet_store] = extract_people_names(store,df,awards_name,awards_token,tweet_store)
        # Award for movie/songs
    else:
        [store, tweet_store] = extract_movie_song(store,df,awards_name,awards_token,5,tweet_store)
    
    print(awards_name, len(store[awards_name]))


    #print(store)
# pprint(store)
#pprint(tweet_store)



cecil b. demille award 1002
best motion picture - drama 1002
best performance by an actress in a motion picture - drama 1002
best performance by an actor in a motion picture - drama 1002
best motion picture - comedy or musical 1002
best performance by an actress in a motion picture - comedy or musical 1002
best performance by an actor in a motion picture - comedy or musical 1002
best animated feature film 122
best foreign language film 708
best performance by an actress in a supporting role in a motion picture 1002
best performance by an actor in a supporting role in a motion picture 1002
best director - motion picture 1002
best screenplay - motion picture 516
best original score - motion picture 42
best original song - motion picture 1004
best television series - drama 980
best performance by an actress in a television series - drama 1001
best performance by an actor in a television series - drama 1001
best television series - comedy or musical 695
best performance by an actress in a 

In [157]:
df5 = df.iloc[0:10000]
df4 = df5[df5.str.contains(" hosts ", case = False)]
tweets = df4.tolist()

#Find Common correlated Bigrams, for name search, 5 grams for "name and name"
words = []
bigrams = []
pentgrams = []
for tweet in tweets:
 #   tweet = tweet.lower()
    clean_string = re.sub('[^A-Za-z0-9]+', ' ', tweet)
    tokens = nltk.word_tokenize(clean_string)
    grams1 = ngrams(tokens, 1)
    grams2 = ngrams(tokens, 2)
    grams5 = ngrams(tokens, 5)
    for gram in grams1:
        words.append(gram)
    for gram in grams2:
        bigrams.append(gram)
    for gram in grams5:
        pentgrams.append(gram)
    common1 = Counter(words).most_common()
    common2 = Counter(bigrams).most_common()
    common5 = Counter(pentgrams).most_common()
names = nltk.corpus.names.words()

def accumulate_votes(words, bigrams, pentgrams):
    hosts = {}
    for word in words:
        untokenize = ' '.join(word[0])
        hosts[untokenize] = word[1] 
    for gram in bigrams:
        untokenize = ' '.join(gram[0])
        votes = gram[1]
        if gram[0][0] in hosts:
            if gram[0][0] in names:
                votes *= 2
            votes += hosts[gram[0][0]]
        if gram[0][1] in hosts:
            if gram[0][1] in names:
                votes *= 2
            votes += hosts[gram[0][1]]
        if gram[0][1][0].isupper() and gram[0][0][0].isupper():
            votes *= 2
        hosts[untokenize] = votes
    for gram in pentgrams:
        if gram[0][0] == "Hosts":
            host1 = gram[0][1] + " " + gram[0][2]
            if host1 in hosts:
                hosts[host1] = hosts[host1] * 2
            host2 = gram[0][3] + " " + gram[0][4]
            if host2 in hosts:
                hosts[host2] = hosts[host2] * 2
        if gram[0][2] == "and":
            host1 = gram[0][0] + " " + gram[0][1]
            if host1 in hosts:
                hosts[host1] = hosts[host1] * 2
            host2 = gram[0][3] + " " + gram[0][4]
            if host2 in hosts:
                hosts[host2] = hosts[host2] * 2
    return sorted(hosts.items(), key = lambda x: x[1], reverse = True)

host = accumulate_votes(common1, common2, common5)[:2]

In [185]:

#st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz','stanford-ner/stanford-ner.jar')
punc = string.punctuation
names = nltk.corpus.names.words()

results = {}
rexp = '[A-Z][a-z]* [A-Z][a-z]*'
for awards_name,awards_tweets in store.items():
    temp_result = {}
    result = []
    flag = 0
    if any(occupation in awards_name.lower() for occupation in occupation_words):
        for awards_tweet in awards_tweets:
            x = re.findall(rexp,awards_tweet)
            if len(x)!= 0:
                for sx in x:
                    #print(sx)
                    tokens = nltk.word_tokenize(sx)
                    #print(tokens)
                    if tokens[0] in names and sx.lower() not in awards_name:
                        #print(dataframe[extraid])
                        result.append(sx)
    else:
        result = store[awards_name]
    print(awards_name,len(result))
    temp_result = (Counter(result).most_common())
    if len(temp_result)> 1:# and temp_result[0][1] > 50:
        for pre_name, pre_result in results.items():
            if temp_result[0][0] == pre_result[0][0] and temp_result[1][0] == pre_result[1][0]:
                #print(temp_result)
                #print(results[pre_name])
                if temp_result[0][1] > pre_result[0][1]:
                    results[pre_name] = temp_result
                    flag = 1
    
        if flag == 0:
            results[awards_name] = temp_result

    
pprint(results)

cecil b. demille award 348
best motion picture - drama 1002
best performance by an actress in a motion picture - drama 886
best performance by an actor in a motion picture - drama 914
best motion picture - comedy or musical 1002
best performance by an actress in a motion picture - comedy or musical 980
best performance by an actor in a motion picture - comedy or musical 924
best animated feature film 122
best foreign language film 708
best performance by an actress in a supporting role in a motion picture 924
best performance by an actor in a supporting role in a motion picture 46
best director - motion picture 912
best screenplay - motion picture 516
best original score - motion picture 42
best original song - motion picture 1004
best television series - drama 980
best performance by an actress in a television series - drama 968
best performance by an actor in a television series - drama 939
best television series - comedy or musical 695
best performance by an actress in a television 

In [186]:
for award in store.keys():
    if award in results.keys():
        #print(award[0],results[award[0]])
        print(award)
    #else:
     #   del tweet_store[award]
        

cecil b. demille award
best motion picture - drama
best performance by an actress in a motion picture - drama
best performance by an actor in a motion picture - drama
best motion picture - comedy or musical
best performance by an actress in a motion picture - comedy or musical
best performance by an actor in a motion picture - comedy or musical
best animated feature film
best foreign language film
best performance by an actress in a supporting role in a motion picture
best performance by an actor in a supporting role in a motion picture
best director - motion picture
best screenplay - motion picture
best original score - motion picture
best original song - motion picture
best television series - drama
best performance by an actress in a television series - drama
best performance by an actor in a television series - drama
best television series - comedy or musical
best performance by an actress in a television series - comedy or musical
best performance by an actor in a television serie

In [187]:
def find_presenters(awards_store,dataframe):
    presenter_store = {}
    scope = 20
    names = nltk.corpus.names.words()
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    for award_name, tweets in awards_store.items():
        present_list = []
        for tid, tweet in tweets:
            if scope < tid < len(dataframe)-scope:
                for extraid in range(tid-scope,tid+scope):
                    if 'present' in dataframe[extraid].lower() or 'introduc' in dataframe[extraid].lower():
                        x = re.findall(rexp,dataframe[extraid])
                        if len(x)!= 0:
                            for sx in x:
                                #print(sx)
                                tokens = nltk.word_tokenize(sx)
                                #print(tokens)
                                if tokens[0] in names and sx.lower() not in award_name:
                                    #print(dataframe[extraid])
                                    present_list.append(sx)
        presenter_store[award_name] = Counter(present_list).most_common()[:3]
    return presenter_store

present_store = find_presenters(tweet_store,df)
pprint(present_store)

    

{'best animated feature film': [('Kristen Wiig', 598),
                                ('Bill Hader', 520),
                                ('Kevin Hart', 396)],
 'best director - motion picture': [('Harrison Ford', 70),
                                    ('Richard Linklater', 30),
                                    ('Anna Faris', 16)],
 'best foreign language film': [('Jennifer Aniston', 64),
                                ('Lily Tomlin', 46),
                                ('Kristen Wiig', 44)],
 'best mini-series or motion picture made for television': [('Gina Rodriguez',
                                                             76),
                                                            ('Jennifer Aniston',
                                                             66),
                                                            ('Jeremy Renner',
                                                             57)],
 'best motion picture - comedy or musical': [('Robert Do

In [188]:
def find_related_nominees(awards_store,dataframe):
    nominees_store = {}
    scope = 20
    names = nltk.corpus.names.words()
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    for award_name, tweets in awards_store.items():
        nominee_list = []
        if any(occupation in award_name.lower() for occupation in occupation_words):
            for tid, tweet in tweets:
                if scope < tid < len(dataframe)-scope:
                    for extraid in range(tid-scope,tid+scope):
                        if 'win' in dataframe[extraid].lower():
                            x = re.findall(rexp,dataframe[extraid])
                            if len(x)!= 0:
                                for sx in x:
                                    #print(sx)
                                    tokens = nltk.word_tokenize(sx)
                                    #print(tokens)
                                    if tokens[0] in names:
                                        print(dataframe[extraid])
                                        nominee_list.append(sx)
        else:
            if award_name in results.keys():
                nominee_list = results[award_name][:5]
        nominees_store[award_name] = Counter(nominee_list).most_common()[:5]
    return nominees_store

#nominees_store1 = find_related_nominees(tweet_store,df)

In [189]:
def is_similar(name1, name2):
    nt1 = nltk.word_tokenize(name1)
    nt2 = nltk.word_tokenize(name2)
    if any(tk in nt1 for tk in nt2):
        return True
    if nltk.edit_distance(name1,name2) < 3:
        return True
    return False
#print (is_similar('Richard Linklater','Ava Du'))

In [190]:
def find_nominees(results):
    nominees_store = {}
    occupation_words = ["actor","director","actress","singer","scientist","cecil b. demille"]
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    for awards_name, result in results.items():
        nominee = []
         # Merge People
        if any(occupation in awards_name.lower() for occupation in occupation_words):
            for name in result:
                x = re.findall(rexp,name[0])
                if len(x)!=0:
                    if len(nominee) == 0:
                        nominee.append(name)
                    else:
                        if not any(is_similar(prev_name[0],name[0]) for prev_name in nominee):
                            nominee.append(name)

        else:
            nominee = results[awards_name][:5]
        nominees_store[awards_name] = nominee
    return nominees_store

nominees_store2 = find_nominees(results)
#pprint(nominees_store2)

In [191]:
dfdress = df
df2 = dfdress[dfdress.str.contains(" dressed ", case = False)]

df3 = df2[df2.str.contains(" best ", case = False)]
best_tweets = df3.tolist()
df4 = df2[df2.str.contains(" worst ", case = False)]
worst_tweets = df4.tolist()
df5 = df[df.str.contains(" carpet ", case = False)]
mentions = df5.tolist()

def find_most_common_name(tweets):
    bigrams = []
    for tweet in tweets:
 #   tweet = tweet.lower()
        clean_string = re.sub('[^A-Za-z0-9]+', ' ', tweet)
        tokens = nltk.word_tokenize(clean_string)
        grams2 = ngrams(tokens, 2)
        for gram in grams2:
            bigrams.append(gram)
    common2 = Counter(bigrams).most_common()
    names = nltk.corpus.names.words()
    people = {}
    for gram in common2:
        untokenize = ' '.join(gram[0])
        votes = gram[1]
        if gram[0][0] in names:
            votes *= 3
        if gram[0][1] in names:
            votes *= 2
        if gram[0][1][0].isupper() and gram[0][0][0].isupper() and untokenize != "Red Carpet" and untokenize != "Golden Globes":
            if gram[0][0] not in names and gram[0][1] not in names:
                votes *= 1.5
            elif gram[0][0] in names and gram[0][1] in names:
                votes *= 4
            elif gram[0][0] in names or gram[0][1] in names:
                votes *= 3
            else:
                votes *= 2
        people[untokenize] = votes
    return sorted(people.items(), key = lambda x: x[1], reverse = True)

In [192]:
most_mentioned = find_most_common_name(mentions)
best_dressed = find_most_common_name(best_tweets)
worst_dressed = find_most_common_name(worst_tweets)
print("Most Mentioned on Red Carpet:", most_mentioned[0][0])
print("Best Dressed on Red Carpet:", best_dressed[0][0])
print("Worst Dressed on Red Carpet:", worst_dressed[0][0])

Most Mentioned on Red Carpet: Kerry Washington
Best Dressed on Red Carpet: Tina Fey
Worst Dressed on Red Carpet: See The


In [193]:
def output_name(item_list):
    rexp = '[A-Z][a-z]* [A-Z][a-z]*'
    names = nltk.corpus.names.words()
    result = []
    for item in item_list:
        x = re.findall(rexp,item[0])
        if len(x)!= 0:
            for sx in x:                         
                tokens = nltk.word_tokenize(sx)                         
                if tokens[0] in names and tokens[1] in names:
                    result = item
                    return result

most_mentioned1 = output_name(most_mentioned)
best_dressed1 = output_name(best_dressed)
worst_dressed1 = output_name(worst_dressed)
#worst_dressed1

In [194]:


def get_sentiment(tweets):
    neu = 0
    pos = 0
    neg = 0
    for tweet in tweets:
        sid = SentimentIntensityAnalyzer()
        ss = sid.polarity_scores(tweet)
        neu += ss['neu']
        pos += ss['pos']
        neg += ss['neg']
    sentiment = {}
    sentiment['Neutral'] = neu / len(tweets)
    sentiment['Positive'] = pos / len(tweets)
    sentiment['Negative'] = neg / len(tweets)
    return sentiment

def analyze_sentiment(dataset,words_string):

    #Example for sentiment towards Tina Fey
    df2 = dataset[dataset.str.contains(words_string, case = False)]
    if len(df2)>100:
        tweets = df2.iloc[0:100].tolist()
    else:
        tweets = df2.tolist()
    senti = get_sentiment(tweets)
    print(senti)
    return senti


analyze_sentiment(df,sentiment_word)

{'Neutral': 0.8679200000000015, 'Positive': 0.07998999999999998, 'Negative': 0.05209}


{'Neutral': 0.8679200000000015,
 'Positive': 0.07998999999999998,
 'Negative': 0.05209}

In [195]:
f1 = open('human_readable_results.txt','w')
write_string = ""

print("Host: ")
write_string += "Host: "
print(host[0][0])
write_string += host[0][0]
write_string +=", "
print(host[1][0])
write_string += host[1][0]
print("\n")
write_string += "\n"

print("Most Mentioned on Red Carpet:", most_mentioned1[0])
write_string += ("Most Mentioned on Red Carpet: "+ most_mentioned1[0]+'\n')
print("Best Dressed on Red Carpet:", best_dressed1[0])
write_string += ("Best Dressed on Red Carpet: "+ best_dressed1[0]+'\n')
print("Worst Dressed on Red Carpet:", worst_dressed1[0])
write_string += ("Worst Dressed on Red Carpet: "+ worst_dressed1[0]+'\n')
print("\n")
write_string += "\n"

for award_name, award_result in results.items():
    c_award_name = []
    nt = nltk.word_tokenize(award_name)
    for token in nt:
        ct = token.capitalize()
        c_award_name.append(ct)
    c_name = ' '.join(c_award_name)
    nomlist = []
    for i in range(len(nominees_store2[award_name][:5])):
        nomlist.append(nominees_store2[award_name][i][0])

    write_string += "Award Name: "
    print("Award Name: ")
    print(c_name)
    write_string += c_name
    write_string += "\n"


    write_string += "Presenter: "
    print("Presenter: ")
    if len(present_store[award_name]) != 0:
        print(present_store[award_name][0])
        write_string += present_store[award_name][0][0]
        write_string += "\n"

    write_string += "Nominess: "
    print("Nominess: ")
    #print(nominees_store1[award_name])
    print(nominees_store2[award_name][:5])
    write_string += str(nomlist)
    write_string += "\n"

    write_string += "Winner: "
    print("Winner: ")
    print(award_result[0])
    write_string += award_result[0][0]
    write_string += "\n"

    write_string += "Attitude to Winner: "
    print("Attitude to Winner: ")
    astr = analyze_sentiment(df,award_result[0][0])
    print("\n")
    write_string += str(astr)
    write_string += "\n"
    write_string += "\n"

f1.write(write_string)
f1.close()

Host: 
Tina Fey
Amy Poehler


Most Mentioned on Red Carpet: Kerry Washington
Best Dressed on Red Carpet: Tina Fey
Worst Dressed on Red Carpet: Lana Del


Award Name: 
Cecil B. Demille Award
Presenter: 
('George Clooney', 26)
Nominess: 
[('George Clooney', 270), ('Julianna Margulies', 24), ('Don Cheadle', 20), ('Liza Minnelli', 10), ('Elizabeth Taylor', 10)]
Winner: 
('George Clooney', 270)
Attitude to Winner: 
{'Neutral': 0.8480800000000005, 'Positive': 0.14460000000000003, 'Negative': 0.007300000000000001}


Award Name: 
Best Motion Picture - Drama
Presenter: 
('Tina Fey', 892)
Nominess: 
[('Boyhood', 962), ('Selma', 12), ('The Grand Budapest Hotel', 6), ('Best Drama Motion Picture', 4), ('Foxcatcher', 4)]
Winner: 
('Boyhood', 962)
Attitude to Winner: 
{'Neutral': 0.7716000000000001, 'Positive': 0.21034999999999995, 'Negative': 0.018059999999999996}


Award Name: 
Best Performance By An Actress In A Motion Picture - Drama
Presenter: 
('Jamie Dornan', 38)
Nominess: 
[('Julianne Moore',

In [196]:
import json
final_data = {"Host":[host[0][0],host[1][0]]}



for award_name in results.keys():
    c_award_name = []
    nt = nltk.word_tokenize(award_name)
    for token in nt:
        ct = token.capitalize()
        c_award_name.append(ct)
    c_name = ' '.join(c_award_name)
    nomlist = []
    for i in range(len(nominees_store2[award_name][:5])):
        nomlist.append(nominees_store2[award_name][i][0])
    final_data.update({
        c_name:{
            "Presenter:":present_store[award_name][0][0],
            "Nominees:":nomlist,
            "Winner:":results[award_name][0][0]
        }
    })





with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)