In [116]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

import string
import re

In [117]:
data = pd.read_json('gg2013.json')
data = pd.DataFrame(data)

In [118]:
data = data['text'].values.tolist()

In [119]:
# Award category name -> lowercase keywords associated with that category.
# The winner-lookup functions pass a category's list to extractWinner, which
# treats the keywords as extra stop words.
# NOTE(review): parallel categories do not carry identical keyword sets (e.g.
# the TV-drama actor entry has no 'tv' while the actress entry does); the lists
# are kept exactly as they were because changing them changes the stop words.
category_dict = {
    # --- Motion picture awards ---
    'Best Motion Picture – Drama': ['best', 'drama', 'movie', 'motion', 'picture'],
    'Best Motion Picture – Musical or Comedy': [
        'best', 'movie', 'musical', 'comedy', 'motion', 'picture',
    ],
    'Best Motion Picture – Foreign Language': [
        'best', 'foreign', 'language', 'motion', 'picture',
    ],
    'Best Motion Picture – Animated': ['best', 'animated', 'motion', 'picture'],
    'Best Director – Motion Picture': ['best', 'director', 'motion', 'picture'],
    'Best Actor – Motion Picture Drama': ['best', 'drama', 'actor', 'motion', 'picture'],
    'Best Actor – Motion Picture Musical or Comedy': [
        'best', 'actor', 'musical', 'comedy', 'motion', 'picture',
    ],
    'Best Actress – Motion Picture Drama': ['best', 'actress', 'drama', 'motion', 'picture'],
    'Best Actress – Motion Picture Musical or Comedy': [
        'best', 'musical', 'comedy', 'actress', 'motion', 'picture',
    ],
    'Best Supporting Actor – Motion Picture': [
        'best', 'supporting', 'actor', 'motion', 'picture',
    ],
    'Best Supporting Actress – Motion Picture': [
        'best', 'supporting', 'actress', 'motion', 'picture',
    ],
    'Best Screenplay – Motion Picture': ['best', 'screenplay', 'motion', 'picture'],
    'Best Original Score – Motion Picture': ['best', 'original', 'score', 'motion', 'picture'],
    'Best Original Song – Motion Picture': ['best', 'original', 'song', 'motion', 'picture'],
    'Cecil B. DeMille Award for Lifetime Achievement in Motion Pictures': [
        'cecil', 'demille', 'lifetime', 'achievement',
    ],
    # --- Television awards ---
    'Best Television Series – Drama': ['best', 'television', 'tv', 'series', 'drama'],
    'Best Television Series – Musical or Comedy': [
        'best', 'television', 'tv', 'series', 'musical', 'comedy',
    ],
    'Best Miniseries or Television Film': ['best', 'television', 'tv', 'film', 'miniseries'],
    'Best Actor – Television Series Drama': ['best', 'actor', 'television', 'drama'],
    'Best Actor – Television Series Musical or Comedy': [
        'best', 'actor', 'television', 'tv', 'musical', 'comedy',
    ],
    'Best Actor – Miniseries or Television Film': [
        'best', 'actor', 'miniseries', 'television', 'film',
    ],
    'Best Actress – Television Series Drama': ['best', 'actress', 'television', 'tv', 'drama'],
    'Best Actress – Television Series Musical or Comedy': [
        'best', 'actress', 'television', 'tv', 'musical', 'comedy',
    ],
    'Best Actress – Miniseries or Television Film': [
        'best', 'actress', 'mini', 'miniseries', 'television', 'tv',
    ],
    'Best Supporting Actor – Series, Miniseries or Television Film': [
        'best', 'supporting', 'actor', 'series', 'mini', 'miniseries',
        'tv', 'television', 'film',
    ],
    'Best Supporting Actress – Series, Miniseries or Television Film': [
        'best', 'supporting', 'actress', 'series', 'mini', 'miniseries',
        'tv', 'television', 'film',
    ],
    'Carol Burnett Award for Achievement in Television': [
        'carol', 'burnett', 'television', 'tv',
    ],
}

In [120]:
def filter0(data, list1):
    """Return the tweets that contain every term in ``list1``.

    Matching is a substring test against the lowercased tweet, so a term
    containing uppercase characters can never match.

    Args:
        data: iterable of tweet strings.
        list1: terms that must all occur in a tweet for it to be kept. An
            empty list keeps every tweet.

    Returns:
        list of the matching tweets, unmodified and in their original order.
    """
    result = []

    for tweet in data:
        lowered = tweet.lower()  # lowercase once per tweet, not once per term
        if all(term in lowered for term in list1):
            result.append(tweet)

    return result

In [121]:
def filter1(data, list1):
    """Split tweets into those containing any term in ``list1`` and the rest.

    Matching is a substring test against the lowercased tweet, so terms must
    be given in lowercase to be able to match.

    Args:
        data: iterable of tweet strings.
        list1: terms of which at least one must occur for a match.

    Returns:
        ``(matching, non_matching)`` -- two lists that together hold every
        input tweet exactly once, each in original order.
    """
    result1 = []
    result2 = []

    for tweet in data:
        lowered = tweet.lower()  # lowercase once per tweet, not once per term
        if any(term in lowered for term in list1):
            result1.append(tweet)
        else:
            result2.append(tweet)

    return result1, result2

In [122]:
def filter2(data, list1, list2):
    """Sort tweets into two buckets by which term list they mention.

    A tweet goes to the first bucket if it contains any term of ``list1``;
    otherwise to the second bucket if it contains any term of ``list2``.
    Tweets matching neither list are dropped. Matching is a substring test
    against the lowercased tweet, so terms must be given in lowercase.

    Args:
        data: iterable of tweet strings.
        list1: terms selecting the first bucket (takes priority).
        list2: terms selecting the second bucket.

    Returns:
        ``(bucket1, bucket2)`` lists of tweets in original order.
    """
    result1 = []
    result2 = []

    for tweet in data:
        lowered = tweet.lower()  # lowercase once per tweet, not once per term
        if any(term in lowered for term in list1):
            result1.append(tweet)
        elif any(term in lowered for term in list2):
            result2.append(tweet)

    return result1, result2

In [123]:
def filter3(data, list1, list2, list3):
    """Sort tweets into three buckets by which term list they mention.

    The lists are tried in order: a tweet lands in the bucket of the first
    list for which it contains at least one term, and is dropped when it
    matches none. Matching is a substring test against the lowercased tweet,
    so terms must be given in lowercase.

    Args:
        data: iterable of tweet strings.
        list1: terms selecting the first bucket (highest priority).
        list2: terms selecting the second bucket.
        list3: terms selecting the third bucket (lowest priority).

    Returns:
        ``(bucket1, bucket2, bucket3)`` lists of tweets in original order.
    """
    result1 = []
    result2 = []
    result3 = []

    for tweet in data:
        lowered = tweet.lower()  # lowercase once per tweet, not once per term
        if any(term in lowered for term in list1):
            result1.append(tweet)
        elif any(term in lowered for term in list2):
            result2.append(tweet)
        elif any(term in lowered for term in list3):
            result3.append(tweet)

    return result1, result2, result3

In [155]:
def extractWinner(data, list1):
    """Return the most frequent capitalised token pair found in ``data``.

    Each tweet has digit runs, URLs and hashtags removed, is tokenized with
    NLTK's ``TweetTokenizer`` and filtered against a stop set made of
    punctuation characters, Golden Globes boilerplate terms and ``list1``.
    Adjacent pairs of the remaining tokens are counted when both tokens
    contain an uppercase letter followed by at least one lowercase letter.

    Args:
        data: iterable of tweet strings.
        list1: lowercase terms to discard from every tweet before pairing
            (tokens are lowercased for the comparison).

    Returns:
        The two tokens of the most common pair joined by a single space, or
        an empty string when no tweet yields a qualifying pair.
    """
    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

    # Stop words: punctuation, event boilerplate and the category's own keywords.
    # A set gives constant-time membership tests in the per-token filter below.
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt']
    stop = set(string.punctuation) | set(remove_terms) | set(list1)

    # A token "looks like a name part" if it contains Upper+lower somewhere.
    name_like = re.compile(r'[A-Z][a-z]+')

    bgrams = []
    for tweet in data:
        tweet = re.sub(r"\d+", "", tweet)  # strip nums
        tweet = re.sub(r'http\S+', '', tweet)  # strip urls
        tweet = re.sub(r'#\S+', '', tweet)  # strip hashtags
        tokens = [term for term in tt.tokenize(tweet) if term.lower() not in stop]

        # Pairs are formed after stop-word removal, so tokens that were
        # separated only by removed words count as adjacent.
        for bg in nltk.bigrams(tokens):
            if name_like.search(bg[0]) and name_like.search(bg[1]):
                bgrams.append(bg)

    top = nltk.FreqDist(bgrams).most_common(1)
    if not top:
        # No candidate pair at all (e.g. empty input): nothing to report.
        return ''

    return ' '.join(top[0][0])

In [156]:
def getActorWinners():
    """Extract a winner name for each of the 14 acting award categories.

    The global ``data`` tweets are narrowed layer by layer -- film vs. TV,
    acting mentions, actor vs. actress, supporting vs. lead, then genre -- and
    each final bucket is passed to ``extractWinner`` together with that
    category's keyword list from ``category_dict``.

    Returns:
        dict mapping category name to the name returned by ``extractWinner``.
    """
    acting_terms = ['actor', 'actress', 'actors', 'actresses']
    actor_terms = ['actor', 'actors']
    actress_terms = ['actress', 'actresses']
    supporting_terms = ['supporting']
    drama_terms = ['drama']
    comedy_terms = ['musical', 'comedy']
    # 'movie' cannot match on the TV side: layer 1 already routed every tweet
    # containing it to the film bucket.
    mini_film_terms = ['mini', 'miniseries', 'film', 'movie']

    # Layer 1: film vs. TV (film terms take priority when both occur).
    film, tv = filter2(data, ['motion picture', 'movie', 'movies'], ['tv', 'television'])

    # Layer 2: keep tweets that mention acting; the remainder is not used here.
    film_acting, _ = filter1(film, acting_terms)
    tv_acting, _ = filter1(tv, acting_terms)

    # Layer 3: actor vs. actress.
    film_actor, film_actress = filter2(film_acting, actor_terms, actress_terms)
    tv_actor, tv_actress = filter2(tv_acting, actor_terms, actress_terms)

    # Layer 4: supporting vs. lead.
    film_actor_supporting, film_actor_lead = filter1(film_actor, supporting_terms)
    film_actress_supporting, film_actress_lead = filter1(film_actress, supporting_terms)
    tv_actor_supporting, tv_actor_lead = filter1(tv_actor, supporting_terms)
    tv_actress_supporting, tv_actress_lead = filter1(tv_actress, supporting_terms)

    # Layer 5 (film leads): drama vs. musical/comedy.
    film_actor_drama, film_actor_comedy = filter2(film_actor_lead, drama_terms, comedy_terms)
    film_actress_drama, film_actress_comedy = filter2(film_actress_lead, drama_terms, comedy_terms)

    # Layer 5 (TV leads): drama vs. musical/comedy vs. miniseries/film.
    tv_actor_drama, tv_actor_comedy, tv_actor_mini = filter3(
        tv_actor_lead, drama_terms, comedy_terms, mini_film_terms)
    tv_actress_drama, tv_actress_comedy, tv_actress_mini = filter3(
        tv_actress_lead, drama_terms, comedy_terms, mini_film_terms)

    # Category name -> tweets judged to be about that category.
    buckets = {
        'Best Actor – Motion Picture Drama': film_actor_drama,
        'Best Actor – Motion Picture Musical or Comedy': film_actor_comedy,
        'Best Actress – Motion Picture Drama': film_actress_drama,
        'Best Actress – Motion Picture Musical or Comedy': film_actress_comedy,
        'Best Supporting Actor – Motion Picture': film_actor_supporting,
        'Best Supporting Actress – Motion Picture': film_actress_supporting,
        'Best Actor – Television Series Drama': tv_actor_drama,
        'Best Actor – Television Series Musical or Comedy': tv_actor_comedy,
        'Best Actor – Miniseries or Television Film': tv_actor_mini,
        'Best Actress – Television Series Drama': tv_actress_drama,
        'Best Actress – Television Series Musical or Comedy': tv_actress_comedy,
        'Best Actress – Miniseries or Television Film': tv_actress_mini,
        'Best Supporting Actor – Series, Miniseries or Television Film': tv_actor_supporting,
        'Best Supporting Actress – Series, Miniseries or Television Film': tv_actress_supporting,
    }

    return {
        category: extractWinner(tweets, category_dict[category])
        for category, tweets in buckets.items()
    }
    

In [157]:
# Run the acting-category pipeline; the returned dict is the cell's output.
getActorWinners()

{'Best Actor – Motion Picture Drama': 'Daniel Day-Lewis',
 'Best Actor – Motion Picture Musical or Comedy': 'Hugh Jackman',
 'Best Actress – Motion Picture Drama': 'Jessica Chastain',
 'Best Actress – Motion Picture Musical or Comedy': 'Jennifer Lawrence',
 'Best Supporting Actor – Motion Picture': 'Christoph Waltz',
 'Best Supporting Actress – Motion Picture': 'Anne Hathaway',
 'Best Actor – Television Series Drama': 'Damian Lewis',
 'Best Actor – Television Series Musical or Comedy': 'Don Cheadle',
 'Best Actor – Miniseries or Television Film': 'Kevin Costner',
 'Best Actress – Television Series Drama': 'Claire Danes',
 'Best Actress – Television Series Musical or Comedy': 'Lena Dunham',
 'Best Actress – Miniseries or Television Film': 'Julianne Moore',
 'Best Supporting Actor – Series, Miniseries or Television Film': 'Ed Harris',
 'Best Supporting Actress – Series, Miniseries or Television Film': 'Maggie Smith'}

In [165]:
def getBestNonActors():
    """Extract winner names for the director, screenplay, score and DeMille awards.

    For each category the global ``data`` tweets are narrowed with
    ``filter0`` (every listed term must occur in the tweet) and the result is
    passed to ``extractWinner`` together with that category's keyword list
    from ``category_dict``.

    Returns:
        dict mapping each of the four category names to the name returned by
        ``extractWinner``.
    """
    director = filter0(data, ['director'])
    screenplay = filter0(data, ['screenplay'])
    original_score = filter0(data, ['original', 'score'])
    cecil_demille = filter0(data, ['cecil', 'demille'])

    winners = {
        'Best Director – Motion Picture' : extractWinner(director, category_dict['Best Director – Motion Picture']),
        'Best Screenplay – Motion Picture' : extractWinner(screenplay, category_dict['Best Screenplay – Motion Picture']),
        'Best Original Score – Motion Picture' : extractWinner(original_score, category_dict['Best Original Score – Motion Picture']),
        'Cecil B. DeMille Award for Lifetime Achievement in Motion Pictures' : extractWinner(cecil_demille, category_dict['Cecil B. DeMille Award for Lifetime Achievement in Motion Pictures'])
    }

    return winners

In [166]:
# Run the non-acting categories; the returned dict is the cell's output.
getBestNonActors()

{'Best Director – Motion Picture': 'Ben Affleck',
 'Best Screenplay – Motion Picture': 'Quentin Tarantino',
 'Best Original Score – Motion Picture': 'Mychael Danna',
 'Cecil B. DeMille Award for Lifetime Achievement in Motion Pictures': 'Jodie Foster'}