In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import string
import math
import re
import datetime
import time
from collections import Counter
import statistics as st

In [2]:
data = pd.read_json('gg2013.json')

In [3]:
data = pd.DataFrame(data)

In [22]:
print(len(data))

174643


In [35]:
print(data[:1])

                   id                                    text  \
0  290620657987887104  JLo's dress! #eredcarpet #GoldenGlobes   

         timestamp_ms                                           user contains  
0 2013-01-14 00:45:38  {'screen_name': 'Dozaaa_xo', 'id': 557374298}    False  


In [5]:
def timestampToUnix(t):
    return time.mktime(t.timetuple())

In [6]:
f = open('english.txt', 'r')
stopwords = f.read().splitlines()
#print(stopwords)

In [7]:
OFFICIAL_AWARDS_1315 = ['cecil b. demille award', 
                        'best motion picture - drama', 
                        'best performance by an actress in a motion picture - drama', 
                        'best performance by an actor in a motion picture - drama', 
                        'best motion picture - comedy or musical', 
                        'best performance by an actress in a motion picture - comedy or musical', 
                        'best performance by an actor in a motion picture - comedy or musical', 
                        'best animated feature film', 'best foreign language film', 
                        'best performance by an actress in a supporting role in a motion picture', 
                        'best performance by an actor in a supporting role in a motion picture', 
                        'best director - motion picture', 'best screenplay - motion picture', 
                        'best original score - motion picture', 'best original song - motion picture', 
                        'best television series - drama', 
                        'best performance by an actress in a television series - drama', 
                        'best performance by an actor in a television series - drama', 
                        'best television series - comedy or musical', 
                        'best performance by an actress in a television series - comedy or musical', 
                        'best performance by an actor in a television series - comedy or musical', 
                        'best mini-series or motion picture made for television', 
                        'best performance by an actress in a mini-series or motion picture made for television', 
                        'best performance by an actor in a mini-series or motion picture made for television', 
                        'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 
                        'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']

In [8]:
def parseAward(award):
    award = re.split('\W+', award)
    award = [i for i in award if i not in stopwords]
    return award

In [9]:
def getAwardCategories():
    category_dict = dict()
    terms_counter = Counter()
    for a in OFFICIAL_AWARDS_1315:
        terms = parseAward(a)
        for t in terms:
            terms_counter[t] += 1
        category_dict[a] = terms
        
    return category_dict

In [38]:
def getTweetsForAward(category_dict, award):
    #df = pd.DataFrame(columns=['id', 'text', 'timestamp_ms', 'user'])
    l = []
    for tweet in data['text']:
        if all(term in tweet.lower() for term in category_dict[award]):
            l.append(data.loc[data.text == tweet])
    df = pd.concat(l)
    return df

In [23]:
def getTweetsInTimeRange(df):
    df['unixtime'] = df['timestamp_ms'].apply(lambda x:  timestampToUnix(x))
    
    mean = st.mean(df['unixtime'].values.tolist())
    stdev = st.stdev(df['unixtime'].values.tolist())
    starttime = mean - (stdev*1)
    endtime = mean + (stdev*1)
    
    starttime = datetime.datetime.fromtimestamp(starttime).strftime('%Y-%m-%d %H:%M:%S')
    endtime = datetime.datetime.fromtimestamp(endtime).strftime('%Y-%m-%d %H:%M:%S')
    
    mask = (data['timestamp_ms'] >= starttime) & (data['timestamp_ms'] <= endtime)
    df = data.loc[mask]
    print(len(df))
    return df

In [25]:
def getTweetsInTimeRange2(df):
    starttime = df['timestamp_ms'].min()
    endtime = df['timestamp_ms'].max()
    mask = (data['timestamp_ms'] >= starttime) & (data['timestamp_ms'] <= endtime)
    df = data.loc[mask]
    print(len(df))
    return df

In [12]:
def filter0(data, list1):
    result = []

    for tweet in data['text']:
        if all(term in tweet.lower() for term in list1):
            result.append(tweet)

    return result

In [13]:
def extractPresenters(data, list1):
    #print(data[0])
    result = []
    
    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)
    translator = str.maketrans('', '', string.punctuation)

    # strip stopwords, punctuation 
    #punctuation = list(string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes']    
    stop = remove_terms + list1
    
    for tweet in data:
        #print(tweet)
        tweet = re.sub("\d+", "", tweet) #strip nums
        tweet = re.sub(r'http\S+', '', tweet) #strip urls
        tweet = re.sub(r'#\S+', '', tweet) #strip hashtags
        #tweet = re.sub(r'[^\w\s]', '', tweet) #strip non-alphanumeric characters
        tweet = tweet.translate(translator)
        tweet = tweet.split() #tokenize
        tweet = [term for term in tweet if term.lower() not in stop] #remove stop words
        result.append(tweet)
        
    #print(result[:20])
        
    bgrams = [];

    for tweet in result:
        if tweet:
            bigram = list(nltk.bigrams(tweet))
            #print(bigram[:10])
            for bg in bigram:
                if bool(re.match('([A-Z][a-z]+)', bg[0])) and bool(re.match('([A-Z][a-z]+)', bg[1])):
                    #print(bg[0], bg[1])
                    bgrams.append(bg)
            #tweet = re.findall('([A-Z][a-z]+)', tweet)
            #if tweet:
                #bgrams += list(nltk.bigrams(tweet))
  
    fdist = nltk.FreqDist(bgrams)
    #print(fdist)
    
    try:
        temp = fdist.most_common(1)[0][0]
        name = ' '.join(temp)
    except:
        name = "nothing here"
    
    return fdist.most_common(5)

In [40]:
def getPresenters():
    category_dict = getAwardCategories()
    
    for award in OFFICIAL_AWARDS_1315:
        print(award)
        df = getTweetsForAward(category_dict, award)
        print(df[:5])
        df = getTweetsInTimeRange(df)
        print(df[:5])
        stopwords = category_dict[award] + ['Jessica', 'Chastain', 'Marion', 'Cotillard', 'Helen', 'Mirren', 'Naomi', 'Watts', 'Rachel', 'Weisz']
        results = filter0(df, ['present'])
        presenters = extractPresenters(results, category_dict[award])
        print(award, presenters)

In [41]:
getPresenters()

cecil b. demille award
                       id                                               text  \
37780  290633958348165120  "Kristen Stewart e Simon Baker para anunciar o...   
39286  290634456509861888  RT @TwiViciados: "Kristen Stewart e Simon Bake...   
39708  290634618313523200  RT @HollywoodLegacy: Golden Globe History: In ...   
42698  290635743464931329  RT @HollywoodLegacy: Golden Globe History: In ...   
43172  290635944678277120  RT @HollywoodLegacy: Golden Globe History: In ...   

             timestamp_ms                                               user  \
37780 2013-01-14 01:38:30    {'screen_name': 'TwiViciados', 'id': 179564865}   
39286 2013-01-14 01:40:28  {'screen_name': 'lesbianforstew_', 'id': 25471...   
39708 2013-01-14 01:41:07         {'screen_name': 'rsethib', 'id': 22226042}   
42698 2013-01-14 01:45:35      {'screen_name': 'TCM_Party', 'id': 401089025}   
43172 2013-01-14 01:46:23  {'screen_name': 'JeffreyLuscombe', 'id': 17204...   

      contains 

                       id                                               text  \
27067  290630489134481409  It's weird that they put comedy and musical in...   
28134  290630758035501056  A nominee for #BestMotionPicture for A Comedy ...   
28795  290631125905330176  Let's hope the Hollywood Foreign Press made th...   
63576  290640048272072704  Best Actress in a Motion Picture Comedy/Musica...   
64868  290640382453235712  Jennifer Lawrence better win for this! #Golden...   

             timestamp_ms                                               user  \
27067 2013-01-14 01:24:42    {'screen_name': 'thelilyriver', 'id': 83414986}   
28134 2013-01-14 01:25:47  {'screen_name': 'ChasingCinema', 'id': 525493157}   
28795 2013-01-14 01:27:14   {'screen_name': 'TylerHoltzman', 'id': 15073512}   
63576 2013-01-14 02:02:41           {'screen_name': 's3nsi', 'id': 73888208}   
64868 2013-01-14 02:04:01   {'screen_name': 'holly_caitlin', 'id': 21166924}   

      contains  
27067     True  
2813

                       id                                               text  \
44236  290636280784629761  RT @AmourFilm: Keep your fingers crossed for #...   
94530  290648111666323456  Sylvester Stallone and @Schwarzenegger present...   
94669  290648176778682368  Best Foreign Language Film: Amour (Austria) #G...   
94696  290648175931432960  Yep, AMOUR wins Best Foreign Language Film #Go...   
94731  290648174094327808  Best foreign language film goes to Amour! #Gol...   

             timestamp_ms                                              user  \
44236 2013-01-14 01:47:43  {'screen_name': 'pwoblematique', 'id': 19537857}   
94530 2013-01-14 02:34:44     {'screen_name': 'samtraspe', 'id': 135435635}   
94669 2013-01-14 02:34:59      {'screen_name': 'yontefian', 'id': 20027344}   
94696 2013-01-14 02:34:59   {'screen_name': 'TheFilmChair', 'id': 82661658}   
94731 2013-01-14 02:34:59    {'screen_name': 'MelindaJane', 'id': 18031568}   

      contains  
44236     True  
94530     

                       id                                               text  \
85460  290645304871628800  Robert Pattinson and Amanda Seyfried present t...   
85464  290645304716455936  Robert Pattinson alert! Presenting Best Screen...   
85510  290645357921193984  Best Screenplay - Motion Picture: Quentin Tara...   
85842  290645456877391873  RT @nbc: Django Unchained takes home the award...   
86066  290645495867650049  RT @nbc: Django Unchained takes home the award...   

             timestamp_ms                                               user  \
85460 2013-01-14 02:23:35      {'screen_name': 'samtraspe', 'id': 135435635}   
85464 2013-01-14 02:23:35      {'screen_name': 'OKMagazine', 'id': 14353891}   
85510 2013-01-14 02:23:47       {'screen_name': 'MarlowNYC', 'id': 24544452}   
85842 2013-01-14 02:24:11  {'screen_name': 'SincerelyBhop', 'id': 189935217}   
86066 2013-01-14 02:24:20     {'screen_name': 'kimilee116', 'id': 426973684}   

      contains  
85460     True  
8546

best television series - drama [(('Bill', 'Clinton'), 210), (('Kristen', 'Wiig'), 64), (('Will', 'Ferrell'), 57), (('Robert', 'Pattinson'), 36), (('Steven', 'Spielberg'), 25)]
best performance by an actress in a television series - drama
                       id                                               text  \
97163  290648825327144960  Claire Danes - Best Performance by an Actress ...   
97244  290648848722952192  Claire Danes of "Homeland" wins Best Performan...   

             timestamp_ms                                               user  \
97163 2013-01-14 02:37:34  {'screen_name': 'golden_globes', 'id': 105991003}   
97244 2013-01-14 02:37:40  {'screen_name': 'OnTheRedCarpet', 'id': 17497090}   

      contains  
97163     True  
97244     True  
197
                       id                                               text  \
97048  290648818570108928  RT @thecollegecrush: Lea Michele is soooo tan....   
97049  290648818561716224  RT @goldenglobes: Best Foreign Film - 

ValueError: No objects to concatenate