In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import string
import math
import re
import datetime
import time
from collections import Counter
import statistics as st

In [None]:
# Load the tweet records from 'gg2013.json' (path is relative to the working directory).
data = pd.read_json('gg2013.json')

In [None]:
# Re-wrap the loaded object as a DataFrame.
# NOTE(review): pd.read_json returns a DataFrame by default, so this is presumably
# a no-op - confirm before removing.
data = pd.DataFrame(data)

In [None]:
# Preview the first five rows of the loaded frame as plain text.
print(data.head(5))

                   id                                               text  \
0  290620657987887104             JLo's dress! #eredcarpet #GoldenGlobes   
1  290620657887219713  What's making Sofia Vergara's boobs stay like ...   
2  290620657828524032  RT @FabSugar: Kerry Washington is EVERYTHING. ...   
3  290620657799159809     Anne Hathaway has got me living. #GoldenGlobes   
4  290620657778188288  Jennifer Lopez's lace dress? Thoughts? #Golden...   

         timestamp_ms                                               user  
0 2013-01-14 00:45:38      {'screen_name': 'Dozaaa_xo', 'id': 557374298}  
1 2013-01-14 00:45:38    {'screen_name': 'theAmberShow', 'id': 14648726}  
2 2013-01-14 00:45:38        {'screen_name': 'SweetyPW', 'id': 35498686}  
3 2013-01-14 00:45:38  {'screen_name': '_NicoleEdwards', 'id': 144430...  
4 2013-01-14 00:45:38  {'screen_name': 'lolaogunnaike', 'id': 134953223}  


In [None]:
def timestampToUnix(t):
    """Convert a datetime-like object to seconds since the epoch.

    The value's ``timetuple()`` is interpreted by ``time.mktime``, i.e. as
    local time, and the result is returned as a float.
    """
    local_struct = t.timetuple()
    return time.mktime(local_struct)

In [None]:
# Load the stop-word list, one word per line. The context manager closes the
# file handle as soon as the list has been read.
# NOTE: this rebinds `stopwords`, shadowing the nltk.corpus import of the same name.
with open('english.txt', 'r') as f:
    stopwords = f.read().splitlines()

In [None]:
# Lowercase award names; the order here is the order in which awards are processed.
OFFICIAL_AWARDS_1315 = [
    'cecil b. demille award',
    # motion picture awards
    'best motion picture - drama',
    'best performance by an actress in a motion picture - drama',
    'best performance by an actor in a motion picture - drama',
    'best motion picture - comedy or musical',
    'best performance by an actress in a motion picture - comedy or musical',
    'best performance by an actor in a motion picture - comedy or musical',
    'best animated feature film',
    'best foreign language film',
    'best performance by an actress in a supporting role in a motion picture',
    'best performance by an actor in a supporting role in a motion picture',
    'best director - motion picture',
    'best screenplay - motion picture',
    'best original score - motion picture',
    'best original song - motion picture',
    # television awards
    'best television series - drama',
    'best performance by an actress in a television series - drama',
    'best performance by an actor in a television series - drama',
    'best television series - comedy or musical',
    'best performance by an actress in a television series - comedy or musical',
    'best performance by an actor in a television series - comedy or musical',
    'best mini-series or motion picture made for television',
    'best performance by an actress in a mini-series or motion picture made for television',
    'best performance by an actor in a mini-series or motion picture made for television',
    'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
    'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
]

In [None]:
def parseAward(award, stop_terms=None):
    """Split an award name into its significant terms.

    The name is split on runs of non-word characters; empty fragments (produced
    when the name starts or ends with punctuation) and stop words are dropped.

    Args:
        award: award name string.
        stop_terms: optional iterable of words to discard. When omitted, the
            module-level ``stopwords`` list is used.

    Returns:
        List of remaining terms, in their original order.
    """
    if stop_terms is None:
        stop_terms = stopwords
    excluded = set(stop_terms)  # constant-time membership checks
    return [
        term
        for term in re.split(r'\W+', award)
        if term and term not in excluded
    ]

In [None]:
def getAwardCategories():
    """Build the award -> search-terms lookup.

    Returns:
        dict mapping every name in OFFICIAL_AWARDS_1315 to the term list that
        parseAward produces for it.
    """
    return {award: parseAward(award) for award in OFFICIAL_AWARDS_1315}

In [None]:
def getTweetsForAward(award):
    """Select the tweets whose text mentions every term of an award name.

    The search terms are derived from the award name itself via parseAward, so
    the function does not depend on any lookup table being defined beforehand.
    Matching is a case-insensitive substring test against the 'text' column of
    the module-level ``data`` frame, which is left unmodified.

    Args:
        award: award name string.

    Returns:
        A new DataFrame (copy) holding the matching rows of ``data``.
    """
    terms = parseAward(award)
    # One pass over the text column; the mask is rebuilt on every call so that
    # matches from a previous award cannot leak into this result.
    matches = [
        all(term in text.lower() for term in terms)
        for text in data['text']
    ]
    return data.loc[matches].copy()

In [None]:
def getTweetsInTimeRange(df):
    """Return all tweets posted within one standard deviation of df's mean time.

    The window is computed from the 'timestamp_ms' values of ``df`` but is
    applied to the module-level ``data`` frame, so the result can contain
    tweets that are not in ``df``.

    Args:
        df: frame with a 'timestamp_ms' column of datetime-like values. It is
            not modified.

    Returns:
        The rows of ``data`` whose 'timestamp_ms' lies in
        [mean - stdev, mean + stdev] (bounds truncated to whole seconds).
        An empty frame when ``df`` has no rows.
    """
    # Keep the converted times in a local list instead of writing a column
    # onto the caller's frame.
    unix_times = [timestampToUnix(stamp) for stamp in df['timestamp_ms']]
    if not unix_times:
        # Nothing to anchor a window on; st.mean would raise here.
        return data.iloc[0:0]

    mean = st.mean(unix_times)
    # st.stdev needs at least two samples; a single tweet gives a zero-width window.
    stdev = st.stdev(unix_times) if len(unix_times) > 1 else 0.0
    starttime = mean - (stdev*1)
    endtime = mean + (stdev*1)

    # Convert the bounds back to local-time strings, mirroring the local-time
    # interpretation used by timestampToUnix.
    time_format = '%Y-%m-%d %H:%M:%S'
    starttime = datetime.datetime.fromtimestamp(starttime).strftime(time_format)
    endtime = datetime.datetime.fromtimestamp(endtime).strftime(time_format)

    mask = (data['timestamp_ms'] >= starttime) & (data['timestamp_ms'] <= endtime)
    return data.loc[mask]

In [None]:
def filter0(data, list1):
    """Collect the tweet texts that contain every term in ``list1``.

    Each entry of ``data['text']`` is lowercased before the substring test;
    the terms themselves are used as given. Returns a list of the original
    (not lowercased) texts, in input order.
    """
    return [
        text
        for text in data['text']
        if all(needle in text.lower() for needle in list1)
    ]

In [None]:
def extractPresenters(data, list1):
    """Rank the capitalised word pairs that occur in a collection of tweets.

    Every tweet is stripped of digits, URLs, hashtags and ASCII punctuation,
    split on whitespace and filtered against a stop list. Adjacent token pairs
    in which both tokens begin with an uppercase letter followed by a lowercase
    letter are then counted across all tweets.

    Args:
        data: iterable of tweet strings.
        list1: extra stop terms. They are compared case-insensitively, since
            tokens are lowercased before the stop-list check.

    Returns:
        Up to five ``((first, second), count)`` tuples, most frequent first;
        an empty list when no pair qualifies.
    """
    translator = str.maketrans('', '', string.punctuation)
    capitalised = re.compile(r'[A-Z][a-z]+')

    # Multi-word and '#'-prefixed entries cannot match a single cleaned token;
    # they are kept so the list stays a complete record of the excluded phrases.
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes']
    stop = set(remove_terms) | {term.lower() for term in list1}

    pair_counts = Counter()
    for tweet in data:
        tweet = re.sub(r'\d+', '', tweet)      # strip digits
        tweet = re.sub(r'http\S+', '', tweet)  # strip urls
        tweet = re.sub(r'#\S+', '', tweet)     # strip hashtags
        tweet = tweet.translate(translator)    # strip punctuation
        tokens = [term for term in tweet.split() if term.lower() not in stop]

        # Adjacent pairs; zip yields nothing for tweets with fewer than two tokens.
        for first, second in zip(tokens, tokens[1:]):
            if capitalised.match(first) and capitalised.match(second):
                pair_counts[(first, second)] += 1

    return pair_counts.most_common(5)

In [None]:
def getPresenters():
    """Print the top presenter-name candidates for every official award.

    For each award in OFFICIAL_AWARDS_1315 the pipeline is:
    getTweetsForAward -> getTweetsInTimeRange -> filter0 (tweets containing
    'present') -> extractPresenters, with the award's own terms used as stop
    words. One line per award is printed; nothing is returned.
    """
    # Publish the lookup at module level: helper functions may read
    # `category_dict` as a global, which fails on a fresh kernel if it only
    # exists as a local of this function.
    global category_dict
    category_dict = getAwardCategories()

    for award in OFFICIAL_AWARDS_1315:
        award_tweets = getTweetsForAward(award)
        window_tweets = getTweetsInTimeRange(award_tweets)
        presenting_tweets = filter0(window_tweets, ['present'])
        presenters = extractPresenters(presenting_tweets, category_dict[award])
        print(award, presenters)

In [None]:
# Run the presenter extraction; one line is printed for each award.
getPresenters()