In [16]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import string
import math
import re
import datetime
import time
from collections import Counter
import statistics as st

In [17]:
df = pd.read_json('../gg2015.json')

In [18]:
df.columns

Index(['id', 'text', 'timestamp_ms', 'user'], dtype='object')

In [19]:
df = df.drop(columns=['id', 'user'])

In [20]:
data = df.values.tolist()

In [21]:
data[:5]

[['just had to scramble to find a golden globes stream for my brother. :D',
  Timestamp('2015-01-11 22:20:13.011000')],
 ["RT @ENews: Show us how you're watching the #GoldenGlobes -- tweet us a pic of your set up, we'll RT our faves! #ERedCarpet",
  Timestamp('2015-01-11 22:25:13.824000')],
 ['@danaKStew @50ShadesWorldcm @ScarletteDrake Also Red Carpet um 12 &amp; die Show vill. um 1?!',
  Timestamp('2015-01-11 22:25:13.869000')],
 ['RT @lisarinna: When your husband tells you that you Are going to the #GoldenGlobes parties like 5 minutes before you go.......\nYou just gra…',
  Timestamp('2015-01-11 22:25:13.928000')],
 ['“@goldenglobes: Creating multiple mini Moët Moments on the @GoldenGlobes red carpet… http://t.co/vaLDYqbuD1\n#MoetMoment” May I have one plz?',
  Timestamp('2015-01-11 22:25:14.067000')]]

In [22]:
def timestampToUnix(t):
    return time.mktime(t.timetuple())

In [23]:
f = open('english.txt', 'r')
stop_words = f.read().splitlines()
#print(stopwords)

In [24]:
OFFICIAL_AWARDS_1315 = ['cecil b. demille award', 
                        'best motion picture - drama', 
                        'best performance by an actress in a motion picture - drama', 
                        'best performance by an actor in a motion picture - drama', 
                        'best motion picture - comedy or musical', 
                        'best performance by an actress in a motion picture - comedy or musical', 
                        'best performance by an actor in a motion picture - comedy or musical', 
                        'best animated feature film', 'best foreign language film', 
                        'best performance by an actress in a supporting role in a motion picture', 
                        'best performance by an actor in a supporting role in a motion picture', 
                        'best director - motion picture', 'best screenplay - motion picture', 
                        'best original score - motion picture', 
                        'best original song - motion picture', 
                        'best television series - drama', 
                        'best performance by an actress in a television series - drama', 
                        'best performance by an actor in a television series - drama', 
                        'best television series - comedy or musical', 
                        'best performance by an actress in a television series - comedy or musical', 
                        'best performance by an actor in a television series - comedy or musical', 
                        'best mini-series or motion picture made for television', 
                        'best performance by an actress in a mini-series or motion picture made for television', 
                        'best performance by an actor in a mini-series or motion picture made for television', 
                        'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 
                        'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']

In [25]:
def parseAward(award):
    award = re.split('\W+', award)
    award = [i for i in award if i not in stop_words]
    return award

In [26]:
def getAwardCategories():
    category_dict = dict()
    for a in OFFICIAL_AWARDS_1315:
        terms = parseAward(a)
        category_dict[a] = terms
        
    return category_dict

In [51]:
def getTweetsForAward(data, list1):
    synonyms = {
        'motion' : ['motion picture', 'motion', 'picture', 'movie'],
        'picture' : ['motion picture', 'motion', 'picture', 'movie'],
        'television' : ['television', 'tv'],
        'mini' : ['mini-series', 'mini', 'series', 'miniseries'],
        'series' : ['mini-series', 'mini', 'series', 'miniseries']
    }
    
    #result = pd.DataFrame(columns={'text', 'timestamp_ms'})
    time = []
    #count = 0
    
    list1 = [i for i in list1 if i != 'performance' and i != 'role']

    for tweet in df['text']:
        cond = True
        for i in list1:
            if i in synonyms:
                if all(j not in tweet.lower() for j in synonyms[i]):
                    cond = False
            elif i not in tweet.lower():
                cond = False
        if cond:
            #result = result.append(data.loc[data.text == tweet, ['text', 'timestamp_ms']])
            time = time.append(timestampToUnix(data['timestamp_ms'][data.text == tweet]))
            #count += 1
            
    #df = pd.concat(result)

    return time

In [43]:
def getTweetsInTimeRange(time):
    #df['unixtime'] = df['timestamp_ms'].apply(lambda x:  timestampToUnix(x))
    
    
    mean = st.mean(time)
    stdev = st.stdev(time)
    starttime = mean - (stdev*1)
    endtime = mean + (stdev*1)
    
    starttime = datetime.datetime.fromtimestamp(starttime).strftime('%Y-%m-%d %H:%M:%S')
    endtime = datetime.datetime.fromtimestamp(endtime).strftime('%Y-%m-%d %H:%M:%S')
    
    mask = (df['timestamp_ms'] >= starttime) & (df['timestamp_ms'] <= endtime)
    df = df.loc[mask]
    print(len(df))
    return df['text'].values.tolist()

In [29]:
def getTweetsInTimeRange2(df):
    starttime = df['timestamp_ms'].min()
    endtime = df['timestamp_ms'].max()
    mask = (data['timestamp_ms'] >= starttime) & (data['timestamp_ms'] <= endtime)
    df = data.loc[mask]
    print(len(df))
    return df

In [30]:
def filter0(data, word):
    result = []
    for tweet in data['text']:
        if word in tweet:
            result.append(tweet)

    return result

In [31]:
def extractPresenters(data, list1):
    #print(data[0])
    result = []
    
    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)
    translator = str.maketrans('', '', string.punctuation)

    # strip stopwords, punctuation 
    #punctuation = list(string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes']    
    stop = remove_terms + list1
    
    for tweet in data:
        #print(tweet)
        tweet = re.sub("\d+", "", tweet) #strip nums
        tweet = re.sub(r'http\S+', '', tweet) #strip urls
        tweet = re.sub(r'#\S+', '', tweet) #strip hashtags
        #tweet = re.sub(r'[^\w\s]', '', tweet) #strip non-alphanumeric characters
        tweet = tweet.translate(translator)
        tweet = tweet.split() #tokenize
        tweet = [term for term in tweet if term.lower() not in stop_words] #remove stop words
        for i in stop:
            for j in tweet:
                if i.lower() in j.lower():
                    tweet.remove(j)
        result.append(tweet)
        
    #print(result[:20])
        
    grams = [];

    for tweet in result:
        if tweet:
            gram = list(nltk.everygrams(tweet, 2, 3))
            #print(bigram[:10])
            for g in gram:
                if len(g) == 2:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])):
                        grams.append(g)
                else:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[2])):
                        grams.append(g)
  
    fdist = nltk.FreqDist(grams)
    #print(fdist)
    
    try:
        temp = fdist.most_common(1)[0][0]
        name = ' '.join(temp)
    except:
        name = "nothing here"
    
    return fdist.most_common()

In [52]:
category_dict = getAwardCategories()

for award in OFFICIAL_AWARDS_1315:    
    time = getTweetsForAward(df, category_dict[award])
    tweets = getTweetsInTimeRange(time)
    presenters = extractPresenters(tweets, category_dict[award])

AttributeError: 'list' object has no attribute 'mktime'

In [None]:
for award in OFFICIAL_AWARDS_1315:
    print(tweets[award][:5])

In [None]:
def getPresenters(tweets, category_dict):
        #df = getTweetsInTimeRange(df)
        #print(df[:5], len(df))
        #stopwords = category_dict[award] + ['Jessica', 'Chastain', 'Marion', 'Cotillard', 'Helen', 'Mirren', 'Naomi', 'Watts', 'Rachel', 'Weisz']
        presenters = {}
        for award in OFFICIAL_AWARDS_1315:
            results = filter0(tweets[award], 'present')
        #print(results[:5])
            presenters[award] = extractPresenters(results, category_dict[award])
        #print(award, presenters)
        return presenters

In [None]:
getPresenters(tweets, category_dict)