In [205]:
import pandas as pd
import numpy as np
 
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import string
import math
import re
from collections import Counter 

In [206]:
data = pd.read_json('gg2013.json')
data = pd.DataFrame(data)

In [207]:
# return cleaned Tweet as string
# remove stopwords, user handles, punctuation, urls

def cleanTweets(tweet):
    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

    punctuation = list(string.punctuation)
    
    # strip stopwords, punctuation, url components 
    stop = stopwords.words('english') + punctuation + ['t.co', 'http', 'https', '...', '..', ':\\', 'RT', '#']

    strip_nums = re.sub("\d+", "", tweet)
    tokenized = tt.tokenize(strip_nums)
    terms_stop = [term for term in tokenized if term not in stop]
    cleaned = [term for term in terms_stop]
    cleaned = ' '.join(cleaned)
    
    return cleaned

In [208]:
def filter0(data):
    results = []
    
    for tweet in data['text']:
        if 'best' in tweet.lower():
            results.append(tweet)
            
    return results

In [209]:
f0 = data['text'].values.tolist()

In [210]:
def filter1(data, list1, list2):
    result1 = []
    result2 = []
    
    for tweet in data:
        if any(term in tweet.lower() for term in list1):
            result1.append(tweet)
        elif any(term in tweet.lower() for term in list2):
            result2.append(tweet)
            
    return result1, result2

In [211]:
movie = ['motion picture', 'movie', 'movies']
tv = ['tv', 'television']

f1_movie, f1_tv = filter1(f0, movie, tv)

In [212]:
def filter2(data, list1):
    result1 = []
    result2 = []
        
    for tweet in data:
        if any(term in tweet.lower() for term in list1):
            result1.append(tweet)
        else:
            result2.append(tweet)
            
    return result1, result2

In [213]:
actor = ['actor', 'actress', 'actors', 'actresses']
f2_movie_actor, f2_movie_media = filter2(f1_movie, actor)

In [214]:
f2_tv_actor, f2_tv_media = filter2(f1_tv, actor)

In [215]:
f3_tv_actor, f3_tv_actress = filter1(f2_tv_actor, ['actor', 'actors'], ['actress', 'actresses'])

In [216]:
f3_movie_actor, f3_movie_actress = filter1(f2_movie_actor, ['actor', 'actors'], ['actress', 'actresses'])

In [223]:
f4_lifetime_achievement = filter2(data, ['cecil', 'demille', 'lifetime', 'achievement'])[0]
f4_tv_drama = filter2(f2_tv_media, ['drama'])[0]
f4_tv_musical_comedy = filter2(f2_tv_media, ['musical', 'comedy'])[0]
f4_tv_miniseries_film = filter2(f2_tv_media, ['film', 'miniseries'])[0]
f4_tv_drama_actor = filter2(f3_tv_actor, ['drama'])[0]
f4_tv_musical_comedy_actor = filter2(f3_tv_actor, ['musical', 'comedy'])[0]
f4_tv_miniseries_film_actor = filter2(f3_tv_actor, ['miniseries', 'mini', 'film'])[0]
f4_tv_drama_actress = filter2(f3_tv_actress, ['drama'])[0]
f4_tv_musical_comedy_actress = filter2(f3_tv_actress, ['musical', 'comedy'])[0]
f4_tv_miniseries_film_actress = filter2(f3_tv_actress, ['miniseries', 'mini', 'film'])[0]
f4_tv_supporting_actor_series = filter2(f3_tv_actor, ['supporting', 'series', 'mini', 'miniseries', 'film'])[0]
f4_tv_supporting_actress_series = filter2(f3_tv_actress, ['supporting', 'series', 'mini', 'miniseries', 'film'])[0]
f4_achievement_in_tv = filter2(data, ['carol', 'burnett'])[0]

In [224]:
def extractWinner(data, list1):
    result = []
    
    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

    punctuation = list(string.punctuation)
    remove_terms = ['#GoldenGlobes', 'Golden Globes', 'golden globes', '#GoldenGlobe', 'Golden Globe', '#goldenglobes']    
    # strip stopwords, punctuation, url components 
    stop = stopwords.words('english') + punctuation + ['t.co', 'http', 'https', '...', '..', ':\\', 'rt', '#'] + remove_terms + list1

    for tweet in data:
        tweet = re.sub("\d+", "", tweet) #strip nums
        tweet = re.sub(r'http\S+', '', tweet)
        tweet = tt.tokenize(tweet) #tokenize
        tweet = [term for term in tweet if term.lower() not in stop] #remove stop words
        result.append(' '.join(tweet))
        
    bgrams = [];

    for tweet in result:
        tweet = re.findall('([A-Z][a-z]+)', tweet)
        if tweet:
            bgrams += list(nltk.bigrams(tweet))
        
    fdist = nltk.FreqDist(bgrams)
    
    return fdist

In [225]:
extractWinner(f4_lifetime_achievement, ['cecil', 'demille', 'lifetime', 'achievement']).most_common(10)

[]

In [226]:
extractWinner(f4_tv_drama, ['television', 'tv', 'series', 'drama']).most_common(10)

[]

In [227]:
extractWinner(f4_tv_musical_comedy, ['television', 'tv', 'series', 'musical', 'comedy']).most_common(10)

[]

In [222]:
extractWinner(f4_tv_miniseries_film, ['television', 'tv', 'film', 'miniseries']).most_common(10)

[]