# Preprocessing Start

In [163]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('wordnet')

from collections import Counter
from collections import OrderedDict

import string
import re
import unidecode
import requests
import json
from pprint import pprint

[nltk_data] Downloading package wordnet to /Users/yqiao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [164]:
# Load in official award names

OFFICIAL_AWARDS_1315_media = ['best motion picture - drama', 'best motion picture - comedy or musical', 'best animated feature film', 'best foreign language film', 'best screenplay - motion picture', 'best original score - motion picture', 'best original song - motion picture', 'best television series - drama', 'best television series - comedy or musical', 'best mini-series or motion picture made for television']



In [165]:
# Load in answers - we will remove this when we submit the final fial

with open('gg2015answers.json') as f:
    answers = json.load(f)
    
true_dict = answers['award_data'] # use this for true dict in the below function

In [166]:
# Loadin dataset - TO DO, automate this processes for four relevant years (2013,2015,2018,2019)

df = pd.read_json('../gg2015.json')
df = pd.DataFrame(df)

In [167]:
# Write dataset to a list

data = df['text'].values.tolist()

In [168]:
# Initialize seperate knowledge bases 

people = set()
media = set() 

In [169]:
url = 'https://query.wikidata.org/sparql'

In [170]:
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q2526255; #uncomment for     FILM director (no award for TV director)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb = r.json()
for item in kb['results']['bindings']:
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [171]:
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q10800557; #uncomment for    FILM actor (don't just use actor)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r1 = requests.get(url, params = {'format': 'json', 'query': query})
kb1 = r1.json()
for item in kb1['results']['bindings']:
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [172]:
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q10798782; #uncomment for    TV actor (don't just use actor)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb = r.json()
for item in kb['results']['bindings']:
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [173]:
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q36834; #uncomment for       composer (cannot use songwriter)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb = r.json()
for item in kb['results']['bindings']:
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [174]:
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q28389; #uncomment for       screenwriter
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb = r.json()
for item in kb['results']['bindings']:
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [175]:
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
          wdt:P106/wdt:P279* wd:Q177220; #uncomment for       singer
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb = r.json()
for item in kb['results']['bindings']:
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [176]:
## Media KB Creation

In [177]:
url = 'https://query.wikidata.org/sparql'
query = """
SELECT DISTINCT ?itemLabel  WHERE {
 ?item wdt:P31 wd:Q11424. ?item wdt:P577 ?_publication_date. ?item wdt:P136 ?_genre.
 ?_genre rdfs:label ?_genreLabel. BIND(str(YEAR(?_publication_date)) AS ?year)
 FILTER((LANG(?_genreLabel)) = "en")
 FILTER (?_publication_date >= "2012-00-00T00:00:00Z"^^xsd:dateTime && ?_publication_date <= "2019-00-00T00:00:00Z"^^xsd:dateTime )
 SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .} }
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb_m = r.json()

for item in kb_m['results']['bindings']:
    media.add(unidecode.unidecode(item['itemLabel']['value']))

In [178]:
query = """
SELECT DISTINCT ?itemLabel  WHERE {
  ?item wdt:P31 wd:Q5398426.
  ?item wdt:P580  ?_start
 FILTER (?_start >= "2005-00-00T00:00:00Z"^^xsd:dateTime && ?_start <= "2019-00-00T00:00:00Z"^^xsd:dateTime )
  SERVICE wikibase:label {bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .}
}
    
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
kb_m = r.json()

for item in kb_m['results']['bindings']:
    media.add(unidecode.unidecode(item['itemLabel']['value']))

In [179]:
f = open('english.txt', 'r')
stop_words = f.read().splitlines()

In [180]:
OFFICIAL_AWARDS_1315_people = ['cecil b. demille award',
                        'best performance by an actress in a motion picture - drama', 
                        'best performance by an actor in a motion picture - drama',
                        'best performance by an actress in a motion picture - comedy or musical', 
                        'best performance by an actor in a motion picture - comedy or musical',
                        'best performance by an actress in a supporting role in a motion picture', 
                        'best performance by an actor in a supporting role in a motion picture', 
                        'best director - motion picture', 
                        'best screenplay - motion picture', 
                        'best original score - motion picture',
                        'best performance by an actress in a television series - drama', 
                        'best performance by an actor in a television series - drama',
                        'best performance by an actress in a television series - comedy or musical', 
                        'best performance by an actor in a television series - comedy or musical', 
                        'best performance by an actress in a mini-series or motion picture made for television', 
                        'best performance by an actor in a mini-series or motion picture made for television', 
                        'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 
                        'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']

In [181]:
people = set(map(lambda x: x.lower(),people))
media = set(map(lambda x: x.lower(),media)) 


# Preprocessing Complete


In [182]:
def clean_awards(text):
    " Cleans individual tweet for award search"
    
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes']
    
    text = re.sub("(\s)#\w+","",text)    # strips away all hashtags 
    text = re.sub("RT","",text)          # removes retweet
    text = re.sub("[^a-zA-Z ]", '',text) # removes all punctuation but keeps whitespace for tokenization
    text = text.lower()
    text = text.split()
    text = " ".join([term for term in text if term not in remove_terms]) #remove stop words
    
    return text 

In [183]:
def find_tags(tweet):
    """
    Performs pos tagging at a tweet level
    """
    
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)
    tags = nltk.pos_tag(tokens)
    return tags

In [184]:
def pos_search(tags,chunk_gram,label):
    ""
    
    
    potentials = ""
    chunk_parser = nltk.RegexpParser(chunk_gram)
    chunked = chunk_parser.parse(tags)
    for subtree in chunked.subtrees():
        if subtree.label() == label: 
            raw_list = untag(subtree)
            raw_list = [i for i in raw_list if wordnet.synsets(i)]
            string = ' '.join(raw_list)
            if "best" in string[0:6]:
                if len(string) >= len(potentials):
                    potentials = string
                    
    if potentials == "":
        return "No Chunk"

    return potentials

In [185]:
def filter_df(df,label):
    
    data = df.loc[df[label] != "No Chunk"]
    data.drop(data.columns.difference([label]), 1, inplace=True)
    single_list = list(data[label])
    freq = FreqDist(single_list)
    
    return data, freq

In [186]:
def find_awards(df):
    """
    Returns a list of strings for all possible awards
    """
    # Shuffle data if necesarry
    sample_size = 200000
    if len(df['text']) > sample_size:
        df = df.sample(n=sample_size)
    
    # Clean awards, keep best, pos tag
    df['text'] = df['text'].apply(lambda x:  clean_awards(x))
    df_a = df[df['text'].str.contains("best")]
    df_a['tags'] = df_a['text'].apply(lambda x: find_tags(x))
    
    # Define regex patterns from generalized 
    regex_pattern_0 = "P0: {<JJ.><NN.|JJ|VBG><...?>*<NN.>}"
    regex_pattern_1 = "P1: {<NN.><IN|NN.|IN><...?>*<NN.>}"
    regex_pattern_2 = "P2: {<RB.><JJ|NN.|VGB><...?>*<NN.|JJ>}"
    
    # Search for pos 
    df_a['chunks_0'] = df_a['tags'].apply(lambda x: pos_search(x,regex_pattern_0,"P0"))
    df_a['chunks_1'] = df_a['tags'].apply(lambda x: pos_search(x,regex_pattern_1,"P1"))
    df_a['chunks_2'] = df_a['tags'].apply(lambda x: pos_search(x,regex_pattern_2,"P2"))

    data_0, freq_0 = filter_df(df_a,"chunks_0")
    data_1, freq_1 = filter_df(df_a,"chunks_1")
    data_2, freq_2 = filter_df(df_a,"chunks_2")

    freq = freq_0 + freq_1 + freq_2
    
    possible = []

    for i in freq.most_common():
        if i[1] >= 8: possible.append(i[0])
    
    return possible

In [187]:
def parseAward(award):
    """
    Returns a list of words that can be used to filter for a particular award
    """
    
    award = re.split('\W+', award)
    award = [i for i in award if i not in stop_words]
    award = list(set(award))
    return award

In [188]:
def getCategoriesDict(awards_list):
    """
    Returns a dictionary that has all awards as keys, and a list of relevant filtering words as values
    """
    
    categories_dict = dict()
    for a in awards_list:
        terms = parseAward(a)
        categories_dict[a] = terms

    return categories_dict

In [189]:
def filter0(data, list1, spec = "people"):
    """
    Returns a list of tweets that are relevant to a particular award
    """
    synonyms = {}
    
    if spec == "people":
        synonyms = {
            'motion' : ['motion picture', 'motion', 'picture', 'movie'],
            'picture' : ['motion picture', 'motion', 'picture', 'movie'],
            'television' : ['television', 'tv'],
            'mini' : ['mini-series', 'mini', 'series', 'miniseries'],
            'series' : ['mini-series', 'mini', 'series', 'miniseries']
        }

    
    result = []
    
    list1 = [i for i in list1 if i != 'performance' and i != 'role']

    for tweet in data:
        cond = True
        for i in list1:
            if i in synonyms:
                if all(j not in tweet.lower() for j in synonyms[i]):
                    cond = False
            elif i not in tweet.lower():
                cond = False
        if cond:
            result.append(tweet)

    return result

In [190]:
def extractPeople(data, list1):
    """
    Extracts potential People nominees from an individual tweet
    """
    
    
    result = []
       
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes']
    stop = remove_terms + list1
    
    for tweet in data:
        
        tweet = re.sub("\d+", "", tweet)       #strip nums
        tweet = re.sub(r'http\S+', '', tweet)  #strip urls
        tweet = re.sub(r'#\S+', '', tweet)     #strip hashtags
        tweet = tweet.translate(translator)    #strip non-alphanumeric characters
        tweet = tweet.split()                  #tokenize
        tweet = [term for term in tweet if term.lower() not in stop_words] #remove stop words
        for i in stop:
            for j in tweet:
                if i.lower() in j.lower():
                    tweet.remove(j)
        result.append(tweet)
        

        
    grams = [];

    for tweet in result:
        if tweet:
            # Get all possible bigrams & trigrams in a tweet
            gram = list(nltk.everygrams(tweet, 2, 3))
            
            # Filter through and append to list for tweet
            for g in gram:
                if len(g) == 2:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])):
                        grams.append(g)
                else:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[2])):
                        grams.append(g)

  
    fdist = nltk.FreqDist(grams)

    try:
        names = fdist.most_common()
    except:
        names = "nothing here"
    
    return names

In [191]:
def extractMedia(data, list1):
    """
    Extracts potential media nominees from an individual tweet
    """
    
    result = []
       
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes', 'best']    
    stop = remove_terms + list1
    
    for tweet in data:
        tweet = re.sub("\d+", "", tweet)      #strip nums
        tweet = re.sub(r'http\S+', '', tweet) #strip urls
        tweet = re.sub(r'#\S+', '', tweet)    #strip hashtags
        tweet = tweet.translate(translator)   #strip non-alphanumeric characters
        tweet = tweet.split()                 #tokenize

        for i in stop:
            for j in tweet:
                if i.lower() in j.lower():
                    tweet.remove(j)
        tweet = ' '.join(tweet)
        result.append(tweet)
        
        
    grams = [];

    for tweet in result:
        if tweet:
            
            grams.extend(re.findall(r"([A-Z][\w-]*(?:\s+[A-Z][\w-]*)+)", tweet))
            grams.extend(re.findall(r"\b[A-Z][a-z]+\b.*\b[A-Z][a-z]+\b", tweet))
            #singular = re.findall(r"\b[A-Z][a-z]+\b", tweet)
            #singular = [i for i in singular if not wordnet.synsets(i)]
            #grams.extend(singular)
            
    # print(grams)
    fdist = nltk.FreqDist(grams)

    try:
        names = fdist.most_common()

    except:
        names = "nothing here"
    
    return names

In [192]:
def extractPresenters(data, list1, winners):
    #print(data[0])
    result = []
       
    #tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'golden', 'globe', 'globes']
    stop = remove_terms + list1 + winners.split()
    
    for tweet in data:
        #print(tweet)
        tweet = re.sub("\d+", "", tweet) #strip nums
        tweet = re.sub(r'http\S+', '', tweet) #strip urls
        tweet = re.sub(r'#\S+', '', tweet) #strip hashtags
        tweet = tweet.translate(translator) #strip non-alphanumeric characters
        tweet = tweet.split() #tokenize
        #tweet = [term for term in tweet if term.lower() not in stop_words] #remove stop words
        for i in stop:
            for j in tweet:
                if i.lower() in j.lower():
                    tweet.remove(j)
        result.append(tweet)
        
    #print(result[:20])
        
    grams = [];

    for tweet in result:
        if tweet:
            gram = list(nltk.everygrams(tweet, 2, 3))
            #print(bigram[:10])
            for g in gram:
                if len(g) == 2:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])):
                        grams.append(g)
                else:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[2])):
                        grams.append(g)

  
    fdist = nltk.FreqDist(grams)
    #print(fdist)

    try:
        names = fdist.most_common()
        #names = [' '.join(i[0]) for i in fdist.most_common()]
    except:
        names = "nothing here"
    
    return names

In [193]:
# DATA PASSED IN AS LIST
def extractHosts(data):
   # clean data
    cleaned_data = []

    for tweet in data:
        tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

        punctuation = list(string.punctuation)

        # strip stopwords, punctuation, url components
        stop = stopwords.words('english') + punctuation + ['t.co', 'http', 'https', '...', '..', ':\\', 'RT', '#']

        strip_nums = re.sub("\d+", "", tweet)
        tokenized = tt.tokenize(strip_nums)
        terms_stop = [term for term in tokenized if term not in stop]
        cleaned = [term for term in terms_stop]
        cleaned = ' '.join(cleaned)
        cleaned_data.append(cleaned)


    # find host
    include_terms = ['host', 'hosted', 'hosting', 'hosts']
    remove_terms = ['next year']
    host = [];
    cohost = False;

    for tweet in cleaned_data:
        if any(term in tweet for term in include_terms) and any(term not in tweet for term in remove_terms):
            host.append(tweet)
        if 'cohost' in tweet:
            cohost = True

    bgrams = [];

    for tweet in host:
        bgrams += list(nltk.bigrams(tweet.split()))

    fdist = nltk.FreqDist(bgrams)

    if cohost:
        fdist = fdist.most_common(2)
    else:
        fdist = fdist.most_common(1)

    final_hosts = []
    for host in fdist:
        name = host[0][0] + ' ' + host[0][1]
        final_hosts.append(name)

    return final_hosts

In [194]:
def get_category_tweets(award_list, categories_dict,data):
    """
    Using an award list and category dictionary, filters out tweets at an award level
    """
    
    d = {}
    for award in award_list:
        d["{0}".format(award)] = filter0(data, categories_dict[award])
                                         
    return d

In [231]:
def get_nom(award_list, categories_dict, tweets_dict, spec = ""):
    """
    Gets all potential nominees based on ExtractMedia or ExtractPeople
    """
    
    if spec == "people":
        funct = extractPeople
    elif spec == "media":
        funct = extractMedia
    else:
        print("there is a problem")

    nominees = {}
    for award in award_list:
        nominees["{0}".format(award)] = funct(tweets_dict[award], categories_dict[award])
    
    return nominees

In [217]:
def getPresenters(award_list, categories_dict, tweets_dict, winners):
    present = ['present', 'annouc', 'introduc']
    
    for award in award_list:
        for tweet in tweets_dict[award]:
            if all(i not in tweet for i in present):
                del tweets_dict[tweet]
            
    presenters = {}
    for award in award_list:
        all_presenters = extractPresenters(tweets_dict[award], categories_dict[award], winners[award])
        presenters[award] = ' '.join(all_presenters[award][0][0])
    
    return presenters

In [197]:
def compareToKB(nominees,kb):
    """
    Takes in a dictionary of potential nominees and removes those that don't appear in a relevant KB
    
    If no nominees are in the KB, then ... (currentlt top five)
    """
    
    final_nominees = {}
        
    for i in nominees:
        award_nominees = []
        
        for j in nominees[i]:
            print(j)
            if j[0].lower() in kb:
                award_nominees.append(j[0].lower())
                
        if not award_nominees:
            award_nominees = [i[0].lower() for i in nominees[i][:5]]
        
        award_nominees = list(set(award_nominees))
        final_nominees[i] = award_nominees
        
    return final_nominees

In [198]:
def separateAwards(award_list):
    possible_people_awards = ['actor',' actress', 'musician', ' singer', 'composer', 'director', 'producer',
                        'screenwriter', 'stage technician', 'author']

    people_awards = []
    media_awards = []

    for category in award_list:
        if any(job in category.lower() for job in possible_people_awards):
            people_awards.append(category)
        else:
            media_awards.append(category)

    return people_awards, media_awards

In [199]:
def compress_associated_dict(award_list,nominees,winners,presenters):
    
    our_dict = {}

    for award in award_list:
        our_dict[award] = {
            'nominees' : nominees[award],
            'winner' : winners[award],
            'presenters' : presenters[award]
        }
        
    return our_dict

In [200]:
## To DO

def getMediaWinners(nominees):
    pass

In [222]:
def getPeopleWinners(nominees):
    final_winners = {}
    #print(nominees)
    
    for award in nominees:
        winner = ' '.join(nominees[award][0][0])
        final_winners[award] = winner
        
    return final_winners

In [240]:
def associated_tasks(award_list,data,spec,kb,kb2):
    
    # Create a dictionary to filter tweets at a category level
    cat_filter_dict = getCategoriesDict(award_list)                 
    #print(cat_filter_dict)
    
    # Get all associated tweets for each award 
    tweets_dict = get_category_tweets(award_list, cat_filter_dict, data)
    #print(tweets_dict)
    
    # For each award, get all associated nominees
    full_nom_dict = get_nom(award_list, cat_filter_dict, tweets_dict, spec)
    print(full_nom_dict)
    
    # Filter out all nominees that are not in the dictionary
    final_nom = compareToKB(full_nom_dict, kb)
    
    
    # TO DO - Fill in the below functions
    final_winners = {}
    if spec == "media":
        pass 
        # final_winners = get_media_winners()
    
    elif spec == "people":
        final_winners = getPeopleWinners(full_nom_dict)
    
    # Get possible presenters 
    full_presenters = getPresenters(award_list, cat_filter_dict, tweets_dict, final_winners)
    
    # Filter out all presenters not in the secondary kb
    final_pres = comparetoKB(full_presenters,kb2)
    
    # compressed_dict = compress_associated_dict(award_list,final_nom,final_winners,final_pres)
    
    # This return is only required for main_exec
    return tweets_dict, full_nom_dict, final_nom, final_pres

In [203]:
def main_exec(award_list,df,kb_p,kb_m):
    """
    Main execution file - how you run the program
    Itype: kb_p and kb_m are sets for our built KB's
    """
    
    data = df['text'].values.tolist()
    
    # Call host search function
    # // To Do // - Insert function call (should write to JSON / return a variable to write later )
    host = extractHosts(data)
    
    # Call award recognition function
    # // To Do // - Insert function call (should write to JSON / return a variable to write later)
    
    # Set a variable to the hardcoded list
    hardcoded = award_list
    
    # Segment out awards award categories
    people_awards, media_awards = seperateAwards(hardcoded)
    
    
    ## Functions below need to return a dictionary with following structure
    ## Key 1: Award Name, Value: Dictionary
    ## Key 2: [ Nominees, Winners, Presenters] 
    
    
    # Call people award search function - winner, nominee, presenter (potentially swap last two)
    
    people_tweet, all_potential_people, full_people_dict = associated_tasks(people_awards, data, "people", kb_p, kb_p)
    
    media_tweet, all_potential_media, full_media_dict = associated_tasks(media_awards, data, "media", kb_m, kb_p)
    
    
    
    
    # Call people award search function - winner, nominee, presenter (potentially swap last two)
    ## // To DO // - Insert function call ( only write to easy_comp now)
    
    
    # Merge dictionaries from two above functions
    # Return single dict for easy_comp

In [204]:
def easy_comp(award_list,true_dict,our_dict):
        
    # Input 
    # Dict of dictionaries 
    # Keys - Award Name
    # Values - Dictionary with keys (nominees, presenters, winner)
    
    # Output
    # Nested List (some elements are dictionarys)

    output = []

    for award in award_list:
        output.append([award,["Guess",our_dict[award]],[["True",true_dict[award]]]])

    
    return output

# For easy printing use pprint. It works nicely for updated comparison

In [205]:
people_categories_dict = getCategoriesDict(OFFICIAL_AWARDS_1315_people)
media_categories_dict = getCategoriesDict(OFFICIAL_AWARDS_1315_media)
media_categories_dict;

In [206]:
import timeit
start_time = timeit.default_timer()
tweets = get_category_tweets(OFFICIAL_AWARDS_1315_media, media_categories_dict,data)
elapsed = timeit.default_timer() - start_time
# print(elapsed)

In [207]:
media_nominees = get_nom(OFFICIAL_AWARDS_1315_media, media_categories_dict, tweets, "media")

In [226]:
# media_nominees

In [225]:
# media_nominees_dict = compareToKB(media_nominees, media)

In [224]:
 # media_nominees_dict

In [211]:
our_dict = {}

for award in OFFICIAL_AWARDS_1315_media:
    our_dict[award] = {
        'nominees' : media_nominees_dict[award],
        'winner' : [],
        'presenters' : []
    }

In [212]:
easy_comp(OFFICIAL_AWARDS_1315_media,true_dict,our_dict);

In [213]:
# New

In [214]:
OFFICIAL_AWARDS_1315 = ['cecil b. demille award', 'best motion picture - drama', 'best performance by an actress in a motion picture - drama', 'best performance by an actor in a motion picture - drama', 'best motion picture - comedy or musical', 'best performance by an actress in a motion picture - comedy or musical', 'best performance by an actor in a motion picture - comedy or musical', 'best animated feature film', 'best foreign language film', 'best performance by an actress in a supporting role in a motion picture', 'best performance by an actor in a supporting role in a motion picture', 'best director - motion picture', 'best screenplay - motion picture', 'best original score - motion picture', 'best original song - motion picture', 'best television series - drama', 'best performance by an actress in a television series - drama', 'best performance by an actor in a television series - drama', 'best television series - comedy or musical', 'best performance by an actress in a television series - comedy or musical', 'best performance by an actor in a television series - comedy or musical', 'best mini-series or motion picture made for television', 'best performance by an actress in a mini-series or motion picture made for television', 'best performance by an actor in a mini-series or motion picture made for television', 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']

In [215]:
people_awards, media_awards = separateAwards(OFFICIAL_AWARDS_1315)

In [220]:
#media_tweet, all_potential_media, full_media_dict = associated_tasks(media_awards, data, "media", media, people) 

In [241]:
people_tweet, all_potential_media, full_people_dict = associated_tasks(people_awards, data, "people", people, people) 

{'best performance by an actress in a motion picture - drama': [(('Julianne', 'Moore'), 3298), (('Moore', 'Alice'), 808), (('Julianne', 'Moore', 'Alice'), 795), (('Congratulations', 'Julianne'), 208), (('Congratulations', 'Julianne', 'Moore'), 208), (('Moore', 'Wins'), 160), (('Julianne', 'Moore', 'Wins'), 160), (('Winnner', 'Julianne'), 109), (('Winnner', 'Julianne', 'Moore'), 109), (('Ruth', 'Wilson'), 52), (('Congrats', 'Julianne'), 49), (('Congrats', 'Julianne', 'Moore'), 49), (('Wearing', 'Givenchy'), 43), (('Wins', 'Award'), 34), (('Moore', 'Wins', 'Award'), 34), (('Moore', 'Freundlich'), 29), (('Julianne', 'Moore', 'Freundlich'), 29), (('Rosamund', 'Pike'), 22), (('Award', 'Woo'), 20), (('Wins', 'Award', 'Woo'), 20), (('Jennifer', 'Aniston'), 14), (('Girl', 'Julianne'), 13), (('Girl', 'Julianne', 'Moore'), 13), (('Patricia', 'Arquette'), 11), (('Moore', 'Rosamund'), 11), (('Moore', 'Rosamund', 'Pike'), 11), (('Reese', 'Witherspoon'), 9), (('Julianne', 'Moore', 'Rosamund'), 9), (

AttributeError: 'tuple' object has no attribute 'lower'

In [None]:
# print(media)

In [None]:
# pprint(full_media_dict)