# Preprocessing Start

In [213]:
import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('wordnet')

from collections import Counter
from collections import OrderedDict

import string
import re
import unidecode
import requests
import json
from pprint import pprint

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/keithpallo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
# Hard-coded official award names (film / TV title categories)

OFFICIAL_AWARDS_1315_media = [
    'best motion picture - drama',
    'best motion picture - comedy or musical',
    'best animated feature film',
    'best foreign language film',
    'best screenplay - motion picture',
    'best original score - motion picture',
    'best original song - motion picture',
    'best television series - drama',
    'best television series - comedy or musical',
    'best mini-series or motion picture made for television',
]



In [45]:
# Load in the reference answers - we will remove this when we submit the final file

with open('gg2015answers.json') as f:
    answers = json.load(f)
    
true_dict = answers['award_data'] # reference award data; passed as `true_dict` to easy_comp

In [53]:
# Load in the dataset.
# TODO: automate this process for the four relevant years (2013, 2015, 2018, 2019)

data = pd.DataFrame(pd.read_json('../gg2015.json'))

In [54]:
# Write the 'text' column of the dataset to a plain list.
# The isinstance guard makes this cell safe to re-run: once `data` is already a
# list, indexing it with 'text' would raise a TypeError.

if isinstance(data, pd.DataFrame):
    data = data['text'].values.tolist()

In [55]:
# Initialize the two separate knowledge bases (filled by the Wikidata query cells)

people, media = set(), set()

In [56]:
# Wikidata SPARQL endpoint that the knowledge-base query cells send their requests to
url = 'https://query.wikidata.org/sparql'

In [57]:
# Add film directors without a recorded date of death (see the inline query
# comments) to the `people` knowledge base.
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q2526255; #uncomment for     FILM director (no award for TV director)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r.raise_for_status()
kb = r.json()
for item in kb['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [58]:
# Add film actors without a recorded date of death (see the inline query
# comments) to the `people` knowledge base.
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q10800557; #uncomment for    FILM actor (don't just use actor)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r1 = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r1.raise_for_status()
kb1 = r1.json()
for item in kb1['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [59]:
# Add TV actors without a recorded date of death (see the inline query
# comments) to the `people` knowledge base.
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q10798782; #uncomment for    TV actor (don't just use actor)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r.raise_for_status()
kb = r.json()
for item in kb['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [60]:
# Add composers without a recorded date of death (see the inline query
# comments) to the `people` knowledge base.
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q36834; #uncomment for       composer (cannot use songwriter)
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r.raise_for_status()
kb = r.json()
for item in kb['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [61]:
# Add screenwriters without a recorded date of death (see the inline query
# comments) to the `people` knowledge base.
query = """
# ALL PERSONS required for awards
SELECT DISTINCT ?person ?personLabel WHERE {
# FIRST: uncomment occupation:
  ?person wdt:P31 wd:Q5;
           wdt:P106/wdt:P279* wd:Q28389; #uncomment for       screenwriter
  FILTER NOT EXISTS { ?person wdt:P570 ?date. } #person is alive
  
# SECOND: uncomment gender if applicable (for actor/actress):
#          wdt:P21 wd:Q6581097;    #male
#          wdt:P21 wd:Q6581072;    #female
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r.raise_for_status()
kb = r.json()
for item in kb['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    people.add(unidecode.unidecode(item['personLabel']['value']))

In [232]:
## Media KB Creation

In [140]:
# Add the labels of items matched by the publication-date-bounded query below
# to the `media` knowledge base.
# NOTE(review): the "-00-00" month/day in the date bounds is not a valid
# xsd:dateTime calendar date - confirm the endpoint interprets it as intended.
url = 'https://query.wikidata.org/sparql'
query = """
SELECT DISTINCT ?itemLabel  WHERE {
 ?item wdt:P31 wd:Q11424. ?item wdt:P577 ?_publication_date. ?item wdt:P136 ?_genre.
 ?_genre rdfs:label ?_genreLabel. BIND(str(YEAR(?_publication_date)) AS ?year)
 FILTER((LANG(?_genreLabel)) = "en")
 FILTER (?_publication_date >= "2010-00-00T00:00:00Z"^^xsd:dateTime && ?_publication_date <= "2019-00-00T00:00:00Z"^^xsd:dateTime )
 SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .} }
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r.raise_for_status()
kb_m = r.json()

for item in kb_m['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    media.add(unidecode.unidecode(item['itemLabel']['value']))

In [141]:
# Add the labels of items matched by the start-date-bounded query below
# to the `media` knowledge base.
# NOTE(review): the "-00-00" month/day in the date bounds is not a valid
# xsd:dateTime calendar date - confirm the endpoint interprets it as intended.
query = """
SELECT DISTINCT ?itemLabel  WHERE {
  ?item wdt:P31 wd:Q5398426.
  ?item wdt:P580  ?_start
 FILTER (?_start >= "2000-00-00T00:00:00Z"^^xsd:dateTime && ?_start <= "2019-00-00T00:00:00Z"^^xsd:dateTime )
  SERVICE wikibase:label {bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .}
}
    
"""
r = requests.get(url, params = {'format': 'json', 'query': query})
# Fail with the real HTTP error instead of an opaque JSON decode failure
# when the endpoint answers with an error page.
r.raise_for_status()
kb_m = r.json()

for item in kb_m['results']['bindings']:
    # unidecode transliterates the label to plain ASCII before storing it
    media.add(unidecode.unidecode(item['itemLabel']['value']))

In [64]:
# Load the stop-word list (one entry per line). The `with` block closes the
# file handle, which the bare open() call left open.
with open('english.txt', 'r') as f:
    stop_words = f.read().splitlines()

In [65]:
# Hard-coded official award names for the people-oriented award list
OFFICIAL_AWARDS_1315_people = [
    'cecil b. demille award',
    'best performance by an actress in a motion picture - drama',
    'best performance by an actor in a motion picture - drama',
    'best performance by an actress in a motion picture - comedy or musical',
    'best performance by an actor in a motion picture - comedy or musical',
    'best performance by an actress in a supporting role in a motion picture',
    'best performance by an actor in a supporting role in a motion picture',
    'best director - motion picture',
    'best screenplay - motion picture',
    'best original score - motion picture',
    'best performance by an actress in a television series - drama',
    'best performance by an actor in a television series - drama',
    'best performance by an actress in a television series - comedy or musical',
    'best performance by an actor in a television series - comedy or musical',
    'best performance by an actress in a mini-series or motion picture made for television',
    'best performance by an actor in a mini-series or motion picture made for television',
    'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
    'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
]

# Preprocessing Complete


In [66]:
def parseAward(award, stop_list=None):
    """
    Returns a list of words that can be used to filter for a particular award

    award: the award name to split into filter words.
    stop_list: optional iterable of words to leave out; when omitted, the
        notebook-level `stop_words` list is used.

    The result holds each remaining word once, in order of first appearance.
    """
    if stop_list is None:
        stop_list = stop_words
    excluded = set(stop_list)

    terms = []
    seen = set()
    for token in re.split(r'\W+', award):
        # re.split yields '' when the name starts or ends with a non-word
        # character; an empty term is a substring of every token, so it must
        # never reach the substring-based filters downstream.
        if not token or token in excluded or token in seen:
            continue
        seen.add(token)
        terms.append(token)
    return terms

In [67]:
def getCategoriesDict(awards_list):
    """
    Maps every award name in awards_list to the list of filter words that
    parseAward produces for it.
    """
    return {name: parseAward(name) for name in awards_list}

In [166]:
def filter0(data, list1, spec = "people"):
    """
    Returns a list of tweets that are relevant to a particular award

    data: iterable of tweet strings.
    list1: filter words for the award; 'performance' and 'role' are ignored.
    spec: with "people" (the default) a few words may also be satisfied by the
        alternatives listed in `synonyms`; any other value disables that.

    A tweet is kept when every remaining filter word (or one of its
    alternatives) occurs as a substring of the lower-cased tweet.
    """
    synonyms = {}

    if spec == "people":
        synonyms = {
            'motion' : ['motion picture', 'motion', 'picture', 'movie'],
            'picture' : ['motion picture', 'motion', 'picture', 'movie'],
            'television' : ['television', 'tv'],
            'mini' : ['mini-series', 'mini', 'series', 'miniseries'],
            'series' : ['mini-series', 'mini', 'series', 'miniseries']
        }

    required = [term for term in list1 if term != 'performance' and term != 'role']

    # Resolve each word to its acceptable substrings once, not once per tweet.
    alternatives = [synonyms.get(term, [term]) for term in required]

    result = []
    for tweet in data:
        # Lower-case once per tweet; all()/any() stop at the first failing word.
        lowered = tweet.lower()
        if all(any(option in lowered for option in options) for options in alternatives):
            result.append(tweet)

    return result

In [71]:
def extractPeople(data, list1):
    """
    Extracts potential People nominees from a list of tweets

    data: iterable of tweet strings (already filtered for one award).
    list1: the award's filter words; tokens containing any of them are dropped.

    Returns a list of (gram, count) pairs ordered by descending count, where
    each gram is a tuple of two or three consecutive tokens that all start
    with an upper-case letter followed by lower-case letters.
    """
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes']
    # Lower-case the terms once. Empty terms are skipped because '' is a
    # substring of every token and would wipe out the whole tweet.
    # NOTE(review): matching is by substring, so 'rt' also drops tokens such as
    # 'Robert' - confirm that this is intended.
    stop = [term.lower() for term in remove_terms + list1 if term]
    capitalized = re.compile(r'\b[A-Z][a-z]+\b')

    cleaned = []
    for tweet in data:
        tweet = re.sub(r"\d+", "", tweet)      #strip nums
        tweet = re.sub(r'http\S+', '', tweet)  #strip urls
        tweet = re.sub(r'#\S+', '', tweet)     #strip hashtags
        tweet = tweet.translate(translator)    #strip punctuation
        tokens = [term for term in tweet.split() if term.lower() not in stop_words] #remove stop words
        # Build a new list instead of calling remove() while iterating, which
        # skipped the token that followed every removed one.
        tokens = [tok for tok in tokens if not any(term in tok.lower() for term in stop)]
        cleaned.append(tokens)

    grams = []
    for tokens in cleaned:
        if tokens:
            # All bigrams and trigrams of the tweet, kept only when every word
            # looks like a capitalized name part.
            for gram in nltk.everygrams(tokens, 2, 3):
                if all(capitalized.match(word) for word in gram):
                    grams.append(gram)

    # Counter.most_common() returns [] for no grams, so no fallback is needed.
    return Counter(grams).most_common()

In [223]:
def extractMedia(data, list1):
    """
    Extracts potential media nominees from a list of tweets

    data: iterable of tweet strings (already filtered for one award).
    list1: the award's filter words; tokens containing any of them are dropped.

    Returns a list of (phrase, count) pairs ordered by descending count. Two
    kinds of phrase are collected from each cleaned tweet: runs of two or more
    whitespace-separated tokens that each start with an upper-case letter, and
    the greedy span from the first to the last capitalized word.
    """
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'rt', 'golden', 'globe', 'globes', 'best']
    # Lower-case the terms once. Empty terms are skipped because '' is a
    # substring of every token and would wipe out the whole tweet.
    # NOTE(review): matching is by substring, so 'rt' also drops tokens such as
    # 'Robert' - confirm that this is intended.
    stop = [term.lower() for term in remove_terms + list1 if term]

    cleaned = []
    for tweet in data:
        tweet = re.sub(r"\d+", "", tweet)     #strip nums
        tweet = re.sub(r'http\S+', '', tweet) #strip urls
        tweet = re.sub(r'#\S+', '', tweet)    #strip hashtags
        tweet = tweet.translate(translator)   #strip punctuation
        # Build a new list instead of calling remove() while iterating, which
        # skipped the token that followed every removed one.
        tokens = [tok for tok in tweet.split() if not any(term in tok.lower() for term in stop)]
        cleaned.append(' '.join(tokens))

    grams = []
    for tweet in cleaned:
        if tweet:
            grams.extend(re.findall(r"([A-Z][\w-]*(?:\s+[A-Z][\w-]*)+)", tweet))
            grams.extend(re.findall(r"\b[A-Z][a-z]+\b.*\b[A-Z][a-z]+\b", tweet))

    # Counter.most_common() returns [] for no phrases, so no fallback is needed.
    return Counter(grams).most_common()

In [93]:
def get_category_tweets(award_list, categories_dict,data):
    """
    Builds a dictionary that maps each award in award_list to the tweets that
    filter0 keeps for that award's filter words (categories_dict[award]).
    """
    return {
        "{0}".format(name): filter0(data, categories_dict[name])
        for name in award_list
    }

In [240]:
def get_nom(award_list, categories_dict, tweets_dict, spec = ""):
    """
    Gets all potential nominees based on ExtractMedia or ExtractPeople

    award_list: award names to process.
    categories_dict: award name -> filter words for that award.
    tweets_dict: award name -> tweets already filtered for that award.
    spec: "people" uses extractPeople, "media" uses extractMedia.

    Returns a dictionary award name -> ranked candidate list.
    Raises ValueError for any other spec (previously a message was printed and
    the function then failed on an unassigned extractor).
    """
    # The two branches used to be swapped: people awards were sent through the
    # media extractor and vice versa.
    if spec == "people":
        funct = extractPeople
    elif spec == "media":
        funct = extractMedia
    else:
        raise ValueError("spec must be 'people' or 'media', got {0!r}".format(spec))

    nominees = {}
    for award in award_list:
        nominees["{0}".format(award)] = funct(tweets_dict[award], categories_dict[award])

    return nominees

In [149]:
def compareToKB(nominees,kb):
    """
    Takes in a dictionary of potential nominees and removes those that don't appear in a relevant KB

    nominees: award name -> ranked list of (candidate, count) pairs. A candidate
        is either a string or a tuple/list of words (as produced by
        extractPeople); word sequences are joined with single spaces first.
    kb: container of known names, checked with `in`.

    If no nominees are in the KB, the first five candidates are kept instead
    (currently top five).
    """

    def _as_name(candidate):
        # A tuple of words can never equal a KB string, so join it first.
        if isinstance(candidate, (tuple, list)):
            return ' '.join(candidate)
        return candidate

    final_nominees = {}

    for award, ranked in nominees.items():
        names = [_as_name(entry[0]) for entry in ranked]
        known = [name for name in names if name in kb]
        final_nominees[award] = known if known else names[:5]

    return final_nominees

In [246]:
def separateAwards(award_list):
    """
    Splits award names into (people_awards, media_awards).

    An award goes to the people list when its lower-cased name contains one of
    the job keywords below; otherwise it goes to the media list. Input order is
    preserved within each list.
    """
    job_keywords = ['actor',' actress', 'musician', ' singer', 'composer', 'director', 'producer',
                    'screenwriter', 'stage technician', 'author']

    people_awards, media_awards = [], []

    for category in award_list:
        lowered = category.lower()
        mentions_job = any(job in lowered for job in job_keywords)
        target = people_awards if mentions_job else media_awards
        target.append(category)

    return people_awards, media_awards

In [None]:
def compress_associated_dict(award_list,nominees,winners,presenters):
    """
    Combines the three per-award dictionaries into a single dictionary
    award -> {'nominees': ..., 'winner': ..., 'presenters': ...}.
    """
    return {
        name: {
            'nominees' : nominees[name],
            'winner' : winners[name],
            'presenters' : presenters[name]
        }
        for name in award_list
    }

In [None]:
## TODO: implement winner selection for media awards

def getMediaWinners(nominees):
    """Placeholder - not implemented yet; always returns None."""
    return None

In [169]:
def getPeopleWinners(nominees):
    """
    Picks the top-ranked candidate of every award as its winner.

    nominees: award name -> ranked list of (candidate, count) pairs, where a
        candidate is a tuple of name words (or already a string).

    Returns a dictionary award name -> winner name. An award without any
    candidate maps to '' instead of raising IndexError.
    """
    final_winners = {}

    for award, ranked in nominees.items():
        if not ranked:
            final_winners[award] = ''
            continue
        top = ranked[0][0]
        # ' '.join on a plain string would insert a space between its letters.
        final_winners[award] = top if isinstance(top, str) else ' '.join(top)

    return final_winners

In [None]:
def associated_tasks(award_list,data,spec,kb,kb2):
    """
    Runs the nominee / winner pipeline for one group of awards.

    award_list: award names to process.
    data: list of tweet strings.
    spec: "people" or "media"; forwarded to get_nom.
    kb: knowledge base used to filter the nominees.
    kb2: secondary knowledge base reserved for presenters (not used yet).

    Returns (tweets_dict, full_nom_dict, compressed_dict).
    """

    # Create a dictionary to filter tweets at a category level
    cat_filter_dict = getCategoriesDict(award_list)

    # Get all associated tweets for each award
    tweets_dict = get_category_tweets(award_list, cat_filter_dict,data)

    # For each award, get all associated nominees.
    # (This used the undefined `cat_dict` and the notebook-global `tweets`.)
    full_nom_dict = get_nom(award_list, cat_filter_dict, tweets_dict, spec)

    # Filter out all nominees that are not in the knowledge base
    final_nom = compareToKB(full_nom_dict,kb)

    if spec == "people":
        final_winners = getPeopleWinners(full_nom_dict)
    else:
        # TO DO - media winners are not implemented yet (see getMediaWinners);
        # use an empty placeholder so the result keeps the same shape.
        final_winners = {award: [] for award in award_list}

    # TO DO - Get presenters and filter them against the secondary kb (kb2).
    # Until then every award gets an empty presenter list.
    final_pres = {award: [] for award in award_list}

    compressed_dict = compress_associated_dict(award_list,final_nom,final_winners,final_pres)

    # This return is only required for main_exec
    return tweets_dict, full_nom_dict, compressed_dict

In [5]:
def main_exec(data,kb_p,kb_m):
    """
    Main execution file - how you run the program
    Itype: kb_p and kb_m are sets for our built KB's

    data: list of tweet strings.
    Returns one dictionary award name -> {'nominees', 'winner', 'presenters'}
    that merges the people and media results, ready for easy_comp.
    """

    # Call host search function
    # // To Do // - Insert function call (should write to JSON / return a variable to write later )

    # Call award entity recognition function
    # // To Do // - Insert function call (should write to JSON / return a variable to write later)

    # Set a variable to the hardcoded list
    # NOTE(review): none of the names in OFFICIAL_AWARDS_1315_media contains a
    # job keyword from separateAwards, so people_awards is empty here - confirm
    # whether the people award list should be included as well.
    hardcoded = OFFICIAL_AWARDS_1315_media

    # Segment out award categories (the call was misspelled `seperateAwards`)
    people_awards, media_awards = separateAwards(hardcoded)

    ## Functions below need to return a dictionary with following structure
    ## Key 1: Award Name, Value: Dictionary
    ## Key 2: [ Nominees, Winners, Presenters]

    # Call people award search function - winner, nominee, presenter (potentially swap last two)
    people_tweet, all_potential_people, full_people_dict = associated_tasks(people_awards,data,"people",kb_p,kb_p)

    # Call media award search function (the `data` argument was missing here)
    media_tweet, all_potential_media, full_media_dict = associated_tasks(media_awards,data,"media",kb_m,kb_p)

    # Merge dictionaries from two above functions
    merged_dict = dict(full_people_dict)
    merged_dict.update(full_media_dict)

    # Return single dict for easy_comp
    return merged_dict

In [163]:
def easy_comp(award_list,true_dict,our_dict):
    """
    Pairs our guess with the reference answer for every award.

    Input: two dictionaries keyed by award name; the values are dictionaries
    with keys (nominees, presenters, winner).

    Output: a nested list with one entry per award, of the form
    [award, ["Guess", our value], [["True", true value]]].
    """
    return [
        [name, ["Guess", our_dict[name]], [["True", true_dict[name]]]]
        for name in award_list
    ]

# For easy printing use pprint. It works nicely for updated comparison

In [68]:
# Build the award -> filter-words dictionaries for both hard-coded award lists
people_categories_dict = getCategoriesDict(OFFICIAL_AWARDS_1315_people)
media_categories_dict = getCategoriesDict(OFFICIAL_AWARDS_1315_media)
media_categories_dict;  # trailing ';' keeps the notebook from echoing the dictionary

In [167]:
# Filter the tweets for every media award (result kept in `tweets`) and print
# how long the filtering took, as measured by timeit.default_timer()
import timeit
start_time = timeit.default_timer()
tweets = get_category_tweets(OFFICIAL_AWARDS_1315_media, media_categories_dict,data)
elapsed = timeit.default_timer() - start_time
print(elapsed)

22.43589629099006


In [242]:
# Extract candidate nominees for every media award from its filtered tweets.
# (The notebook defines get_nom; get_media_nom does not exist and raised NameError.)
media_nominees = get_nom(OFFICIAL_AWARDS_1315_media, media_categories_dict, tweets, "media")

In [227]:
# Keep only candidates found in the `media` knowledge base
# (compareToKB falls back to the first five candidates when none match)
media_nominees_dict = compareToKB(media_nominees, media)

In [230]:
# Assemble our guesses per media award; winner and presenters are left empty here
our_dict = {
    award: {
        'nominees' : media_nominees_dict[award],
        'winner' : [],
        'presenters' : []
    }
    for award in OFFICIAL_AWARDS_1315_media
}

In [239]:
# Pair our guesses with the reference answers for every media award;
# the trailing ';' keeps the notebook from echoing the (large) result
easy_comp(OFFICIAL_AWARDS_1315_media,true_dict,our_dict);