In [None]:
import wikipedia

# Occupation keywords that are counted in a Wikipedia page's text
# (passed to most_frequent_term by possible_job_list below).
terms = ['actor', 'actress', 'director']

def check_validity(name, content):
    """Return True when every whitespace-separated part of `name` occurs in `content`.

    The comparison is case-insensitive and substring-based. A name with no
    parts (empty or whitespace-only) is considered valid.
    """
    return all(part in content.lower() for part in name.lower().split())

def most_frequent_term(content, terms) -> list:
    """Return the term(s) from `terms` that occur most often in `content`.

    Matching is a case-insensitive substring count against the lowercased
    content; the terms themselves are used as given.

    Args:
        content: Text to search.
        terms: Iterable of terms to count.

    Returns:
        All terms tied for the highest count, in the order they appear in
        `terms`. An empty list when `terms` is empty or no term occurs at all.
    """
    # Lowercase once instead of once per term.
    lowered = content.lower()
    count_map = {term: lowered.count(term) for term in terms}

    # default=0 makes an empty `terms` yield [] instead of raising.
    max_count = max(count_map.values(), default=0)
    if max_count <= 0:
        return []
    return [term for term, count in count_map.items() if count == max_count]

def possible_job_list(name) -> list:
    """Look `name` up on Wikipedia and return the most frequent job terms.

    The first search hit is fetched (without auto-suggest); its text must
    pass `check_validity` for `name`, after which `most_frequent_term` is
    applied with the module-level `terms`.

    Best-effort: any failure (no hits, ambiguous title, missing page, or any
    other exception) is reported via print and results in an empty list.
    """
    try:
        print(f"wikipedia: Searching for '{name}'")
        hits = wikipedia.search(name)
        if hits:
            top_page = wikipedia.page(hits[0], auto_suggest=False)
            page_text = top_page.content
            if check_validity(name, page_text):
                return most_frequent_term(page_text, terms)
        else:
            print(f"wikipedia: No search results found for '{name}'")
    except wikipedia.exceptions.DisambiguationError as err:
        print(f"wikipedia: Ambiguous. Possible matches include: {err.options} for '{name}'")
    except wikipedia.exceptions.PageError:
        print(f"wikipedia: The page does not exist: '{name}'")
    except Exception as err:
        print(f"wikipedia: {err}")
    # Every path that did not produce a term list ends up here.
    return []

In [None]:
import re
import time
import json
import spacy
import os

from syntax_analysis import find_verbs, find_persons, find_work_of_art, generate_candidates, get_descendants_precise, get_descendants_idx, get_descendants_greedy, is_human_pronoun
from data_preprocess import TweetsPreprocessor
from vote import Vote
from wiki import WikiSearch
from timestamp_cluster import TimestampCluster
from handle_names import WinnerNameMatcher, about_human, get_job, name_cleaning

# TODO: change it to the correct folder
winner_result_folder = "winner_result"
winner_vote_file = "vote_winner_verb.json"

# Read the answer file once and close it deterministically; both the
# winner -> award mapping and the award name list are derived from it.
with open("gg2013answers.json") as _answers_file:
    _award_data = json.load(_answers_file)['award_data']

# Winner name -> lowercased award name.
winner_to_award = {data['winner']: award.lower() for award, data in _award_data.items()}
winner_matcher = WinnerNameMatcher(winner_to_award)

spacy_model = spacy.load("en_core_web_md")

timestamp_cluster = TimestampCluster(load_saved=True)

# Keys view over the award names (as written in the answer file).
awards = _award_data.keys()

def get_award_keywords(awards) -> dict:
    """Build an index from keyword to the award names that contain it.

    Each award name is tokenized with the module-level `spacy_model`. A token
    becomes a keyword (lowercased) when it is not a stopword, not punctuation,
    at least 3 characters long, and tagged ADJ, NOUN, ADV, VERB or PROPN.
    The keyword "award" is removed from the result, if present.

    Args:
        awards: Iterable of award name strings.

    Returns:
        dict mapping keyword -> list of award names (one entry per occurrence
        of the keyword, in iteration order).
    """
    keyword_pos = ("ADJ", "NOUN", "ADV", "VERB", "PROPN")
    keyword_to_awards = {}

    for award in awards:
        for token in spacy_model(award):
            # Skip stopwords, punctuation and very short tokens.
            if token.is_stop or token.is_punct or len(token.text) < 3:
                continue

            # Keep only the part-of-speech categories listed above.
            if token.pos_ in keyword_pos:
                keyword_to_awards.setdefault(token.text.lower(), []).append(award)

    # pop() instead of del: not every award list contains the word "award".
    keyword_to_awards.pop('award', None)
    return keyword_to_awards

# Keyword -> award names index, plus the bare keyword list used for
# membership checks while scanning tweet tokens.
keyword_to_awards = get_award_keywords(awards)
award_keywords = [*keyword_to_awards]

# Verb lists used when matching nominee tweets.
# nominee_ts_verbs is not referenced in the visible cells.
nominee_ts_verbs = ["win", "receive", "get", "take", "rob", "be"]
# "<nominee> wins/receives/... <award>": compared against a verb's lemma.
nominee_active_verbs = ["win", "receive", "get", "take", "rob"]
# "<award> goes to / is awarded to <nominee>": also compared against lemmas.
# NOTE(review): "awarded" looks like an inflected form rather than a lemma;
# confirm that it can actually match `lemma_` values.
nominee_passive_verbs = ["awarded", "go"]

# def remove_words_after_for(text: str) -> str:
#     for_idx = text.find("for")
#     if for_idx == -1:
#         return text
#     return text[:for_idx].strip()

# def subtract_120_seconds(timestamp_ms: int) -> int:
#     return timestamp_ms - 120000

# timestamp_list = json.load(open("winner_result/timestamp_winner_verb.json"))
# timestamp_to_award = {
#     subtract_120_seconds(lst[1][1]): lst[0] for lst in timestamp_list
# }
# award_to_winner = {
#     lst[0]: remove_words_after_for(lst[1][0]) for lst in timestamp_list
# }

def match_nominee_verb_pattern(text: str) -> bool:
    """Return True when `text` contains a nomination / wish / "robbed" phrasing.

    The text is searched case-insensitively for any of a fixed set of
    verb-based regular expressions ("nominated for", "should have won",
    "will win", "I hope ... wins", "want ... to win", "hoping ... gets",
    "was robbed", ...).

    Args:
        text: Tweet text to test.

    Returns:
        True if at least one pattern matches, otherwise False.
    """
    # List of verb-based patterns
    patterns = [
        r'\b(nominated for)\b',
        r'\b((should|shld|would) have|should\'ve|shld\'ve|would\'ve) (won|received|got|taken home|gone to|been awarded to|been\b(?!.*\bnominated\b))\b',
        r'\b(should|shld|will) (win|receive|get|take home|go to|be awarded to)\b',
        # "take(|s) home" accepts both "take home" and "takes home", matching
        # the "would like|hoping" pattern below (a mandatory "(s)" missed "take home").
        r'I (wish|hope|guess|think|bet|predict|expect) .*(wins|win|receives|receive|gets|get|goes to|go to|awarded to|take(|s) home)\b',
        r'(want|wanted) .* to (win|receive|get|go to|be awarded to|take home|be\b(?!.*\bnominated\b))\b',
        r'(would like|hoping) .* (wins|win|receives|receive|gets|get|goes to|go to|awarded to|take(|s) home)\b',
        r'\b((was|is|got|get) robbed)\b',
    ]

    # True as soon as any pattern is found in the text.
    return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

def _pronoun_antecedents(doc, pronoun):
    """Collect names mentioned before `pronoun` that it could refer to.

    Uses `is_human_pronoun` to decide whether to look at person mentions,
    work-of-art mentions, or both (when the pronoun is not recognised).

    Returns:
        (names, known): candidate names and whether the pronoun was recognised.
    """
    known, is_human = is_human_pronoun(pronoun.text)
    names = []
    if not known or is_human:
        names += [p.text for p in find_persons(doc) if p.root.i <= pronoun.i]
    if not known or not is_human:
        names += [w.text for w in find_work_of_art(doc) if w.root.i < pronoun.i]
    return names, known


def nominee_verb_based_match(tweet: dict, base_confidence=1.0):
    """
    Given a tweet, find the nominee and award

    Requires: tweet['new_text'] includes nominee_verb_pattern
    Modifies: tweet['candidates']

    For every verb found in every sentence, nominee and award candidates are
    collected from the dependency parse. Each verb that yields a nominee or
    an award appends one dict with keys "nominee_candidates",
    "award_candidates" and "base_confidence" to tweet['candidates'].

    Confidence starts at `base_confidence` and is multiplied by 0.8 when the
    nominee is a pronoun (0.8 again when the pronoun is not recognised), by
    0.5 when no award was found syntactically, and divided by the number of
    awards inferred from keywords.

    A cached parse in tweet['spacy_output'] is used when present; otherwise
    tweet['new_text'] is parsed with the module-level `spacy_model`.

    Best-effort: any exception raised while processing ends the function
    silently; candidates appended before the error are kept.
    """
    try:
        # Only parse when no cached parse is available (dict.get would
        # evaluate its default, i.e. run the parser, unconditionally).
        spacy_output = tweet.get('spacy_output')
        if spacy_output is None:
            spacy_output = spacy_model(tweet['new_text'])

        for sentence in spacy_output.sents:
            # Y (should / will) win X / Y is nominated for X / Y gets robbed
            for verb in find_verbs(sentence):
                root = verb
                award = []
                nominee = []
                cur_confidence = base_confidence

                if root.lemma_ in nominee_active_verbs:
                    for child in root.children:
                        if child.dep_ == "dobj":
                            award += generate_candidates(child, root)
                        elif child.dep_ in ("nsubj", "nsubjpass"):
                            if child.pos_ == "PRON":
                                cur_confidence *= 0.8
                                names, known = _pronoun_antecedents(spacy_output, child)
                                nominee += names
                                if not known:
                                    cur_confidence *= 0.8
                            else:
                                nominee.append(get_descendants_precise(child))
                # X (is awarded to / goes to) Y
                # root.i is a document-level index, so the following token is
                # looked up in the document and bounded by the sentence end.
                elif (
                    root.lemma_ in nominee_passive_verbs
                    and root.i + 1 < sentence.end
                    and spacy_output[root.i + 1].text == "to"
                ):
                    for chunk in sentence.noun_chunks:
                        chunk_root = chunk.root
                        if chunk_root.head == root and chunk_root.dep_ in ("nsubj", "nsubjpass"):
                            award.append(get_descendants_precise(chunk_root))
                        elif chunk_root.dep_ == "pobj" and (chunk_root.head == root or chunk_root.head.text == "to"):
                            if chunk_root.pos_ == "PRON":
                                cur_confidence *= 0.8
                                # Antecedents are positioned relative to this
                                # pronoun (the chunk root), not to a token
                                # from the active-verb branch.
                                names, known = _pronoun_antecedents(spacy_output, chunk_root)
                                nominee += names
                                if not known:
                                    cur_confidence *= 0.8
                            else:
                                idxs = get_descendants_idx(chunk_root)
                                precise = get_descendants_precise(chunk_root)
                                greedy = get_descendants_greedy(chunk_root, idxs, root)
                                nominee += list({precise, greedy})

                if nominee:
                    if not award:
                        cur_confidence *= 0.5
                        # do some inference based on keywords
                        keyword_constraints = []
                        for token in sentence:
                            lowered = token.text.lower()
                            if lowered in award_keywords and lowered not in keyword_constraints:
                                keyword_constraints.append(lowered)

                        # only move forward if there is at least one keyword
                        if keyword_constraints:
                            # Keep only awards that contain every keyword seen.
                            possible_awards = set(awards)
                            for keyword in keyword_constraints:
                                possible_awards = possible_awards.intersection(keyword_to_awards[keyword])

                            if possible_awards:
                                award = list(possible_awards)
                                cur_confidence /= len(award)
                    else:
                        # An award phrase was found: add the sentence's award
                        # keywords as additional award candidates.
                        for token in sentence:
                            if token.text.lower() in award_keywords:
                                award.append(token.text.lower())

                if nominee or award:
                    candidates = {
                        "nominee_candidates": [name_cleaning(n) for n in set(nominee)],
                        "award_candidates": list(set(award)),
                        "base_confidence": cur_confidence,
                    }
                    tweet["candidates"] = tweet.get("candidates", []) + [candidates]

    except Exception:
        # Deliberately best-effort: a tweet that cannot be analysed is skipped.
        # (Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.)
        return

In [1]:
# Sample tweet used as input by the exploratory cells that follow.
tweet = {
    "text": "\"Allow us to present: Nepotism!\" -Eva Longoria and Don Cheadle #GoldenGlobes",
    "user": {"screen_name": "laleviner", "id": 75938926},
    "id": 290628849807523840,
    "timestamp_ms": 1358124849000,
    "new_text": "\"Allow us to present: Nepotism!\" -Eva Longoria and Don Cheadle",
}

In [2]:

from timestamp_cluster import TimestampCluster
from datetime import datetime

# Categorize the sample tweet's timestamp with both lookup variants of
# TimestampCluster; the recorded output shows a (label, number) tuple for each.
tc = TimestampCluster(load_saved=True)
print(tc.categorize_timestamp(tweet['timestamp_ms']))
print(tc.categorize_timestamp_after(tweet['timestamp_ms']))

# timestamp_ms is in milliseconds, hence / 1000.0. fromtimestamp() without a
# tz argument renders the time in the machine's local timezone.
print(datetime.fromtimestamp(tweet['timestamp_ms'] / 1000.0).strftime("%Y-%m-%d %H:%M:%S"))

(None, 0)
('best performance by an actor in a supporting role in a motion picture', -3.252083333333333)
2013-01-13 18:54:09


In [1]:

from spacy import displacy

# NOTE: this cell uses `spacy_model`, `match_nominee_verb_pattern`,
# `nominee_verb_based_match` and `json`, all defined/imported in the setup
# cell earlier in the notebook. The NameError recorded below this cell comes
# from running it in a kernel where `spacy_model` was not yet defined.
doc = spacy_model(tweet['new_text'])
# displacy.render(doc)
# Show the named entities spaCy found in the sample tweet.
print([(ent.text, ent.label_) for ent in doc.ents])

# Only run the extractor when the text contains one of the nominee verb patterns.
if match_nominee_verb_pattern(tweet['new_text']):
    nominee_verb_based_match(tweet, 1.0)

# Dump the tweet, including any "candidates" entry added by the extractor.
print(json.dumps(tweet, indent=4))

NameError: name 'spacy_model' is not defined

In [None]:
from spacy import displacy
from spacy.matcher import Matcher

def correct_dep(matcher, doc, i, matches):
    """Matcher `on_match` callback that overrides dependency labels in a match.

    Within the matched token range, "supporting" is relabelled "amod" and
    "actress" is relabelled "nsubj". The comparison is case-insensitive so it
    agrees with the LOWER-based pattern that triggers this callback.

    Args:
        matcher: The matcher invoking the callback (unused).
        doc: The document being matched; modified in place.
        i: Index of the current match within `matches`.
        matches: List of (match_id, start, end) tuples.
    """
    _, start, end = matches[i]
    for token in doc[start:end]:
        word = token.text.lower()
        if word == "supporting":
            token.dep_ = "amod"  # set as an adjective modifier
        elif word == "actress":
            token.dep_ = "nsubj"  # set as the nominal subject

# Create a Matcher pattern to identify the problematic structure
# ("supporting" immediately followed by "actress", matched case-insensitively);
# correct_dep is registered as the callback run for each match.
matcher = Matcher(spacy_model.vocab)
pattern = [{"LOWER": "supporting"}, {"LOWER": "actress"}]
matcher.add("SUPPORTING_ACTRESS", [pattern], on_match=correct_dep)

# NOTE(review): `winner_verb_extract` is not defined or imported in the
# visible cells — confirm where it comes from before relying on this cell.
winner_verb_extract(tweet)
doc = spacy_model(tweet['new_text'])
# Running the matcher triggers correct_dep on every match, relabelling `doc` in place.
matcher(doc)
# Render the (possibly corrected) dependency parse, then print entities and the tweet.
displacy.render(doc, style="dep", page=True, minify=True)
print([(ent.text, ent.label_) for ent in doc.ents])
print(json.dumps(tweet, indent=4))