In [35]:
from nominee import awards, keyword_to_awards, award_keywords, spacy_model
import json

def remove_words_after_for(text: str) -> str:
    """Drop the standalone word "for" and everything after it.

    Only a whole-word, lowercase "for" counts as the cut point, so names or
    phrases that merely contain those letters ("forest", "performance",
    "before") are left intact.

    Args:
        text: The string to truncate.

    Returns:
        The part of ``text`` before the first standalone "for", stripped of
        surrounding whitespace. If there is no standalone "for", ``text`` is
        returned unchanged (not stripped).
    """
    # Function-scope import keeps this cell self-contained; `re` is stdlib.
    import re

    # \b anchors ensure we match the word "for", not the substring.
    for_match = re.search(r"\bfor\b", text)
    if for_match is None:
        return text
    return text[:for_match.start()].strip()

def subtract_120_seconds(timestamp_ms: int) -> int:
    """Return ``timestamp_ms`` shifted 120 seconds (120000 ms) earlier."""
    offset_ms = 120 * 1000
    return timestamp_ms - offset_ms

# Load the saved results. The context manager guarantees the file handle is
# closed once parsing finishes (the bare open() call used before leaked it).
with open("winner_result/timestamp_winner_verb.json", encoding="utf-8") as timestamp_file:
    timestamp_list = json.load(timestamp_file)

# Each entry is indexed as lst[0], lst[1][0] and lst[1][1] below.
# NOTE(review): presumably lst[0] is the award name, lst[1][0] the winner text
# and lst[1][1] a millisecond timestamp — confirm against the JSON producer.

# Shifted timestamp (lst[1][1] minus 120 s) -> lst[0].
# If two entries share a timestamp, the later entry wins.
timestamp_to_award = {
    subtract_120_seconds(lst[1][1]): lst[0] for lst in timestamp_list
}
# lst[0] -> lst[1][0] with any trailing "for ..." clause removed.
award_to_winner = {
    lst[0]: remove_words_after_for(lst[1][0]) for lst in timestamp_list
}

def nominee_ts_based_award_predict(tweet: dict, spacy_output, base_confidence=1.0) -> (str, float):
    """Guess which award a "robbed" tweet refers to from its timestamp.

    The tweet is only considered when its ``new_text`` contains "robbed"
    (case-insensitive). Award keywords found among the tokens (other than
    "award") narrow the candidate awards; the prediction is the candidate
    award whose key in ``timestamp_to_award`` is the largest one strictly
    below the tweet's ``timestamp_ms``.

    Args:
        tweet: Dict read for ``new_text`` and ``timestamp_ms``.
        spacy_output: Iterable of tokens exposing ``.text``.
        base_confidence: Confidence returned alongside a prediction.

    Returns:
        ``(award, base_confidence)`` on success, otherwise ``(None, None)``.
    """
    no_prediction = (None, None)

    if "robbed" not in tweet['new_text'].lower():
        return no_prediction

    # Lower-cased token texts that are award keywords, ignoring the generic "award".
    mentioned_keywords = [
        lowered
        for lowered in (tok.text.lower() for tok in spacy_output)
        if lowered in award_keywords and lowered != "award"
    ]

    # Start from every award and keep only those compatible with all keywords.
    candidate_awards = set(awards)
    for kw in mentioned_keywords:
        candidate_awards = candidate_awards.intersection(keyword_to_awards[kw])

    if not candidate_awards:
        return no_prediction

    earlier_timestamps = [
        ts
        for ts, award in timestamp_to_award.items()
        if ts < tweet['timestamp_ms'] and award in candidate_awards
    ]
    # -1 doubles as the "nothing found" sentinel.
    latest = max(earlier_timestamps, default=-1)
    if latest == -1:
        return no_prediction

    return timestamp_to_award[latest], base_confidence

In [40]:
# Sample tweet record used by the cells below.
tweet = {
    "text": "Sofia was robbed!  #GoldenGlobes",
    "user": {
        "screen_name": "JCSosa",
        "id": 27664376,
    },
    "id": 290627486339973122,
    "timestamp_ms": 1358125967000,
    "new_text": "Sofia was robbed!",
    "candidates": [
        {
            "nominee_candidates": ["Sofia"],
            "award_candidates": [],
            "base_confidence": 1.0,
        },
    ],
}


In [41]:
# Parse the tweet text once and reuse the result for both the entity dump and
# the prediction, instead of running the spaCy pipeline a second time.
doc = spacy_model(tweet['new_text'])
print([ent.text for ent in doc.ents])
print(nominee_ts_based_award_predict(tweet, doc))

['Sofia']
('best performance by an actor in a supporting role in a motion picture', 1.0)


In [5]:
from spacy import displacy
from spacy.matcher import Matcher

def correct_dep(matcher, doc, i, matches):
    """Matcher ``on_match`` callback that relabels "supporting actress" tokens.

    Within the matched span, "supporting" is relabelled as an adjective
    modifier and "actress" as the nominal subject. The comparison is
    case-insensitive so it agrees with a pattern that matches on ``LOWER``
    (e.g. "Supporting Actress" is relabelled too).

    Args:
        matcher: The matcher invoking the callback (unused).
        doc: The document being matched; sliced as ``doc[start:end]``.
        i: Index of the current match in ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, start, end = matches[i]
    for token in doc[start:end]:
        lowered = token.text.lower()
        if lowered == "supporting":
            token.dep_ = "amod"  # set as an adjective modifier
        elif lowered == "actress":
            token.dep_ = "nsubj"  # set as the nominal subject

# Create a Matcher pattern to identify the problematic structure:
# the two-token sequence "supporting actress", compared on lower-cased text.
matcher = Matcher(spacy_model.vocab)
pattern = [{"LOWER": "supporting"}, {"LOWER": "actress"}]
# correct_dep is registered as the on_match callback for this pattern.
matcher.add("SUPPORTING_ACTRESS", [pattern], on_match=correct_dep)

# NOTE(review): winner_verb_extract is neither defined nor imported in the
# visible cells, and `tweet` must already exist in the kernel — confirm this
# cell survives Restart Kernel -> Run All.
winner_verb_extract(tweet)
doc = spacy_model(tweet['new_text'])
# Running the matcher invokes correct_dep, which rewrites dep_ labels on `doc` in place.
matcher(doc)
# Render the dependency parse, then dump the entities and the tweet record.
displacy.render(doc, style="dep", page=True, minify=True)
print([(ent.text, ent.label_) for ent in doc.ents])
print(json.dumps(tweet, indent=4))

NOminee verb pattern matched


[('Jessica Lange', 'PERSON'), ('American', 'NORP')]
{
    "text": "Best Supporting Actress, TV goes to\u2026 Maggie Smith. As always. #GoldenGlobes",
    "user": {
        "screen_name": "televisionary",
        "id": 10545772
    },
    "id": 290627447525879808,
    "timestamp_ms": 1358125957000,
    "new_text": "Pissed Jessica Lange didn't win best actress. And why was American horror story not nominated for best miniseries",
    "candidates": [
        {
            "winner_candidates": [],
            "award_candidates": [
                "best actress",
                "best actress."
            ],
            "base_confidence": 1.0
        },
        {
            "winner_candidates": [],
            "award_candidates": [
                "best actress",
                "best actress."
            ],
            "base_confidence": 1.0
        }
    ]
}
