In [24]:
import json
import spacy

spacy_model = spacy.load("en_core_web_md")

from similarity import compute_similarity

### TODO: delete this
awards = json.load(open("gg2013answers.json"))['award_data'].keys()

class Vote:
    def __init__(self, awards):
        self.awards = awards
        self.keyword_to_awards = self.get_award_keywords()
        self.award_keywords = list(self.keyword_to_awards.keys())
        self.results = {
            award: {
                'winner': {},
                'nominees': {},
            } for award in awards
        }
        self.score_update = {
            "strict": {
                "get_keyword": 2.0,
                "no_such_keyword": -0.2,
                "other_keyword": -2.0,
            },
            "loose": {
                "get_keyword": 2.0,
                "no_such_keyword": -0.2,
                "other_keyword": -1.0,
            },
        }
        # print(json.dumps(self.keyword_to_awards, indent=4))
    
    def get_award_keywords(self) -> dict:
        keyword_to_awards = {}

        for award in self.awards:
            spacy_output = spacy_model(award)
            for token in spacy_output:
                # Check if the token is a stopword, punctuation, or short word
                if token.is_stop or token.is_punct or len(token.text) < 3:
                    continue
                
                # Check if the token's part-of-speech is one of the desired categories
                if token.pos_ in ["ADJ", "NOUN", "ADV", "VERB", "PROPN"]:
                    keyword = token.text.lower()

                    if keyword not in keyword_to_awards:
                        keyword_to_awards[keyword] = [award]
                    else:
                        keyword_to_awards[keyword].append(award)
        
        return keyword_to_awards

    def get_candidate_keywords(self, candidate: str) -> list:
        spacy_output = spacy_model(candidate)
        candidate_keywords = []
        for token in spacy_output:
            print(token.text, token.is_stop, token.is_punct, len(token.text), token.pos_)
            if not (token.is_stop or token.is_punct or len(token.text) < 3) and token.pos_ in ["ADJ", "NOUN", "ADV", "VERB", "PROPN"]:
                candidate_keywords.append(token.text.lower())
        return candidate_keywords
    
    def similarity_for_awards(self, candidate_keywords: list, base_confidence=1.0) -> dict:
        update_key = "strict" if base_confidence > 1.0 else "loose"
        
        award_scores = {award: 0 for award in self.awards}
        for keyword in candidate_keywords:
            if keyword in self.keyword_to_awards:
                for award in self.awards:
                    # candidate keyword is in the award
                    if award in self.keyword_to_awards[keyword]:
                        award_scores[award] += self.score_update[update_key]["get_keyword"] / len(self.keyword_to_awards[keyword])
                    # candidate keyword is not in the award, but in other awards (huge penalty)
                    else:
                        award_scores[award] += self.score_update[update_key]["other_keyword"]                
            else:
                # candidate keyword is not in any award (small penalty)
                for award in self.awards:
                    award_scores[award] += self.score_update[update_key]["no_such_keyword"]
        
        for award_keyword in self.award_keywords:
            # award have a keyword not in the candidate keywords (small penalty)
            if award_keyword not in candidate_keywords:
                for award in self.keyword_to_awards[award_keyword]:
                    award_scores[award] += self.score_update[update_key]["no_such_keyword"]

        return award_scores

    def choose_award_to_vote(self, awards: list, award_candidates: list, base_confidence=1.0, tweet=None):
        if not award_candidates:
            return []
        scores = {}
        # for award in awards:
        #     scores[award] = compute_similarity(award, award_candidates)
        candidate_keywords = list(set([k for candidate in award_candidates for k in self.get_candidate_keywords(candidate)]))
        print(candidate_keywords)
        scores = self.similarity_for_awards(candidate_keywords, base_confidence)
        
        res = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]
        if tweet:
            tweet['similarity_res'] = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        highest_score = res[0][1]
        
        ratios = [1, 3, 6]
        
        for i in range(len(res)):
            if res[i][1] < 0:
                break
            res[i] = (res[i][0], highest_score / ratios[i] * base_confidence)
        return res
    
    def vote(self, res,  winner_candidates, nominee_candidates):
        for candidate in winner_candidates:
            for award, score in res:
                if candidate in self.results[award]['winner']:
                    self.results[award]['winner'][candidate] += score
                else:
                    self.results[award]['winner'][candidate] = score
        
        for candidate in nominee_candidates:
            for award, score in res:
                if candidate in self.results[award]['nominees']:
                    self.results[award]['nominees'][candidate] += score
                else:
                    self.results[award]['nominees'][candidate] = score
                

    def vote_for_awards(self, awards: list, tweets: list, modify_tweet=False):
        for tweet in tweets:
            if 'candidates' in tweet:
                for candidates in tweet['candidates']:
                    vote_res = self.choose_award_to_vote(awards, candidates['award_candidates'], candidates['base_confidence'], tweet if modify_tweet else None)
                    self.vote(vote_res, candidates.get('winner_candidates', []), candidates.get('nominee_candidates', []))
                
    def get_results(self):
        for award in self.awards:
            self.results[award]['winner'] = sorted(self.results[award]['winner'].items(), key=lambda x: x[1], reverse=True)[:10]
            self.results[award]['nominees'] = sorted(self.results[award]['nominees'].items(), key=lambda x: x[1], reverse=True)[:20]
        return self.results


In [17]:
modify_tweet = True
v = Vote(awards)


{
    "best": [
        "best screenplay - motion picture",
        "best director - motion picture",
        "best performance by an actress in a television series - comedy or musical",
        "best foreign language film",
        "best performance by an actor in a supporting role in a motion picture",
        "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
        "best motion picture - comedy or musical",
        "best performance by an actress in a motion picture - comedy or musical",
        "best mini-series or motion picture made for television",
        "best original score - motion picture",
        "best performance by an actress in a television series - drama",
        "best performance by an actress in a motion picture - drama",
        "best performance by an actor in a motion picture - comedy or musical",
        "best motion picture - drama",
        "best performance by an actor in a supporting role 

In [18]:
all_tweets = [
    {
        "count": 58,
        "new_text": "Best supporting actress in a motion picture is awarded to Anne Hathaway for \"Les Miserables\"",
        "candidates": [
            {
                "winner_candidates": [
                    "Anne Hathaway",
                    "Anne Hathaway for \"Les Miserables"
                ],
                "award_candidates": [
                    "Best supporting actress in a motion picture"
                ],
                "base_confidence": 58
            }
        ],
    },
]
v.vote_for_awards(awards, all_tweets, modify_tweet=modify_tweet)
res = v.get_results()

print(json.dumps(all_tweets, indent=4))

['supporting', 'motion', 'actress', 'picture', 'best']
[
    {
        "count": 58,
        "new_text": "Best supporting actress in a motion picture is awarded to Anne Hathaway for \"Les Miserables\"",
        "candidates": [
            {
                "winner_candidates": [
                    "Anne Hathaway",
                    "Anne Hathaway for \"Les Miserables"
                ],
                "award_candidates": [
                    "Best supporting actress in a motion picture"
                ],
                "base_confidence": 58
            }
        ],
        "similarity_res": [
            [
                "best performance by an actress in a supporting role in a motion picture",
                0.7010084033613446
            ],
            [
                "best performance by an actress in a supporting role in a series, mini-series or motion picture made for television",
                -0.09899159663865537
            ],
            [
                "best per

In [25]:
v = Vote(awards)
v.get_candidate_keywords("cecil b. demille award")

cecil False False 5 PROPN
b. False False 2 PROPN
demille False False 7 PROPN
award False False 5 PROPN


['cecil', 'demille', 'award']