In [515]:
import json 
import pandas as pd
import re
import nltk
from nltk.util import ngrams
from collections import Counter
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_json("./gg2013.json")

In [516]:
#Analyze common phrase occurences
'''
Input: A list of strings after parsing
Output: The most common grams containing 'best'
Remarks:
    1. Longest award phrase is 12 words
'''
def common_phrases(keywords):
    bigrams = []
    trigrams = []
    quadgrams = []
    multigrams = []
    megagrams = []
    for item in keywords:
        clean_string = re.sub('[^A-Za-z0-9]+', ' ', item)
        tokens = nltk.word_tokenize(clean_string)
        grams2 = ngrams(tokens, 2)
        grams3 = ngrams(tokens, 3)
        grams4 = ngrams(tokens, 4)
        grams9 = ngrams(tokens, 9)
        grams12 = ngrams(tokens, 12)
        for gram in grams2:
            bigrams.append(gram)
        for gram in grams3:
            trigrams.append(gram)
        for gram in grams4:
            quadgrams.append(gram)
        for gram in grams9:
            multigrams.append(gram)
        for gram in grams12:
            megagrams.append(gram)

    common2 = Counter(bigrams).most_common()
    common3 = Counter(trigrams).most_common()
    common4 = Counter(quadgrams).most_common()
    common9 = Counter(multigrams).most_common()
    common12 = Counter(megagrams).most_common()

    common2 = filter(lambda x: x[0][0] == "best", common2)
    common3 = filter(lambda x: x[0][0] == "best", common3)
    common4 = filter(lambda x: x[0][0] == "best", common4)
    common9 = filter(lambda x: x[0][0] == "best", common9)
    common12 = filter(lambda x: x[0][0] == "best", common12)

    common2 = list(common2)
    common3 = list(common3)
    common4 = list(common4)
    common9 = list(common9)
    common12 = list(common12)

    combined = common2 + common3 + common4 + common9 + common12
    return combined

In [517]:
'''
Input: List of strings
Output: Strings with its frequency
Remarks:
    1. Can also count common strings in strings for strategy2
'''
def most_common_beststring(strings):
    best_string = filter(lambda x: "best" in x, strings)
    best_string = list(best_string)
    return Counter(best_string).most_common()

In [518]:
#Strategy 1 scan after the word won:
df2 = df[df['text'].str.contains(" won ") | df['text'].str.contains(" Won ")]
tweets = df2['text'].tolist()
keywords = []
#regexp = re.compile(r'[!?.;#]+(?=$|\s)')
regexp = re.compile(r'[!?.;#,@:]')

stop_words = ['at','and','on','because','but','tonight','before','lol','since','i','I']

#Parse Keywords. All phrase after 'won best' will be captured.
for tweet in tweets:
    words = tweet.lower().split()
    index_won = words.index("won")
    if words[index_won + 1] == "best":
        keyword = "best "
        curr_index = index_won + 2
        while curr_index != len(words):
            if regexp.search(words[curr_index]) or words[curr_index] in stop_words:
                #keyword += words[curr_index]
                keyword = keyword.strip()
                break
            keyword += words[curr_index] + " "
            curr_index += 1
        keywords.append(keyword)

strategy1 = common_phrases(keywords)
print(strategy1)

[(('best', 'director'), 41), (('best', 'supporting'), 37), (('best', 'original'), 37), (('best', 'actress'), 28), (('best', 'actor'), 22), (('best', 'picture'), 14), (('best', 'screenplay'), 12), (('best', 'motion'), 11), (('best', 'tv'), 9), (('best', 'performance'), 8), (('best', 'animated'), 8), (('best', 'song'), 7), (('best', 'foreign'), 7), (('best', 'dressed'), 5), (('best', 'movie'), 5), (('best', 'drama'), 4), (('best', 'comedy'), 4), (('best', 'skin'), 1), (('best', 'blurry'), 1), (('best', 'life'), 1), (('best', 'anal'), 1), (('best', 'husband'), 1), (('best', 'film'), 1), (('best', 'pic'), 1), (('best', 'original', 'song'), 23), (('best', 'director', 'for'), 13), (('best', 'supporting', 'actor'), 11), (('best', 'supporting', 'actress'), 11), (('best', 'actor', 'in'), 11), (('best', 'motion', 'picture'), 10), (('best', 'actress', 'for'), 9), (('best', 'performance', 'by'), 8), (('best', 'actress', 'in'), 6), (('best', 'actor', 'for'), 5), (('best', 'screenplay', 'for'), 5), 

In [519]:
#Strategy 2 scan before the word won:
df3 = df[df['text'].str.contains(" goes to ")]
tweets = df3['text'].tolist()
keywords2 = []
#Parse Backwards, stop at the word best
for tweet in tweets:
    words = nltk.word_tokenize(tweet.lower())
    index = words.index("to")
    if words[index - 1] == "goes":
        keyword = ""
        curr_index = index - 2
        while curr_index >= 0:
            if curr_index == 0 or words[curr_index] == "best":
                keyword = words[curr_index] + keyword
                break
            keyword = " " + words[curr_index] + keyword
            curr_index -= 1
        keywords2.append(keyword)

#print(keywords2)
strategy2 = common_phrases(keywords2)
print(strategy2)

most_common_beststring(keywords)

[(('best', 'actress'), 148), (('best', 'supporting'), 142), (('best', 'motion'), 115), (('best', 'actor'), 96), (('best', 'original'), 39), (('best', 'screenplay'), 36), (('best', 'director'), 36), (('best', 'tv'), 33), (('best', 'animated'), 17), (('best', 'foreign'), 15), (('best', 'speech'), 13), (('best', 'comedy'), 9), (('best', 'drama'), 8), (('best', 'performance'), 8), (('best', 'picture'), 8), (('best', 'miniseries'), 7), (('best', 'mini'), 5), (('best', 'movie'), 4), (('best', 'television'), 4), (('best', 'dressed'), 3), (('best', 'goldenglobes'), 3), (('best', 'editing'), 3), (('best', 'sound'), 2), (('best', 'use'), 2), (('best', 'musical'), 2), (('best', 'gay'), 2), (('best', 'presenters'), 2), (('best', 'tan'), 2), (('best', 'hair'), 2), (('best', 'humans'), 1), (('best', 'imitation'), 1), (('best', 'political'), 1), (('best', 'awkward'), 1), (('best', 'originalscore'), 1), (('best', 'painted'), 1), (('best', 'song'), 1), (('best', 'swallowed'), 1), (('best', 'acceptance'

[('best', 90),
 ('best director', 26),
 ('best supporting', 14),
 ('best actress', 12),
 ('best original', 12),
 ('best picture', 10),
 ("best original song for 'ben'", 9),
 ('best director for', 8),
 ('best original song for', 7),
 ('best performance by an actress in a supporting', 6),
 ('best actor', 5),
 ('best original song', 5),
 ('best screenplay', 5),
 ('best animated feature', 5),
 ('best supporting actor for django', 4),
 ('best drama', 4),
 ('best tv', 4),
 ('best song', 4),
 ('best movie', 4),
 ('best motion picture', 4),
 ('best supporting actor', 3),
 ('best supporting actress', 3),
 ('best foreign film', 3),
 ('best foreign', 3),
 ('best actor in a comedy or', 3),
 ('best actress for silver linings', 2),
 ('best supporting actress for les', 2),
 ('best screenplay for django', 2),
 ('best screenplay for', 2),
 ('best actress for keeping up with the', 2),
 ('best actress in a tv series drama (and probably many best dressed', 2),
 ('best comedy or musical', 2),
 ('best actor

In [520]:
common_strings = most_common_beststring(keywords2)

In [521]:
'''
Input: gram sets
Output: phrases with frequency
'''
def accumulate_votes(grams1, grams2, common_strings):
    awards = {}
    '''
    for gram in grams1:
        untokenize = ' '.join(gram[0])
        awards[untokenize] = len(gram[0]) * gram[1]
    for gram in grams2:
        untokenize = ' '.join(gram[0])
        if untokenize in awards:
            awards[untokenize] = awards[untokenize] * 2.5
        else:
            awards[untokenize] = len(gram[0]) * gram[1]
    '''
    for string in common_strings:
        first_two = ' '.join(nltk.word_tokenize(string[0])[:2])
        if string[0] in awards:
            awards[string[0]] = awards[string[0]] * 2.5
        elif first_two in awards:
            awards[string[0]] = awards[first_two] * string[1]
        else:
            awards[string[0]] = len(string[0]) * string[1]
    return sorted(awards.items(), key = lambda x: x[1], reverse = True)
votes = accumulate_votes(strategy1, strategy2, common_strings)
#Need a way to combine similar categories
print(len(votes))

259


In [522]:
collocation_words = {
    "tv":"television",
    "pic":"picture",
    "for":"-",
    "in":"-",
    'or':'/',
    'of':'-'
}

skip_words = ['a']

paraphrase = [',','@','(',')','#']

def gram_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        for word in gram[0]:
            if word in collocation_words:
                word = collocation_words[word]
            if word in skip_words:
                break
            if word not in paraphrase:
                word_list.append(word)
        word_tuple = tuple(word_list)
        gram_tuple = (word_tuple,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

def sticky_word_string(phrase):
    phrase_list = nltk.word_tokenize(phrase)
    token_list = []
    for word in phrase_list:
        flag = 0
        for i,character in enumerate(word):
            if character == '/' and i != 0:
                token_list.append(word[0:i])
                token_list.append(word[i])
                flag = i
                if i != len(word)-1:
                    token_list.append(word[i+1:len(word)])
                break
        if flag == 0 and word not in skip_words:
            token_list.append(word)
    clean_string = ' '.join(token_list)
    return clean_string


def string_cleaning(grams):
    new_grams = []
    for gram in grams:
        word_list = []
        clean_gram = sticky_word_string(gram[0])
        temp_list = nltk.word_tokenize(clean_gram)
        for word in temp_list:
            if word in collocation_words:
                word = collocation_words[word]
            if word not in paraphrase:
                word = word.strip()
                word_list.append(word)
        phrase = ' '.join(word_list)
        gram_tuple = (phrase,gram[1])
        new_grams.append(gram_tuple)
    return new_grams

#for vote in votes:
#print(strategy1)
new1 = gram_cleaning(strategy1)
new2 = gram_cleaning(strategy2)
new3 = string_cleaning(common_strings)
votes = accumulate_votes(new1, new2, new3)
#Need a way to combine similar categories
print(len(votes))
votes

223


[('best director - motion picture', 30468.75),
 ('best actress - television comedy / musical', 7612.5),
 ('best actress - motion picture drama', 6240.0),
 ('best motion picture drama', 5875.0),
 ('best screenplay - motion picture', 5062.5),
 ('best actress - television series drama', 3360.0),
 ('best actress - mini-series / television movie', 2496),
 ('best actress - motion picture comedy / musical', 2400.0),
 ('best actress drama', 2400.0),
 ('best screenplay motion picture', 2025.0),
 ('best actress - comedy / musical', 1920.0),
 ('best supporting actor motion picture', 1764),
 ('best supporting actress television series miniseries / television movie',
  1562),
 ('best supporting actress - motion picture', 1200.0),
 ('best actor - motion picture drama', 990.0),
 ('best actress television drama', 960),
 ('best actress television series comedy', 768),
 ('best actor - miniseries / television movie', 714),
 ('best motion picture comedy / musical', 675.0),
 ('best actor - motion picture c

In [523]:
words_pattern = '[a-zA-Z]+'
resolution_list = []
awards_dict = {}
for i,vote in enumerate(votes):
    words = re.findall(words_pattern, vote[0], flags=re.IGNORECASE)
    words = ' '.join(words)
    if words in awards_dict:
        awards_dict[words].append(i)
    else:
        awards_dict[words] = [i]
#pprint(awards_dict)

new_list = []
for key, val in awards_dict.items():
    if len(val) == 1:
        new_list.append(list(votes[val[0]]))
    else:
        sum = 0
        for i in val:
            sum+=votes[i][1]
        new_list.append([votes[val[0]][0],sum])
new_list



[['best director - motion picture', 30468.75],
 ['best actress - television comedy / musical', 7804.5],
 ['best actress - motion picture drama', 7872.0],
 ['best motion picture drama', 6106.5],
 ['best screenplay - motion picture', 7492.5],
 ['best actress - television series drama', 3552.0],
 ['best actress - mini-series / television movie', 3072],
 ['best actress - motion picture comedy / musical', 2400.0],
 ['best actress drama', 2880.0],
 ['best actress - comedy / musical', 2112.0],
 ['best supporting actor motion picture', 1954],
 ['best supporting actress television series miniseries / television movie',
  1562],
 ['best supporting actress - motion picture', 1276.0],
 ['best actor - motion picture drama', 1050.0],
 ['best actress television drama', 1536],
 ['best actress television series comedy', 768],
 ['best actor - miniseries / television movie', 714],
 ['best motion picture comedy / musical', 807.0],
 ['best actor - motion picture comedy / musical', 660.0],
 ['best actor tel

In [528]:
'''
TF-IDF
'''
pd. set_option('display.max_columns', None)
pd. set_option('display.max_rows', None)

corpus = []
for item in new_list:
    corpus.append(item[0][1:])

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)

keyword_list = ['actress','supporting','actor','director','drama','musical','television']
for keyword in keyword_list:
    tfidf[keyword] = tfidf[keyword]*2
#tfidf.iloc[0]

In [529]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
threshold = 0.9
final_list = []
cs = cosine_similarity(tfidf,tfidf)
#ans = pairwise_distances(tfidf,tfidf,'chebyshev')
for v,score_row in enumerate(cs):
    similarity = []
    for i, score in enumerate(score_row):
        if score > threshold and v!=i:
            similarity.append([new_list[v],score,new_list[i]])
    final_list.append(similarity)
final_list

[[[['best director - motion picture', 30468.75],
   0.9206170609924232,
   ['best director', 195]]],
 [[['best actress - television comedy / musical', 7804.5],
   0.9613746091643035,
   ['best actress - television series comedy / musical', 192]],
  [['best actress - television comedy / musical', 7804.5],
   0.9613746091643035,
   ['best television series actress - comedy / musical', 122.5]]],
 [],
 [[['best motion picture drama', 6106.5],
   0.9008567621764881,
   ['best picture - drama', 72]]],
 [[['best screenplay - motion picture', 7492.5],
   1.0,
   ['best motion picture screenplay', 30]]],
 [[['best actress - television series drama', 3552.0],
   0.9583518874623275,
   ['best actress television drama', 1536]],
  [['best actress - television series drama', 3552.0],
   0.9583518874623275,
   ['best drama television actress', 58]],
  [['best actress - television series drama', 3552.0],
   0.9583518874623275,
   ['best television drama actress', 29]]],
 [],
 [[['best actress - motion

In [530]:
'''
Weighted score
'''
sum_list = []
for i,a in enumerate(cs):
    sum_list.append([a.sum(),new_list[i],i])

def myFunc(e):
  return e[0]*e[0]*e[1][1]

sum_list.sort(reverse=True,key=myFunc)
sum_list

[[44.032671900265214,
  ['best actress - television comedy / musical', 7804.5],
  1],
 [33.542953421632255, ['best actress - motion picture drama', 7872.0], 2],
 [43.59744238991609, ['best actress - television series drama', 3552.0], 5],
 [38.35989166774809,
  ['best actress - mini-series / television movie', 3072],
  6],
 [25.446125492227694, ['best motion picture drama', 6106.5], 3],
 [35.852040588976855,
  ['best actress - motion picture comedy / musical', 2400.0],
  7],
 [31.70520953585776, ['best actress drama', 2880.0], 8],
 [42.66676833837591,
  ['best supporting actress television series miniseries / television movie',
   1562],
  11],
 [42.06233519597323, ['best actress television drama', 1536], 14],
 [34.20581167043392, ['best actress - comedy / musical', 2112.0], 9],
 [8.707936407959867, ['best director - motion picture', 30468.75], 0],
 [44.677771042243386,
  ['best actor television series - comedy / musical', 868.0],
  19],
 [29.62950008071791, ['best supporting actor moti

In [527]:
'''
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("best actress-television series drama")
displacy.render(doc, style="dep")
'''


'\nimport spacy\nfrom spacy import displacy\n\nnlp = spacy.load("en_core_web_sm")\ndoc = nlp("best actress-television series drama")\ndisplacy.render(doc, style="dep")\n'