In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re 

import nltk;
from nltk.collocations import *;
from nltk.tokenize import word_tokenize;
from nltk.corpus import stopwords;
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk import ngrams

In [2]:
# Globally silence pandas' SettingWithCopyWarning for the rest of the notebook.
# NOTE(review): this hides the warning that later cells would raise when a
# column is assigned on a filtered frame (see the `df_e[...] = ...` cells);
# prefer taking an explicit `.copy()` of the slice over muting the check.
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Fetch the perceptron POS-tagger model used by `nltk.pos_tag` below.
# The trailing `;` suppresses the cell's return value in the notebook output.
# NOTE(review): `nltk.word_tokenize` and `stopwords.words` are also used in
# this notebook but no download for their data is visible here — confirm the
# corresponding NLTK resources are installed on a fresh environment.
nltk.download('averaged_perceptron_tagger');

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\keith\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Load the raw 2013 dataset; the path is relative to the kernel's working directory.
# Later cells read its 'text' column (presumably one tweet per row — confirm).
df_13 = pd.read_json("gg2013.json")

In [5]:
# NOTE(review): `pd.read_json` returns a DataFrame by default, so this re-wrap
# looks redundant — confirm and consider removing the cell.
df_13 = pd.DataFrame(df_13)

In [6]:
# Canonical award names, lower-cased, one per line. Order is significant:
# downstream cells iterate this list to build `award_dict`.
OFFICIAL_AWARDS = [
    # Honorary
    'cecil b. demille award',
    # Motion picture
    'best motion picture - drama',
    'best performance by an actress in a motion picture - drama',
    'best performance by an actor in a motion picture - drama',
    'best motion picture - comedy or musical',
    'best performance by an actress in a motion picture - comedy or musical',
    'best performance by an actor in a motion picture - comedy or musical',
    'best animated feature film',
    'best foreign language film',
    'best performance by an actress in a supporting role in a motion picture',
    'best performance by an actor in a supporting role in a motion picture',
    'best director - motion picture',
    'best screenplay - motion picture',
    'best original score - motion picture',
    'best original song - motion picture',
    # Television
    'best television series - drama',
    'best performance by an actress in a television series - drama',
    'best performance by an actor in a television series - drama',
    'best television series - comedy or musical',
    'best performance by an actress in a television series - comedy or musical',
    'best performance by an actor in a television series - comedy or musical',
    'best mini-series or motion picture made for television',
    'best performance by an actress in a mini-series or motion picture made for television',
    'best performance by an actor in a mini-series or motion picture made for television',
    'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
    'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
]


In [7]:
# Award names with every run of non-alphanumeric characters collapsed to one space.
# NOTE: a function also called `clean_awards` is defined further down this
# notebook and rebinds this name once that cell has run.
clean_awards = [
    re.sub("[^a-zA-Z0-9]+", ' ', award_name)
    for award_name in OFFICIAL_AWARDS
]

In [8]:
# Build two views of the cleaned award names in a single pass:
#   award_dict    : award name -> list of its POS tags
#   pos_structure : [tag, tag, ..., award name] rows, sorted lexicographically
award_dict = {}
pos_structure = []

for award_name in clean_awards:
    word_tag_pairs = nltk.pos_tag(nltk.word_tokenize(award_name))
    pos_tags = [tag for _word, tag in word_tag_pairs]

    award_dict[award_name] = pos_tags
    # The row gets its own list so later edits to it cannot touch award_dict.
    pos_structure.append(pos_tags + [award_name])

pos_structure.sort()

In [360]:
def cleanTweets(tweet):
    """Return `tweet` with digits, handles, stop words and punctuation tokens removed.

    Digits are deleted first, then the text is tokenized with a
    `TweetTokenizer` (handles stripped, elongated characters reduced, case
    preserved). Tokens that are English stop words, single punctuation
    characters or one of a few URL/retweet fragments are dropped, and the
    remaining tokens are joined with single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by spaces.
    """
    # Imported locally: the notebook's import cell does not import `string`,
    # so the original body raised NameError on `string.punctuation`.
    import string

    tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=True)

    # A set gives O(1) membership tests; contents match the original list.
    # NOTE(review): the stop-word list is lower-case while tokens keep their
    # case, so capitalised stop words (e.g. "The") are not removed — confirm
    # this is intended.
    stop = (
        set(stopwords.words('english'))
        | set(string.punctuation)
        | {'t.co', 'http', 'https', '...', '..', ':\\', 'RT', '#'}
    )

    strip_nums = re.sub(r"\d+", "", tweet)  # raw string avoids an invalid-escape warning
    tokenized = tt.tokenize(strip_nums)

    return ' '.join(term for term in tokenized if term not in stop)

In [361]:
def clean_awards(text):
    """Normalise text for award matching: drop hashtags and punctuation, lower-case.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        `text` with hashtags removed, every character other than ASCII
        letters and spaces removed, and the result lower-cased. Runs of
        spaces are not collapsed.
    """
    # Remove whole hashtags together with the whitespace before them. The
    # `^` alternative also catches a hashtag at the very start of the text,
    # which the original whitespace-only pattern left behind as a plain word.
    text = re.sub(r"(?:^|\s)#\w+", "", text)
    # Keep only ASCII letters and spaces so the text can be tokenized on whitespace.
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return text.lower()

In [9]:
def clean_entity(text):
    """Strip hashtags and punctuation from text while preserving letter case.

    Case is kept so that capitalisation remains available to the tagging and
    named-entity steps applied to the result.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        `text` with hashtags removed and every character other than ASCII
        letters and spaces removed. Runs of spaces are not collapsed.
    """
    # Remove whole hashtags together with the whitespace before them. The
    # `^` alternative also catches a hashtag at the very start of the text,
    # which the original whitespace-only pattern left behind as a plain word.
    text = re.sub(r"(?:^|\s)#\w+", "", text)
    # Keep only ASCII letters and spaces so the text can be tokenized on whitespace.
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return text

In [12]:
def find_tags(tweet):
    """Tokenize `tweet` with a default `TweetTokenizer` and POS-tag the tokens.

    Returns whatever `nltk.pos_tag` produces for the token list.
    """
    tweet_tokens = TweetTokenizer().tokenize(tweet)
    return nltk.pos_tag(tweet_tokens)

In [363]:
def pos_search(tags, chunk_gram, label):
    """Return the words of the last chunk labelled `label`, or "No Chunk".

    Parameters
    ----------
    tags : list of (word, tag) tuples
        POS-tagged tokens, as passed to `RegexpParser.parse`.
    chunk_gram : str
        Chunk grammar handed to `nltk.RegexpParser`.
    label : str
        Chunk label to look for in the parsed tree.

    Returns
    -------
    str
        Space-joined words of the matching chunk. If several chunks carry
        `label`, the last one visited by `subtrees()` wins. The sentinel
        string "No Chunk" is returned when nothing matches.
    """
    potentials = "No Chunk"
    chunked = nltk.RegexpParser(chunk_gram).parse(tags)

    for subtree in chunked.subtrees():
        if subtree.label() == label:
            # The original called `untag`, which no visible import brings into
            # scope. `leaves()` yields the same (word, tag) pairs for a flat
            # chunk and also copes with nested chunks.
            potentials = ' '.join(word for word, _tag in subtree.leaves())

    return potentials

In [10]:
def ne_search(tags):
    """Run NLTK's named-entity chunker over POS-tagged tokens.

    Parameters
    ----------
    tags : list of (word, tag) tuples
        POS-tagged tokens, e.g. the output of `find_tags`.

    Returns
    -------
    The result of `nltk.ne_chunk(tags)`, returned unchanged.
    """
    # The original also assigned an unused "No Entity" placeholder; it was
    # never returned, so it has been dropped.
    return nltk.ne_chunk(tags)
    

In [364]:
def filter_df(df, label):
    """Keep the rows where `label` holds a real chunk and count the values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a column named `label`.
    label : str
        Column whose entries are either a chunk string or the sentinel
        "No Chunk".

    Returns
    -------
    data : pandas.DataFrame
        A new single-column frame (`label` only) holding the rows of `df`
        whose value is not "No Chunk". `df` itself is not modified.
    freq : FreqDist
        Frequency distribution over the values in `data[label]`.
    """
    # Select rows and the single column in one `.loc` call. The original
    # dropped the other columns with a positional `axis` argument (removed in
    # pandas 2.0) and `inplace=True` on a slice of `df`.
    data = df.loc[df[label] != "No Chunk", [label]]
    freq = FreqDist(list(data[label]))

    return data, freq

In [365]:
def simple_test(dic, freq):
    """Report, for each key of `dic`, its entry in `freq` or "Not found".

    Prints one `[key, value]` pair per line and returns the list of pairs,
    in the iteration order of `dic.keys()`.
    """
    report = [
        [name, freq[name] if name in freq else "Not found"]
        for name in dic.keys()
    ]

    print(*report, sep='\n')
    return report

In [13]:
# Strip hashtags and punctuation from every tweet (case is preserved).
df_13['text'] = df_13['text'].apply(clean_entity)

# Keep only tweets mentioning "best". The explicit .copy() gives df_e its own
# data, so adding columns below never writes into a slice of df_13.
df_e = df_13[df_13['text'].str.contains("best")].copy()

# POS-tag each remaining tweet.
df_e['tags'] = df_e['text'].apply(find_tags)

In [18]:
# Fetch the data needed by `nltk.ne_chunk`: the chunker model and the word list.
# NOTE(review): downloads are spread across several cells; consider grouping
# them in one setup cell near the top of the notebook.
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\keith\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\keith\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [19]:
# Named-entity chunk every tagged tweet; each cell holds what ne_search returns.
df_e['entity'] = df_e['tags'].apply(ne_search)

In [21]:
# Preview the first few named-entity results as a sanity check.
df_e['entity'].head()

16     [(Why, WRB), (did, VBD), (I, PRP), (just, RB),...
340    [(RT, NNP), (ZapitRick, NNP), (Can, NNP), [(Ke...
546    [(the, DT), (best, JJS), (phrase, NN), (I, PRP...
694    [[(Nicole, NNP)], [(Kidman, NNP)], (is, VBZ), ...
756    [(RT, NNP), (MovieMayor, NNP), (My, NNP), (gre...
Name: entity, dtype: object

In [22]:
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'spacy'", so spaCy and the
# `en_core_web_sm` model package must be installed in the kernel's
# environment before this cell can run.
# NOTE(review): these imports sit at the bottom of the notebook; consider
# moving them to the import cell at the top so dependencies are visible up front.
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English pipeline from the imported model package.
nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'spacy'