In [1]:
from bs4 import BeautifulSoup, SoupStrainer

import nltk
from nltk import word_tokenize, pos_tag, RegexpParser
from nltk.tokenize import sent_tokenize

import re
import requests

### MacVittie - Homework 5
**1)** Compile a list of static links (permalinks) to individual user movie reviews from one particular website. This will be your working dataset for this assignment, as well as for assignments 7 and 8, which together will make up your semester project.
    
It does not matter if you use a crawler or if you manually collect the links, but you will need at least 100 movie review links. Note that, as of this writing, the robots.txt file of IMDB.com allows the crawling of user reviews.
    
Each link should be to a web page that has only one user review of only one movie, e.g., the user review permalinks on the IMDB site.
    
Choose reviews of movies that are all in the same genre, e.g., sci-fi, mystery, romance, superhero, etc.
    
Make sure your collection includes reviews of several movies in your chosen genre and that it includes a mix of negative and positive reviews.

```
Genre is a funny thing, and ill-defined. The films I have selected are from my personal top 10 (and honorable mention), 
as I feel that the films an individual selects for such a list can be deeply telling of a person's personality and 
outlook on life. Near as I can tell, each of these could be reasonably classified as a "drama:" while Megamind perhaps 
leans more towards the comedic end of the spectrum, 3:10 to Yuma almost certainly also a western, the remainder are 
all certainly and deeply within the classification of a drama.

For these films I have chosen, there are 274 reviews. While I have not personally vetted that each has at least a 
positive and negative review, I can be fairly certain - knowing what I know of film, and how the general populace 
responds to such works - that each has at least one of each.
```

**2)** Extract noun phrase (NP) chunks from your reviews using the following procedure:

In Python, use BeautifulSoup to grab the main review text from each link.
    
Next run each review text through a tokenizer, and then try to NP-chunk it with a shallow parser.
    
You probably will have too many unknown words, owing to proper names of characters, actors, and so on that are not in your working dictionary. Make sure the main names that are relevant to the movies in your collection of reviews are added to the working lexicon, and then run the NP chunker again.

```
The reviews have been tokenized and tagged, then placed through a parser. The output seems reasonable.
```

**3)** Output all the chunks in a single list for each review, and submit that output for this assignment.

Also submit a brief written summary of what you did (describe your selection of genre, your source of reviews, how many you collected, and by what means).

```
As mentioned, I have specifically chosen my top 10 list, which has - somewhat surprisingly - remained unchanged 
for most of this decade, though I have not been able to indulge this hobby as much as I was able to in my 20's as 
of late. I used IMDB as a reference, specifically calling out the initial page for user reviews. I did not crawl 
past the first page for any film, as the resulting 274 reviews seemed sufficient.
```

In [2]:
# first, set up our movie review urls
# movie list is my top 10 + honorable mention: https://www.imdb.com/list/ls050974899/

review_urls = {
    'last_night': 'https://www.imdb.com/title/tt1294688/reviews?ref_=tt_urv',
    'vanilla_sky': 'https://www.imdb.com/title/tt0259711/reviews?ref_=tt_urv',
    'lost_in_translation': 'https://www.imdb.com/title/tt0335266/reviews?ref_=tt_urv',
    'never_let_me_go': 'https://www.imdb.com/title/tt1334260/reviews?ref_=tt_urv',
    'gattaca': 'https://www.imdb.com/title/tt0119177/reviews?ref_=tt_urv',
    'american_beauty': 'https://www.imdb.com/title/tt0169547/reviews?ref_=tt_urv',
    'megamind': 'https://www.imdb.com/title/tt1001526/reviews?ref_=tt_urv',
    'man_from_earth': 'https://www.imdb.com/title/tt0756683/reviews?ref_=tt_urv',
    'another_earth': 'https://www.imdb.com/title/tt1549572/reviews?ref_=tt_urv',
    'timer': 'https://www.imdb.com/title/tt1179794/reviews?ref_=tt_urv',
    '310_to_yuma': 'https://www.imdb.com/title/tt0381849/reviews?ref_=tt_urv',
}

In [12]:
def get_txt(url):
    return requests.get(url).text


def get_links_from(html):
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a', href=True))
    urls = [str(tag.attrs['href']) for tag in tags]
    return urls


def get_review_urls(links):
    url_template = 'https://www.imdb.com{}'
    return [url_template.format(link) for link in links]


def link(link):
    if '/review/' in link:
        return True
    return False


def get_links(links):
    links = filter(link, links)
    unique_links = set(links)
    return list(unique_links)


def strain(name, attrs):
    if name == 'div' and dict(attrs).get('class', None) == 'content':
        return True
    return False


def clean_txt(text):
    return re.split('\\n\\n\s+\d+ out of \d+', text)[0]


def get_review_from_url(url):
    html = get_txt(url)
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(strain))
    review = clean_txt(tags.text)
    return review


def get_review_from_site(url):
    reviews = []

    reviews_home_text = get_txt(url)
    all_links = get_links_from(reviews_home_text)
    links = get_links(all_links)

    review_urls = get_review_urls(links)
    for url in review_urls:
        reviews.append(get_review_from_url(url))
    return reviews


def get_reviews_from_all_sites(url_list):
    all_reviews = []
    review_titles = url_list.keys()
    for title in review_titles:
        review_url = review_urls[title]
        all_reviews = all_reviews + get_review_from_site(review_url)
    return all_reviews


def sentences_to_words(sentences):
    return [word_tokenize(s) for s in sentences]


def alphanumerize(words):
    regex_matcher = '^[a-zA-Z0-9]+$'
    return [w for w in words if re.match(regex_matcher, w)]


def lemmatize_word(word):
    stemmer = LancasterStemmer()
    lemma_word = stemmer.stem(word)
    if wordnet.synsets(lemma_word):
        return lemma_word
    else:
        return word    
    
    
def lemmatize_words(words):
    return [lemmatize_word(w) for w in words]


def remove_stops(words):
    return [w for w in words if w not in stop_words]


def preprocess_documents(docs):
    d_format_words = [d.lower().strip() for d in docs]
    docs_of_words = sentences_to_words(d_format_words)
    words_with_chars = [remove_stops(w) for w in docs_of_words]
    content_words = [alphanumerize(c) for c in words_with_chars]
    lemmatized = [lemmatize_words(w) for w in content_words]
    return lemmatized 


def topics_to_map(model, feature_names, no_top_words):
    map = []
    for topic_idx, topic in enumerate(model.components_):
        s = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        map.append(s)
    return map


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic {}:".format(topic_idx), end=' ')
        s = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        print (s)
        

def pos_tagging(sentence):
    cleaned_review = sentence.lower()
    tokenized_review = word_tokenize(cleaned_review)
    return pos_tag(tokenized_review)


def get_chunking_for_sentence(sentence):
    tagged_sentence = pos_tagging(sentence)

    grammar = "NP: {<DT>?<JJ>*<NN>}"

    cp = nltk.RegexpParser(grammar)
    result = cp.parse(tagged_sentence)
    return result


def get_chunking(review):
    sentences = sent_tokenize(review)
    result = [get_chunking_for_sentence(s) for s in sentences]
    return result        


def get_chunking_for_reviews(reviews):
    return [get_chunking(r) for r in reviews]


def get_noun_phrases_for_review(review):
    main_trees = get_chunking(review)
    subtrees = []
    for m in main_trees:
        for s in m.subtrees():
            if s.label() == 'NP':
                subtrees.append(s)
    
    return subtrees[1:]


def get_noun_phrases_for_reviews(reviews):
    return [get_noun_phrases_for_review(r) for r in reviews]

In [7]:
all_reviews = get_reviews_from_all_sites(review_urls)

In [8]:
print(len(all_reviews))

274


In [14]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\GnomeWorks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [16]:
noun_phrases = get_noun_phrases_for_reviews(all_reviews)

In [17]:
for n in noun_phrases[0:6]:
    for i in range(2):
        print(n[i])
    print()

(NP own/JJ copy.i/NN)
(NP the/DT storyline/NN)

(NP the/DT romantic/JJ dramas/NN)
(NP boring/NN)

(NP the/DT rome/NN)
(NP film/NN)

(NP belarus/NN)
(NP this/DT season/NN)

(NP i/NN)
(NP a/DT summary/NN)

(NP catch/NN)
(NP b-/NN)



In [18]:
noun_phrases

[[Tree('NP', [('own', 'JJ'), ('copy.i', 'NN')]),
  Tree('NP', [('the', 'DT'), ('storyline', 'NN')]),
  Tree('NP', [('the', 'DT'), ('movie', 'NN')]),
  Tree('NP', [('sensibility', 'NN')]),
  Tree('NP', [('that', 'DT'), ('kind', 'NN')]),
  Tree('NP', [('situation', 'NN')]),
  Tree('NP', [('the', 'DT'), ('sight', 'NN')]),
  Tree('NP', [('the', 'DT'), ('pain', 'NN')]),
  Tree('NP', [('misery', 'NN')]),
  Tree('NP', [('suffering', 'NN')]),
  Tree('NP', [('the', 'DT'), ('character', 'NN')]),
  Tree('NP', [('keira', 'JJ'), ('knightley', 'NN')]),
  Tree('NP', [('i', 'NN')]),
  Tree('NP', [('a', 'DT'), ('great', 'JJ'), ('actor', 'NN')]),
  Tree('NP', [('the', 'DT'), ('character', 'NN')]),
  Tree('NP', [('an', 'DT'), ('easy-going', 'JJ'), ('girl', 'NN')]),
  Tree('NP', [('ex-love', 'NN')]),
  Tree('NP', [('some', 'DT'), ('fun', 'NN')]),
  Tree('NP', [('type', 'NN')]),
  Tree('NP', [('girl', 'NN')]),
  Tree('NP', [('eva', 'NN')]),
  Tree('NP', [('character', 'NN')]),
  Tree('NP', [('a', 'DT'), ('