In [12]:
from bs4 import BeautifulSoup, SoupStrainer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering, KMeans

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import LancasterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re
import requests

import pandas as pd
import seaborn as sn
import numpy as np

import matplotlib.pyplot as plt

### MacVittie - Homework 8
Perform a vocabulary-based sentiment analysis of the movie reviews you used in homework 5 and homework 7, by doing the following:

**1)** In Python, load one of the sentiment vocabularies referenced in the textbook, and run the sentiment analyzer as explained in the corresponding reference. Add words to the sentiment vocabulary, if you think you need to, to better fit your particular text collection.

```
Done.
```

**2)** For each of the clusters you created in homework 7, compute the average, median, high, and low sentiment scores for each cluster. Explain whether you think this reveals anything interesting about the clusters.

```
So these are not the exact clusters I used in homework 7, going with a slightly different approach, mostly to see if
including the custom stop words - getting rid of actor and character names, and the like - has an impact. I somewhat
predicted that it would, as you can see from the word list below that the clusters are heavily emphasizing a lot of 
the words I had decided to preclude from the earlier analysis.

Cluster 0: 	mean: 0.05 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 1: 	mean: 0.09 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 2: 	mean: 0.04 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 3: 	mean: -0.005 	median: 0.0 	max: 0.4 	min: -0.5
Cluster 4: 	mean: 0.03 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 5: 	mean: 0.04 	median: 0.0 	max: 0.6 	min: -0.6
Cluster 6: 	mean: 0.2 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 7: 	mean: 0.08 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 8: 	mean: 0.02 	median: 0.0 	max: 0.6 	min: -0.5
Cluster 9: 	mean: 0.07 	median: 0.0 	max: 0.6 	min: 0e+00

I think what this is indicating is that a lot of these films have a heavy emphasis on characters. While some have
words with negative sentiment, like 'killing,' quite a few have generally neutral terms.

```

In [2]:
# first, set up our movie review urls
# movie list is my top 10 + honorable mention: https://www.imdb.com/list/ls050974899/

review_urls = {
    'last_night': 'https://www.imdb.com/title/tt1294688/reviews?ref_=tt_urv',
    'vanilla_sky': 'https://www.imdb.com/title/tt0259711/reviews?ref_=tt_urv',
    'lost_in_translation': 'https://www.imdb.com/title/tt0335266/reviews?ref_=tt_urv',
    'never_let_me_go': 'https://www.imdb.com/title/tt1334260/reviews?ref_=tt_urv',
    'gattaca': 'https://www.imdb.com/title/tt0119177/reviews?ref_=tt_urv',
    'american_beauty': 'https://www.imdb.com/title/tt0169547/reviews?ref_=tt_urv',
    'megamind': 'https://www.imdb.com/title/tt1001526/reviews?ref_=tt_urv',
    'man_from_earth': 'https://www.imdb.com/title/tt0756683/reviews?ref_=tt_urv',
    'another_earth': 'https://www.imdb.com/title/tt1549572/reviews?ref_=tt_urv',
    'timer': 'https://www.imdb.com/title/tt1179794/reviews?ref_=tt_urv',
    '310_to_yuma': 'https://www.imdb.com/title/tt0381849/reviews?ref_=tt_urv',
}

In [3]:
stop_list = []

In [75]:
def get_txt(url):
    return requests.get(url).text


def get_links_from(html):
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a', href=True))
    urls = [str(tag.attrs['href']) for tag in tags]
    return urls


def get_review_urls(links):
    url_template = 'https://www.imdb.com{}'
    return [url_template.format(link) for link in links]


def link(link):
    if '/review/' in link:
        return True
    return False


def get_links(links):
    links = filter(link, links)
    unique_links = set(links)
    return list(unique_links)


def strain(name, attrs):
    if name == 'div' and dict(attrs).get('class', None) == 'content':
        return True
    return False


def clean_txt(text):
    return re.split('\\n\\n\s+\d+ out of \d+', text)[0]


def get_review_from_url(url):
    html = get_txt(url)
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(strain))
    review = clean_txt(tags.text)
    return review


def get_review_from_site(url):
    reviews = []

    reviews_home_text = get_txt(url)
    all_links = get_links_from(reviews_home_text)
    links = get_links(all_links)

    review_urls = get_review_urls(links)
    for url in review_urls:
        reviews.append(get_review_from_url(url))
    return reviews


def get_reviews_from_all_sites(url_list):
    all_reviews = []
    review_titles = url_list.keys()
    for title in review_titles:
        review_url = review_urls[title]
        all_reviews = all_reviews + get_review_from_site(review_url)
    return all_reviews

In [5]:
all_reviews = get_reviews_from_all_sites(review_urls)

In [79]:
# Create a cluster from titles (helped by https://pythonprogramminglanguage.com/kmeans-text-clustering/)
def run_kmeans(data, true_k, n_terms=20):
    def create_vectorization(text):
        vectorizor = TfidfVectorizer(stop_words='english')
        return [vectorizor, vectorizor.fit_transform(text)]
    
    vectorizor, X = create_vectorization(data)

    def create_cluster_model(X):
        model = KMeans(n_clusters=true_k, max_iter=100)
        model.fit(X.toarray())

        return model
    
    model = create_cluster_model(X)
    
    def get_top_terms(model, vectorizor, true_k, n_terms):
        order_centroids = model.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizor.get_feature_names()
        clusters = []
        for i in range(true_k):
            cluster_terms = []
            for ind in order_centroids[i, :24]:
                cluster_terms.append(terms[ind])
            clusters.append(cluster_terms)
        return clusters

    cluster_top_terms = get_top_terms(model, vectorizor, true_k, n_terms)
    
    def print_top_terms():
        print("Top Terms/Cluster:")
        for index, cluster in enumerate(cluster_top_terms):
            print()
            print('Cluster', index)
            for term in cluster:
                print(' %s' % term)
    
    print_top_terms()
    
    return cluster_top_terms

top_terms = run_kmeans(all_reviews, 10)

Top Terms/Cluster:

Cluster 0
 cruise
 david
 film
 movie
 tom
 dream
 vanilla
 crowe
 films
 life
 sky
 best
 mind
 aames
 cameron
 like
 diaz
 lee
 think
 place
 penelope
 cruz
 takes
 makes

Cluster 1
 movie
 story
 film
 better
 vanilla
 good
 original
 watched
 ending
 concept
 seen
 really
 great
 like
 just
 sky
 thought
 los
 abre
 ojos
 night
 think
 felt
 watching

Cluster 2
 beauty
 film
 movie
 american
 life
 lester
 like
 spacey
 people
 just
 think
 sam
 really
 wife
 characters
 way
 time
 mendes
 ricky
 scene
 look
 japanese
 don
 bob

Cluster 3
 earth
 rhoda
 film
 marling
 planet
 cahill
 john
 car
 brit
 accident
 girl
 mapother
 science
 william
 mirror
 story
 fiction
 fi
 family
 movie
 sci
 like
 years
 just

Cluster 4
 gattaca
 vincent
 movie
 hawke
 film
 law
 ethan
 jude
 future
 society
 genetic
 thurman
 science
 jerome
 uma
 story
 niccol
 way
 subtle
 fiction
 great
 discrimination
 time
 america

Cluster 5
 megamind
 metro
 villain
 farrell
 man
 despica

In [80]:
sid = SentimentIntensityAnalyzer()

In [81]:
sid.polarity_scores('):{')

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.5106}

In [82]:
all_scores = []

for terms in top_terms:
    scores = []
    for t in terms:
        score = sid.polarity_scores(t)
        scores.append(score['compound'])
    all_scores.append(np.array(scores))

for i, scores in enumerate(all_scores):
    print('Cluster {}: '.format(i), end='')

    mean = scores.mean()
    print('\tmean: {0:.1}'.format(mean), end=' ')

    median = np.median(scores)
    print('\tmedian: {}'.format(median), end=' ')

    _max = scores.max()
    print('\tmax: {0:.1}'.format(_max), end=' ')

    _min = scores.min()
    print('\tmin: {0:.1}'.format(_min))

Cluster 0: 	mean: 0.05 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 1: 	mean: 0.09 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 2: 	mean: 0.04 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 3: 	mean: -0.005 	median: 0.0 	max: 0.4 	min: -0.5
Cluster 4: 	mean: 0.03 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 5: 	mean: 0.04 	median: 0.0 	max: 0.6 	min: -0.6
Cluster 6: 	mean: 0.2 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 7: 	mean: 0.08 	median: 0.0 	max: 0.6 	min: 0e+00
Cluster 8: 	mean: 0.02 	median: 0.0 	max: 0.6 	min: -0.5
Cluster 9: 	mean: 0.07 	median: 0.0 	max: 0.6 	min: 0e+00


In [83]:
for i in range(0, 10):
    for j in range(0, 24):
        print(i,j," ",all_scores[i][j]," ",top_terms[i][j])
    print("\n")

0 0   0.0   cruise
0 1   0.0   david
0 2   0.0   film
0 3   0.0   movie
0 4   0.0   tom
0 5   0.25   dream
0 6   0.0   vanilla
0 7   0.0   crowe
0 8   0.0   films
0 9   0.0   life
0 10   0.0   sky
0 11   0.6369   best
0 12   0.0   mind
0 13   0.0   aames
0 14   0.0   cameron
0 15   0.3612   like
0 16   0.0   diaz
0 17   0.0   lee
0 18   0.0   think
0 19   0.0   place
0 20   0.0   penelope
0 21   0.0   cruz
0 22   0.0   takes
0 23   0.0   makes


1 0   0.0   movie
1 1   0.0   story
1 2   0.0   film
1 3   0.4404   better
1 4   0.0   vanilla
1 5   0.4404   good
1 6   0.3182   original
1 7   0.0   watched
1 8   0.0   ending
1 9   0.0   concept
1 10   0.0   seen
1 11   0.0   really
1 12   0.6249   great
1 13   0.3612   like
1 14   0.0   just
1 15   0.0   sky
1 16   0.0   thought
1 17   0.0   los
1 18   0.0   abre
1 19   0.0   ojos
1 20   0.0   night
1 21   0.0   think
1 22   0.0   felt
1 23   0.0   watching


2 0   0.5859   beauty
2 1   0.0   film
2 2   0.0   movie
2 3   0.0   american
2 4 