## Reviews

In [1]:
import re
import time
import json
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from bs4.element import Tag


def get_all_disney_titles() -> pd.DataFrame:
    """ Get all Disney titles and their release dates """
    url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Animation_Studios_films'
    df = pd.read_html(url, header=0)[1]
    df['Year'] = df.apply(lambda row: row['Release date'].split(",")[-1].strip(), 1)
    return df


def get_all_pixar_titles() -> pd.DataFrame:
    """ Get all Pixar titles and their release dates """
    url = "https://en.wikipedia.org/wiki/List_of_Pixar_films"
    df = pd.read_html(url, header=0)[0]
    df = df.loc[df.Film != "Released films", :]
    df = df.iloc[:22]
    df['Year'] = df.apply(lambda row: row['Release date'].split(",")[-1].strip(), 1)
    return df


def match_years(search_result: Tag, year: str) -> bool:
    """ Check if the year of a movie search matches (within 2 years) the year of the search result"""
    string = search_result.text
    year = int(year[-4:])
    
    # Extract year from string 
    string_year = re.sub('[^0-9]',' ', string)  # Keep numbers
    string_year = re.sub(' +', ' ', string_year).strip()  # Remove duplicate whitespaces
    string_year = int(string_year.split(" ")[-1])
    
    if abs(string_year - year) <= 2:
        return True
    
    return False


def scrape_imdb_urls(df: pd.DataFrame) -> list:
    """ Scrape IMDB urls of all the movies """
    titles = list(df.Film.values)
    search_terms = (df.Film + "%20" + df.Year).values
    urls = [None for _ in range(len(search_terms))]
    
    # Search for the movie and extract the first result
    for index, search_term in tqdm(enumerate(search_terms)):

        # Get search result page
        search_url = f"https://www.imdb.com/find?q={search_term}&s=tt&ttype=ft&ref_=fn_ft"
        res = requests.get(search_url).text
        soup = BeautifulSoup(res,'lxml')

        # Extract best search result
        for result in soup.find_all("td", class_="result_text"):
            if match_years(result, search_term):
                url = result.find_all("a", href=True)[0]["href"]
                url = f"https://www.imdb.com{url}reviews"
                urls[index] = url
                break
            
    # Need to manually add saludos amigos as imdb's search engine cannot find it
    urls = {title: url if url else "https://www.imdb.com/title/tt0036326/reviews" for url, title in zip(urls, titles)}
    
    # Also cannot find onward correctly...
    if "Onward" in urls:
        urls["Onward"] = "https://www.imdb.com/title/tt7146812/reviews"
    
    return urls


def extract_reviews(soup: BeautifulSoup):
    """ Extract the title and reviews of a BeautifulSoup IMDB page """
    names = []
    reviews = []
    
    for elem in soup.find_all(class_='imdb-user-review'):
        name = elem.find(class_='title').get_text(strip=True)
        names.append(name)
        try:
            review = elem.find(class_="content").get_text(strip=True)
            reviews.append(review)
        except:
            continue
            
    return names, reviews


def scrape_reviews(urls: str, driver_path: str = None) -> BeautifulSoup:
    """ Scrape all reviews from a single movie on IMDB and return a soup instance
    
    It needs to use Chrome driver as the "load more" button should be 
    triggered multiple times in order to correctly load all reviews. 
    
    Parameters:
    ----------
    driver_path : str, default None
        path to your chromedriver
        
    url : str
        The url to scrape
        
    Returns:
    --------
    soup : BeautifulSoup
        A BeautifulSoup instance of the entire page
    
    """
    
    all_reviews = {title: [] for title in urls}
    
    for movie_title in urls:
        
        # Instantiate driver
        driver = webdriver.Chrome(executable_path=driver_path)
        wait = WebDriverWait(driver, 10)

        # Prepare page
        driver.get(urls[movie_title])
        soup = BeautifulSoup(driver.page_source, 'lxml')
        time.sleep(3)

        # Press "load more" until the full page is loaded
        # Curtosey of: https://stackoverflow.com/questions/55527423/why-do-i-only-get-first-page-data-when-using-selenium
        while True:
            try:
                driver.find_element_by_css_selector("button#load-more-trigger").click()
                wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR,".ipl-load-more__load-indicator")))
                soup = BeautifulSoup(driver.page_source, 'lxml')
            except Exception:break

        driver.quit()

        # Extract all reviews and their titles from the full page
        titles, reviews = extract_reviews(soup)
        all_reviews[movie_title] = [(review_title, movie_review) for review_title, movie_review in zip(titles, reviews)]

    return all_reviews

**Get all titles**

In [2]:
disney = get_all_disney_titles()
pixar = get_all_pixar_titles()

**Get imdb urls**

In [3]:
# disney_urls = scrape_imdb_urls(disney)
pixar_urls = scrape_imdb_urls(pixar)

22it [00:14,  1.53it/s]


In [18]:
with open('data/disney_urls.json', 'w') as f:
    json.dump(disney_urls, f)
    
with open('data/pixar_urls.json', 'w') as f:
    json.dump(pixar_urls, f)

**Scrape reviews**

In [5]:
pixar_reviews = scrape_reviews(pixar_urls, 
                               '/home/CORP.VANSPAENDONCKGROEP.NL/maarten.grootendorst/Documents/Disney-NER/chromedriver')

with open('data/pixar_reviews.json', 'w') as f:
    json.dump(pixar_reviews, f)

In [None]:
disney_reviews = scrape_reviews(disney_urls, 
                                '/home/CORP.VANSPAENDONCKGROEP.NL/maarten.grootendorst/Documents/Disney-NER/chromedriver')

with open('data/disney_reviews.json', 'w') as f:
    json.dump(disney_reviews, f)

# Backup

In [76]:
all_reviews = {title: [] for title in urls}
for movie_title in list(urls.keys())[:10]:
    titles, reviews = scrape_reviews(urls[movie_title], 
                                     '/home/CORP.VANSPAENDONCKGROEP.NL/maarten.grootendorst/Documents/Disney-NER/chromedriver')
    all_reviews[movie_title] = [(review_title, movie_review) for review_title, movie_review in zip(titles, reviews)]

In [36]:
# URL = "https://www.imdb.com/title/tt2294629/reviews"  # frozen
URL = "https://www.imdb.com/title/tt2380307/reviews"  # coco


driver = webdriver.Chrome()
wait = WebDriverWait(driver,10)

driver.get(URL)
soup = BeautifulSoup(driver.page_source, 'lxml')

while True:
    try:
        driver.find_element_by_css_selector("button#load-more-trigger").click()
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR,".ipl-load-more__load-indicator")))
        soup = BeautifulSoup(driver.page_source, 'lxml')
    except Exception:break

driver.quit()

In [29]:
# driver.find_element_by_class_name("ipl-expander").click()

### Clean reviews

In [37]:
names = []
reviews = []
for elem in soup.find_all(class_='imdb-user-review'):
    name = elem.find(class_='title').get_text(strip=True)
    names.append(name)
    try:
        review = elem.find(class_="content").get_text(strip=True)
        reviews.append(review)
    except:
        continue

In [38]:
len(names)

1095

In [39]:
len(reviews)

1095

In [40]:
import json

with open('coco_reviews.json', 'w') as f:
    json.dump(reviews, f)
    
with open('coco_review_titles.json', 'w') as f:
    json.dump(names, f)

## Single Movie TF-IDF

In [7]:
cleaned_reviews = [x.lower() for x in reviews]

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1, 1))
X = tfidf.fit_transform(cleaned_reviews)

In [13]:
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(X.toarray()).flatten()[::-1]

n = 10
top_n = feature_array[tfidf_sorting][:n]

In [14]:
top_n

array(['enough', 'me', 'had', 'smile', 'considering', 'reminded',
       'sadness', 'related', 'tissues', 'grandma'], dtype='<U469')

## Multiple Movies TF-IDF

In [4]:
with open('coco_reviews.json') as f:
    coco = json.load(f)
    
with open('frozen_reviews.json') as f:
    frozen = json.load(f)

In [6]:
coco = " ".join(coco)
frozen = " ".join(frozen)

In [88]:
count = CountVectorizer(ngram_range=(1, 3), stop_words="english").fit([coco, frozen])

In [89]:
t = count.transform([coco, frozen])

In [90]:
t = np.array(t.todense()).T

In [91]:
t.shape

(377711, 2)

In [92]:
w = t.sum(axis=0)
m = 3000
tf = np.divide(t,w)

In [93]:
sum_tij = np.array(t.sum(axis=1)).T

In [94]:
idf = np.log(np.divide(m, sum_tij)).reshape(-1, 1)

In [95]:
tf_idf = np.multiply(tf, idf)

In [96]:
unique_sum = tf_idf.sum(axis=1).reshape(-1, 1)

In [97]:
unique = np.divide(tf_idf, unique_sum)

In [98]:
result = pd.DataFrame(tf_idf, index=count.get_feature_names(), columns=["Coco", "Frozen"])

In [99]:
result.sort_values("Coco", ascending=False).head(20)

Unnamed: 0,Coco,Frozen
coco,0.004372,2e-06
miguel,0.004176,0.0
dead,0.003984,9.6e-05
pixar,0.003784,0.000339
mexican,0.003375,4e-06
family,0.003152,0.000458
culture,0.003107,8.5e-05
permalink,0.002899,0.000847
helpful sign,0.002899,0.000847
sign vote permalink,0.002899,0.000847


In [100]:
result.sort_values("Frozen", ascending=False).head(20)

Unnamed: 0,Coco,Frozen
sister,5e-05,0.00233
ice,0.0,0.00221
anna,0.0,0.002154
olaf,0.000166,0.002149
snow,1.4e-05,0.002091
let,0.000373,0.002089
powers,1.4e-05,0.002083
songs,0.000784,0.002051
hans,7e-06,0.002025
snowman,7e-06,0.00201


### NER

In [55]:
# from transformers import BertTokenizer, BertModel
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch

In [56]:
# model_en = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
# tokenizer_en = AutoTokenizer.from_pretrained("bert-base-cased")

In [3]:
import json

In [57]:
# with open('data.json', 'w') as f:
#     reviews = json.loads(f)

In [None]:
label_list = [
    "O",       # Outside of a named entity
    "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
    "I-MISC",  # Miscellaneous entity
    "B-PER",   # Beginning of a person's name right after another person's name
    "I-PER",   # Person's name
    "B-ORG",   # Beginning of an organisation right after another organisation
    "I-ORG",   # Organisation
    "B-LOC",   # Beginning of a location right after another location
    "I-LOC"    # Location
]

In [None]:
def predict(tokenizer, model, sequence):
    # Bit of a hack to get the tokens with the special tokens
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    outputs = model(inputs)[0]
    predictions = torch.argmax(outputs, dim=2)
    
    return sequence

In [None]:
reviews[0]

In [None]:
predict(tokenizer_en, model_en, sequence)

### Sentiment
https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you