In [1]:
import requests, time, json
from bs4 import BeautifulSoup as bs4

# MovieRec4Parents

## Problem Statement:
#### Busy parents need a way to choose movies for their family that all members of the family will enjoy without exposing any of their children to any objectionable material. Yet, every family has its own standards for determining what is objectionable and what is not. Common Sense Media has a great website, but it takes a while to page through all of the movies at this site and get to all of the valuable material. 
#### To solve this problem, I will build a content-based recommender system that will give movie recommendations in response to family members entering the name of a single movie that is similar to a movie they want to see. It will search through a list of thousands of movies that have already been curated by Common Sense Media and return a list of similar movies that the entire family will be able to enjoy. It will warn parents of any potentially objectionable material in the movie and then direct parents to the appropriate web pages for a complete and hopefully spoiler-free review of the recommended movies.

## Get URLs of Individual Movies

### Create soup object and prepare to scrape

In [2]:
# Landing page of Common Sense Media's movie review listing.
url = 'https://www.commonsensemedia.org/reviews/category/movie'

In [3]:
# Fetch the listing page. requests never times out by default, so pass an
# explicit timeout (seconds) to avoid hanging the kernel on a stalled connection.
res = requests.get(url, timeout=30)

In [4]:
# HTTP status code of the listing request; 200 means the page was fetched successfully.
res.status_code

200

In [5]:
# Parse the raw response bytes into a navigable tree using the lxml parser.
soup = bs4(res.content, features='lxml')

### Scrape Common Sense Media for movie URLs

In [6]:
# Accumulator for the absolute URLs of individual movie review pages.
movies_urls = list()

In [7]:
# Take the first <strong class="field-content"> element on the page and read
# the href of the anchor nested inside it.
link = (
    soup
    .find('strong', attrs={'class': 'field-content'})
    .find('a')
    .get('href')
)

In [8]:
# Show the extracted href; it is a site-relative path, not a full URL.
link

'/movie-reviews/siberia'

In [9]:
# The href is site-relative, so prefix the site root to store an absolute URL.
movies_urls += ['https://www.commonsensemedia.org' + link]

In [10]:
# Confirm the list now holds the single absolute URL built from `link`.
movies_urls

['https://www.commonsensemedia.org/movie-reviews/siberia']

In [11]:
# Generalize the single-link experiment into a function that walks every
# listing page. Results accumulate in these two module-level lists.
movies_urls = []
missed_urls_pages = []
def get_movie_urls(num_pages, delay=.1, timeout=30):
    """Collect movie review URLs from Common Sense Media's listing pages.

    Requests listing pages ``0 .. num_pages - 1``. For every
    ``<strong class="field-content">`` element on a page, the href of its
    nested anchor is prefixed with the site root and appended to the
    module-level ``movies_urls`` list. A page that cannot be downloaded
    (request error, timeout, or a non-200 status) has its listing URL
    appended to the module-level ``missed_urls_pages`` list so it can be
    retried later.

    Parameters
    ----------
    num_pages : int
        Number of listing pages to request, starting at page 0.
    delay : float, optional
        Seconds to pause after each page request.
    timeout : float, optional
        Seconds to wait for the server before giving up on a page.

    Returns
    -------
    tuple of (list, list)
        The module-level ``movies_urls`` and ``missed_urls_pages`` lists.
    """
    site_root = 'https://www.commonsensemedia.org'
    for page in range(num_pages):
        url = "https://www.commonsensemedia.org/movie-reviews?page={}".format(page)
        print(url)
        try:
            res = requests.get(url, timeout=timeout)
            page_ok = res.status_code == 200
        except requests.RequestException:
            # A network failure should cost one page, not the whole scrape.
            page_ok = False

        if page_ok:
            # Only parse the body once we know the request succeeded.
            soup = bs4(res.content, 'lxml')
            for movie in soup.find_all('strong', {'class': 'field-content'}):
                anchor = movie.find('a')
                href = anchor.attrs.get('href') if anchor is not None else None
                # Skip elements without a usable link instead of storing
                # a bogus URL ending in "None".
                if href:
                    movies_urls.append(site_root + str(href))
        else:
            # Record the listing page itself so it can be re-requested.
            missed_urls_pages.append(url)

        # Throttle between HTTP requests, not between list appends.
        time.sleep(delay)
    return movies_urls, missed_urls_pages

In [1]:
# Leave this cell commented out; scraping the URLs from
# Common Sense Media's website takes about 35 minutes.
# movies_urls, missed_urls_pages = get_movie_urls(437)

In [14]:
missed_urls_pages  # An empty list means no listing pages were missed.

[]

In [15]:
# Preview the first five scraped URLs rather than dumping the whole list.
movies_urls[:5]

['https://www.commonsensemedia.org/movie-reviews/siberia',
 'https://www.commonsensemedia.org/movie-reviews/shock-and-awe',
 'https://www.commonsensemedia.org/movie-reviews/eighth-grade',
 'https://www.commonsensemedia.org/movie-reviews/skyscraper',
 'https://www.commonsensemedia.org/movie-reviews/hotel-transylvania-3-summer-vacation']

In [16]:
# Total number of movie URLs collected.
len(movies_urls)

8892

In [17]:
# Persist the scraped URLs for the next notebook. The scrape call above is
# normally left commented out, so after "Restart & Run All" movies_urls is
# empty; skip the write in that case instead of overwriting the saved file
# with an empty list.
if movies_urls:
    with open('data/movies_urls.json', 'w') as output:
        json.dump(movies_urls, output)
else:
    print('movies_urls is empty; data/movies_urls.json left unchanged.')

#### List of movie URLs saved as movies_urls.json for use in the next notebook for scraping...

### If Necessary:  Selenium to find hidden data

In [None]:
# import os
# from selenium import webdriver
# 
# chromedriver = '/Users/yukiharuhadeishi/Downloads/chromedriver'
# os.environ["webdriver.chrome.driver"] = chromedriver
# driver = webdriver.Chrome(chromedriver)

In [None]:
# driver.get(url)