In [1]:
import json
import os
import time

import requests
from bs4 import BeautifulSoup as bs4

# MovieRec4Parents

## Problem Statement:
#### Busy parents need a way to choose movies for their family that all members of the family will enjoy without exposing their children to objectionable material. Yet, every family has its own standards for determining what is or is not objectionable. Common Sense Media has a great website, but it takes precious time to page through all the movies at this site and glean all the valuable information.

#### To help parents solve this problem, MovieRec4Parents is a content-based recommender system that will produce a list of movies that are similar to a single movie entered by the user. Movie similarity is determined by comparing language used to describe the movies by Common Sense Media as described below (see Notebook 5.1). If parents have a few minutes to answer several questions, MovieRec4Parents will filter this list of movies according to individual family standards. By doing so, parents will be provided with entertainment suggestions that the entire family will be able to enjoy.

## Get URLs of Individual Movies

### Create soup object and prepare to scrape

In [2]:
# Movie-review listing page used for the one-off exploratory request below.
url = "https://www.commonsensemedia.org/reviews/category/movie"

In [3]:
# Fetch the listing page. No timeout is passed, so a stalled connection
# would block this cell indefinitely.
res = requests.get(url)

In [4]:
# Confirm the request succeeded (200 = OK) before parsing the body.
res.status_code

200

In [5]:
# Parse the response body with BeautifulSoup using the lxml parser.
soup = bs4(res.content, 'lxml')

### Scrape Common Sense Media for movie URLs

In [6]:
# Accumulator for the absolute URLs of individual movie review pages.
movies_urls = []

In [7]:
# Take the first title element on the page, then read the href of the
# anchor nested inside it.
first_title = soup.find('strong', {'class': 'field-content'})
first_anchor = first_title.find('a')
link = first_anchor.attrs.get('href')

In [8]:
# Inspect the extracted href: it is site-relative, so the domain still has
# to be prepended.
link

'/movie-reviews/the-music-of-silence'

In [9]:
# Prefix the site root to turn the relative href into an absolute URL.
movies_urls.append('https://www.commonsensemedia.org' + link)

In [10]:
# Check that the list now holds the single absolute URL built above.
movies_urls

['https://www.commonsensemedia.org/movie-reviews/the-music-of-silence']

In [19]:
movies_urls = []
missed_urls_pages = []
def get_movie_urls(num_pages, delay=2.0, timeout=30):
    """Collect movie review URLs from the Common Sense Media listing pages.

    Requests listing pages ``0 .. num_pages - 1`` and appends the absolute URL
    of every linked review to the module-level ``movies_urls`` list. A page
    that cannot be fetched (connection error, timeout, or a non-200 status)
    is recorded in the module-level ``missed_urls_pages`` list so the run can
    continue and the page can be retried later.

    Both lists are only ever appended to: re-run the two assignments above
    this function before calling it again, otherwise URLs are duplicated.

    Parameters
    ----------
    num_pages : int
        Number of listing pages to request, starting at page 0.
    delay : float, optional
        Seconds to wait between consecutive page requests so the site is not
        hammered. Pass 0 to disable the pause.
    timeout : float, optional
        Seconds to wait for the server before giving up on a page.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(movies_urls, missed_urls_pages)`` - the scraped review URLs and the
        URLs of the listing pages that could not be fetched.
    """
    base_url = 'https://www.commonsensemedia.org'
    for page in range(num_pages):
        url = "https://www.commonsensemedia.org/movie-reviews?page={}".format(page)
        print(url)
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Network failure or timeout: treat it like any other missed page.
            res = None
        if res is not None and res.status_code == 200:
            # Only parse bodies of successful responses.
            soup = bs4(res.content, 'lxml')
            for movie in soup.find_all('strong', {'class': 'field-content'}):
                anchor = movie.find('a')
                href = anchor.attrs.get('href') if anchor is not None else None
                # Skip titles without a usable link instead of crashing or
                # storing a bogus URL ending in "None".
                if href:
                    movies_urls.append(base_url + href)
        else:
            # Record the listing page itself so it can be retried.
            missed_urls_pages.append(url)
        # Pause once per request (not once per link), and not after the last page.
        if delay and delay > 0 and page + 1 < num_pages:
            time.sleep(delay)
    return movies_urls, missed_urls_pages

In [20]:
# Scrapes movie URLs from Common Sense Media's website.
# ***Do not run this cell*** unless you want to rescrape. It will take about 35 minutes.
# 439 is the number of listing pages requested (pages 0 through 438).
movies_urls, missed_urls_pages = get_movie_urls(439)

https://www.commonsensemedia.org/movie-reviews?page=0
https://www.commonsensemedia.org/movie-reviews?page=1
https://www.commonsensemedia.org/movie-reviews?page=2
https://www.commonsensemedia.org/movie-reviews?page=3
https://www.commonsensemedia.org/movie-reviews?page=4
https://www.commonsensemedia.org/movie-reviews?page=5
https://www.commonsensemedia.org/movie-reviews?page=6
https://www.commonsensemedia.org/movie-reviews?page=7
https://www.commonsensemedia.org/movie-reviews?page=8
https://www.commonsensemedia.org/movie-reviews?page=9
https://www.commonsensemedia.org/movie-reviews?page=10
https://www.commonsensemedia.org/movie-reviews?page=11
https://www.commonsensemedia.org/movie-reviews?page=12
https://www.commonsensemedia.org/movie-reviews?page=13
https://www.commonsensemedia.org/movie-reviews?page=14
https://www.commonsensemedia.org/movie-reviews?page=15
https://www.commonsensemedia.org/movie-reviews?page=16
https://www.commonsensemedia.org/movie-reviews?page=17
https://www.commonse

https://www.commonsensemedia.org/movie-reviews?page=149
https://www.commonsensemedia.org/movie-reviews?page=150
https://www.commonsensemedia.org/movie-reviews?page=151
https://www.commonsensemedia.org/movie-reviews?page=152
https://www.commonsensemedia.org/movie-reviews?page=153
https://www.commonsensemedia.org/movie-reviews?page=154
https://www.commonsensemedia.org/movie-reviews?page=155
https://www.commonsensemedia.org/movie-reviews?page=156
https://www.commonsensemedia.org/movie-reviews?page=157
https://www.commonsensemedia.org/movie-reviews?page=158
https://www.commonsensemedia.org/movie-reviews?page=159
https://www.commonsensemedia.org/movie-reviews?page=160
https://www.commonsensemedia.org/movie-reviews?page=161
https://www.commonsensemedia.org/movie-reviews?page=162
https://www.commonsensemedia.org/movie-reviews?page=163
https://www.commonsensemedia.org/movie-reviews?page=164
https://www.commonsensemedia.org/movie-reviews?page=165
https://www.commonsensemedia.org/movie-reviews?p

https://www.commonsensemedia.org/movie-reviews?page=296
https://www.commonsensemedia.org/movie-reviews?page=297
https://www.commonsensemedia.org/movie-reviews?page=298
https://www.commonsensemedia.org/movie-reviews?page=299
https://www.commonsensemedia.org/movie-reviews?page=300
https://www.commonsensemedia.org/movie-reviews?page=301
https://www.commonsensemedia.org/movie-reviews?page=302
https://www.commonsensemedia.org/movie-reviews?page=303
https://www.commonsensemedia.org/movie-reviews?page=304
https://www.commonsensemedia.org/movie-reviews?page=305
https://www.commonsensemedia.org/movie-reviews?page=306
https://www.commonsensemedia.org/movie-reviews?page=307
https://www.commonsensemedia.org/movie-reviews?page=308
https://www.commonsensemedia.org/movie-reviews?page=309
https://www.commonsensemedia.org/movie-reviews?page=310
https://www.commonsensemedia.org/movie-reviews?page=311
https://www.commonsensemedia.org/movie-reviews?page=312
https://www.commonsensemedia.org/movie-reviews?p

In [21]:
# Listing pages that could not be fetched during the scrape.
missed_urls_pages  # No missing pages when run 7/14/2018

[]

In [22]:
# Spot-check the first five scraped URLs rather than dumping the whole list.
movies_urls[:5]

['https://www.commonsensemedia.org/movie-reviews/the-music-of-silence',
 'https://www.commonsensemedia.org/movie-reviews/the-miseducation-of-cameron-post',
 'https://www.commonsensemedia.org/movie-reviews/the-spy-who-dumped-me',
 'https://www.commonsensemedia.org/movie-reviews/the-darkest-minds',
 'https://www.commonsensemedia.org/movie-reviews/like-father']

In [23]:
# Total number of review URLs collected.
len(movies_urls)

8765

In [24]:
# Persist the scraped URLs for use by the next notebook.
# open() does not create missing parent directories, so make sure data/
# exists first; otherwise the write fails with FileNotFoundError.
os.makedirs('data', exist_ok=True)
with open('data/movies_urls.json', 'w') as output:
    json.dump(movies_urls, output)

#### The list of movie URLs is saved as data/movies_urls.json for use in the next notebook for scraping...