In [1]:
import requests, time, json
from bs4 import BeautifulSoup as bs4

# MovieRec4Parents

## Problem Statement:
#### Busy parents need a way to choose movies for their family that all members of the family will enjoy without exposing their children to objectionable material. Yet, every family has its own standards for determining what is or is not objectionable. Common Sense Media has a great website, but it takes precious time to page through all the movies at this site and glean all the valuable information.

#### To help parents solve this problem, MovieRec4Parents is a content-based recommender system that produces a list of movies similar to a single movie entered by the user. Movie similarity is determined by comparing the language Common Sense Media uses to describe the movies, as described below (see Notebook 5.1). If parents have a few minutes to answer several questions, MovieRec4Parents will filter this list of movies according to individual family standards. By doing so, parents will be provided with entertainment suggestions that the entire family will be able to enjoy.

## Get URLs of Individual Movies

### Create soup object and prepare to scrape

In [2]:
# Movie-review listing page used for the initial exploration in the next few cells
url = "https://www.commonsensemedia.org/reviews/category/movie"

In [3]:
# Fetch the listing page over HTTP
res = requests.get(url)

In [4]:
# Show the HTTP status of the response (200 means the request succeeded)
res.status_code

200

In [5]:
# Parse the response body with the lxml parser (bs4 is the BeautifulSoup alias from the import cell)
soup = bs4(res.content, 'lxml')

### Scrape Common Sense Media for movie URLs

In [6]:
# Accumulator for absolute movie-review URLs
movies_urls = []

In [7]:
# Take the first 'field-content' title element and read the href of its anchor
link = soup.find('strong', attrs={'class': 'field-content'}).find('a').get('href')

In [8]:
# Display the extracted href (a site-relative path)
link

'/movie-reviews/siberia'

In [9]:
# The href is site-relative, so prefix the domain to build an absolute URL
movies_urls.append('https://www.commonsensemedia.org' + link)

In [10]:
# Display the list to confirm the absolute URL was built as expected
movies_urls

['https://www.commonsensemedia.org/movie-reviews/siberia']

In [11]:
# Module-level accumulators filled by get_movie_urls(); re-run this cell to reset them.
movies_urls = []
missed_urls_pages = []
def get_movie_urls(num_pages):
    """Collect movie-review URLs from Common Sense Media listing pages.

    Requests listing pages ``0 .. num_pages - 1``. For every page answered
    with HTTP 200, the absolute URL of each review link found on the page is
    appended to the module-level ``movies_urls`` list. The URL of any page
    answered with another status is appended to the module-level
    ``missed_urls_pages`` list so it can be retried later.

    Both lists are extended in place, so calling the function twice without
    re-running this cell accumulates duplicates.

    Parameters
    ----------
    num_pages : int
        Number of listing pages to request.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(movies_urls, missed_urls_pages)`` -- the same module-level lists.
    """
    base_url = 'https://www.commonsensemedia.org'
    for page in range(num_pages):
        url = "https://www.commonsensemedia.org/movie-reviews?page={}".format(page)
        print(url)
        res = requests.get(url)
        if res.status_code != 200:
            # Record the listing page itself; there is no parsed movie to refer to here.
            missed_urls_pages.append(url)
            continue
        # Only parse the body once the request is known to have succeeded.
        soup = bs4(res.content, 'lxml')
        for movie in soup.find_all('strong', {'class': 'field-content'}):
            anchor = movie.find('a')
            href = anchor.attrs.get('href') if anchor is not None else None
            if href is None:
                # Title element without a usable link: skip instead of crashing
                # or storing a bogus '...None' URL.
                continue
            movies_urls.append(base_url + str(href))
            # Pause after each stored link; this throttles the overall crawl rate.
            time.sleep(.1)
    return movies_urls, missed_urls_pages

In [1]:
# Scrapes movie-review URLs from Common Sense Media's website.
# ***Do not run this cell*** unless you want to rescrape. It will take about 35 minutes.
# 437 is the number of listing pages requested (pages 0-436).
# NOTE(review): presumably the site's page count at scrape time -- confirm before rescraping.
movies_urls, missed_urls_pages = get_movie_urls(437)

In [14]:
# Pages recorded as missed by get_movie_urls
missed_urls_pages  # No missing pages when run 7/14/2018

[]

In [15]:
# Preview the first five scraped URLs
movies_urls[:5]

['https://www.commonsensemedia.org/movie-reviews/siberia',
 'https://www.commonsensemedia.org/movie-reviews/shock-and-awe',
 'https://www.commonsensemedia.org/movie-reviews/eighth-grade',
 'https://www.commonsensemedia.org/movie-reviews/skyscraper',
 'https://www.commonsensemedia.org/movie-reviews/hotel-transylvania-3-summer-vacation']

In [16]:
# Total number of movie URLs collected
len(movies_urls)

8892

In [17]:
# Serialize the URL list to JSON text and write it to disk in one call
with open('data/movies_urls.json', 'w') as output:
    output.write(json.dumps(movies_urls))

#### List of movie URLs saved as data/movies_urls.json for use in the next notebook for scraping...