In [None]:
import pymongo
from pymongo import MongoClient
import pprint
from IPython.display import clear_output

# Before running this cell, substitute XXXX with the connection URI copied
# from the Atlas UI.
client = pymongo.MongoClient(XXXX)

# Separator between the individual entries of the raw comma-joined string fields.
_LIST_SEPARATOR = ", "

# (output array field, source string field) pairs: each source holds several
# values joined into one string and is split into a proper array.
_SPLIT_FIELDS = (
    ('directors', "$director"),
    ('actors', "$cast"),
    ('writers', "$writer"),
    ('genres', "$genre"),
    ('languages', "$language"),
    ('countries', "$country"),
)

# Projection spec, built up in the intended output field order.
# A value of 1 includes the field unchanged; a "$name" string copies the value
# of an existing field under the new key (i.e. renames it).
_projection = {'title': 1, 'year': 1}
_projection.update(
    (target, {'$split': [source, _LIST_SEPARATOR]})
    for target, source in _SPLIT_FIELDS
)
_projection.update({
    'plot': 1,
    # Rename to camel case to match the other field names.
    'fullPlot': "$fullplot",
    # The source "rating" field holds the MPAA rating (PG, PG-13, R, ...);
    # naming it "rated" avoids confusion with critic/viewer review scores.
    'rated': "$rating",
    'released': 1,
    'runtime': 1,
    'poster': 1,
    # Gather the separate imdb* fields into a single embedded document
    # (a dictionary, in Python terms).
    'imdb': {
        'id': "$imdbID",
        'rating': "$imdbRating",
        'votes': "$imdbVotes",
    },
    'metacritic': 1,
    'awards': 1,
    'type': 1,
    'lastUpdated': "$lastupdated",
})

pipeline = [
    # Only work on the first 100 documents.
    {'$limit': 100},
    # Pass the documents through, cleaning up / reshaping selected fields so
    # they suit later analysis and application building.
    {'$project': _projection},
    # Write the reshaped documents to the movies_scratch collection.
    {'$out': "movies_scratch"},
]

clear_output()

# A pipeline that ends in `$out` writes its documents into the target
# collection and returns an empty cursor, so printing the aggregate() result
# would always show `[]`. Run the aggregation for its side effect, then print
# the documents it wrote.
mflix = client.mflix
mflix.movies_initial.aggregate(pipeline)

# Read the target collection name back from the `$out` stage so this stays in
# sync with the pipeline definition.
out_collection = pipeline[-1]['$out']
pprint.pprint(list(mflix[out_collection].find()))