In [2]:
import pprint
import pymongo
from pymongo import MongoClient
from IPython.display import clear_output


# Replace XXXX with your connection URI from the Atlas UI
client = MongoClient(XXX)

pipeline = [
    {
        '$limit': 100
    },
    {
        # addFields will add the specified fields
        # if the field already exists it will overwrite it which is why we implement it with lastupdated
        # the problem with lastupdated is that its too specified when it comes to time
        # for ex you will find " lastupdated: 12:30:00.4293491 " with ms which is redundant and will interrupt our next operation on it
        # so we will split the value of lastupdated to an array such that the part of ms will be alone
        # for ex, on the previous example we would have: [12:30:00, 4293491] (splitting based on '.')
        # then we will assign lastupdated the value at the index 0
        '$addFields':{
            'lastupdated':{
                '$arrayElemAt':[
                    {'$split':['$lastupdated', '.']},
                    0
                ]
            }
        }
    },
    {
        '$project': 
        {
            '__id': 1,
            'title':1,
            'year': 1,
            'rated': '$rating',
            'runtime': 1, # 1 means we keep the same value
            'genres': { '$split': ["$genre" , ","]},
            'directors': { '$split': [ "$director" , ","]},
            'cast' : {'$split': ["$cast", ","]},
            'countries': { '$split' : ["$country", ","]},
            'released' :{
                # '$cond' is used to specify a conditionnal operation to be made
                '$cond':{
                    'if':{ '$ne': ['$released','']},
                    # if ( value of released !== '') 'ne' means not equal
                    'then': {
                        '$dateFromString': {
                            'dateString': '$released'
                        }
                    },
                    # $dateFromString is used to format a date string according to the specified format
                    # $dateFromString makes it mendatory to specify the "dateString" format 
                    # it will return : year-Month-Day Hour:Minute:Second GMT/other
                    # there are other optional formats
                    'else': ""
                    # else we will be returning the empty string (cuz no relased value is specified)
                }
            },
            'fullPlot': '$fullplot',
            'plot':1,

            # we want to update the time to a new name (lastUpdated)
            # and also we want to have a real time stem but not just a string representation of the time
            # so we do the following :

            'lastUpdated': {
                '$cond':{
                    'if': {'$ne': ['$lastupdated', '']},
                    'then': {
                        '$dateFromString':{
                            'dateString': '$lastupdated',
                            'timezone': 'America/New_York'
                        }
                    },
                    'else': ''
                }
                # if(lastupdated !== ''){
                # lastupdated = year-Month-Day Hour:Minute:Second
                # timezone is used to format the date according to the place, here it is NewYork/USA
                # return lastupdated
                # } else{
                # return ''
                # }
            },
            'type': 1,
            'imdb': {
                'id': '$imdbID',
                'rating': '$imdbRating',
                'votes': '$imdbVotes'
            },
            'awards':1
        }
    },
    {
        '$out': "Movies Updated"
    }
]

clear_output()

pprint.pprint(list(client.Movies.movies_collection.aggregate(pipeline)))


[]


In [1]:
# the python method to clean the data as done in the previous cell
import pymongo
import pprint
from pymongo import MongoClient
from IPython.display import clear_output
import re
from datetime import datetime

client = MongoClient('mongodb+srv://Mohcen:20042004@firstcluster.pgjysjd.mongodb.net/?retryWrites=true&w=majority')

runtime_regex = re.compile(r'([0-9]+) min')

for movie in client.Movies.movies_collection.find({}).limit(100):
    
    fields_to_set = {}
    fields_to_unset = {}

    # we first loop through empty values 
    # we copy the document to not make changes on the original document
    # at the end , we will be posting our changes in a new document
    for key, value in movie.copy().items():
        if value == "" or value ==[""]:
            del movie[key]
            fields_to_unset[key]= ""
    
    # deal with keys that need to be in plural rather than in singular
    if 'director' in movie:
        fields_to_unset['director'] = ""
        fields_to_set['directors'] = movie['director'].split(', ')
    
    if 'genre' in movie: 
        fields_to_unset['genre'] = ""
        fields_to_set['genres'] = movie['genre'].split(', ')
    
    if 'country' in movie:
        fields_to_unset['country'] = ""
        fields_to_set['countries'] = movie['country'].split(', ')
    
    # deal with keys that need to have their names changed
    if 'cast' in movie:
        fields_to_set['cast'] = movie['cast'].split(", ")
        
    if 'rating' in movie:
        fields_to_set['rated'] = movie['rating']   

    if 'fullplot' in movie:
        fields_to_set['fullPlot'] = movie['fullplot'] 

    
    # deal with id object
    imdb = {}
    if 'imdbID' in movie:
        fields_to_unset['imdbID'] = ""
        imdb['id'] = movie['imdbID']
    if 'imdbRating' in movie:
        fields_to_unset['imdbRating'] = ""
        imdb['rating'] = movie['imdbRating']
    if 'imdbVotes' in movie:
        fields_to_unset['imdbVotes'] = ""
        imdb['votes'] = movie['imdbVotes']
    if imdb:
        fields_to_set['idmb'] = imdb
    
    # deal with keys that their values require date/time formatting
    if 'released' in movie:
        # datetime.strptime() we will take the string that contains the date and the proper format according to the string
        fields_to_set['released'] = datetime.strptime(movie['released'], "%Y-%m-%d")  
        fields_to_unset['released'] = ""

    if 'lastupdated' in movie:
        # we will only take the substring because we already know that we don't need the millie seconds part
        fields_to_set['lastUpdated'] = datetime.strptime(movie['lastupdated'][0:19], "%Y-%m-%d %H:%M:%S") 
        fields_to_unset['lastupdated'] = "" 

    # deal with runtime key as it is better to convert it to an integer
    if 'runtime' in movie:
        match = runtime_regex.match(movie['runtime'])
        # see if there is any match with the given value and the regex [XXX min]
        
        if match:
            # basically: match.group(1) will be returning the first matching subgroup
            # in our case we wanted something like (XXX min) then we would have just: XXX
            # if we were to do : match.group(2) we would have: min
            # the convertion to int is obvious because thats what all the operation is about
            fields_to_set['runtime'] = int(match.group(1))
    
    update_doc = {}
    if fields_to_set:
        update_doc['$set'] = fields_to_set
    if fields_to_unset:
        update_doc['$unset'] = fields_to_unset

    # update_one() takes a filter as 1st arg and the updated document as 2nd arg
    client.Movies.movies_collection.update_one({'_id': movie['_id']}, update_doc)


In [17]:
# the python method but with more effeciency 
import pymongo
import pprint
from pymongo import MongoClient , UpdateOne
from IPython.display import clear_output
import re
from datetime import datetime

client = MongoClient('mongodb+srv://Mohcen:20042004@firstcluster.pgjysjd.mongodb.net/?retryWrites=true&w=majority')

runtime_regex = re.compile(r'([0-9]+) min')

batch_size = 100 # this is used to update only 100 documents at a time
updated = [] # list of all the updated documents

# rest of the code is the same as the one above

for movie in client.Movies.movies.find({}).limit(100):
    
    fields_to_set = {}
    fields_to_unset = {}

    for key, value in movie.copy().items():
        if value == "" or value ==[""]:
            del movie[key]
            fields_to_unset[key]= ""
    
   
    if 'director' in movie:
        fields_to_unset['director'] = ""
        fields_to_set['directors'] = movie['director'].split(', ')
    
    if 'genre' in movie: 
        fields_to_unset['genre'] = ""
        fields_to_set['genres'] = movie['genre'].split(', ')
    
    if 'country' in movie:
        fields_to_unset['country'] = ""
        fields_to_set['countries'] = movie['country'].split(', ')
    
    if 'cast' in movie:
        fields_to_set['cast'] = movie['cast'].split(", ")
        
    if 'rating' in movie:
        fields_to_set['rated'] = movie['rating']   
        fields_to_unset['rating'] = ""

    if 'fullplot' in movie:
        fields_to_set['fullPlot'] = movie['fullplot'] 
        fields_to_unset['fullplot'] = ""

    
    imdb = {}
    if 'imdbID' in movie:
        fields_to_unset['imdbID'] = ""
        imdb['id'] = movie['imdbID']
    if 'imdbRating' in movie:
        fields_to_unset['imdbRating'] = ""
        imdb['rating'] = movie['imdbRating']
    if 'imdbVotes' in movie:
        fields_to_unset['imdbVotes'] = ""
        imdb['votes'] = movie['imdbVotes']
    if imdb:
        fields_to_set['imdb'] = imdb
    
    if 'released' in movie:
        fields_to_set['released'] = datetime.strptime(movie['released'], "%Y-%m-%d")  
        # fields_to_unset['released'] = ""

    if 'lastupdated' in movie:
        fields_to_set['lastUpdated'] = datetime.strptime(movie['lastupdated'][0:19], "%Y-%m-%d %H:%M:%S") 
        fields_to_unset['lastupdated'] = "" 

    if 'runtime' in movie:
        match = runtime_regex.match(movie['runtime'])
         
        if match:
            fields_to_set['runtime'] = int(match.group(1))
    
    update_doc = {}
    if fields_to_set:
        update_doc['$set'] = fields_to_set
    if fields_to_unset:
        update_doc['$unset'] = fields_to_unset

    updated.append(UpdateOne({'_id': movie['_id']}, update_doc))
    # update one takes the filter to specify the appropriate document and takes also the new version of the same document

    if len(updated) == batch_size:
        client.Movies.movies.bulk_write(updated)
        # bulk_write() is a method that takes a bunch of write operations
        # write operations are for ex: insert_one, update_one ...
        # in this case we are passing 100 update_one operations
        # this method makes us query all the updates at once which is more effecient
        updated = []

# if there are  some remaining updates, we write them
if updated: 
    client.Movies.movies.bulk_write(updated)



AttributeError: 'list' object has no attribute 'split'

<pymongo.results.DeleteResult at 0x13e744c6b30>