### Import Libraries

In [1]:
import configparser as configparser
import pandas as pd
import pymongo

## Parsing INI File

### Funtion: To parse INI file

In [2]:
def parse_ini(section: str) -> dict:
    """
    This function parses ini file for configuration details
    :param section: section to read from ini
    :return: Dictionary of config details
    """
    config = dict()
    parser = configparser.ConfigParser()
    parser.read("imdb_database.ini")
    if parser.has_section(section):
        config_items = parser.items(section)
        for item in config_items:
            config[item[0]] = item[1]
    return config

In [3]:
mongo_config = parse_ini("mongodb")
mongo_config

{'host': 'localhost', 'database': 'imdb', 'port': '27017'}

## Adding kmeansNorm field to documents with normalized startYear and avgRating

### Connecting to IMDB MongoDB

In [4]:
connection_string = "mongodb://" + mongo_config['host'] + ":" + mongo_config['port']
mongo_client = pymongo.MongoClient(connection_string)
imdb = mongo_client[mongo_config['database']]
imdb

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'imdb')

In [5]:
movie_collection = imdb['Movies']

### Fetching documents whose type='movie' with numVotes > 10,000 and both startYear & avgRating exist

In [6]:
cursor = movie_collection.aggregate([
    {
        '$match': {
            '$and': [
                {
                    'type': 'movie', 
                    'startYear': {
                        '$exists': True
                    }, 
                    'avgRating': {
                        '$exists': True
                    }, 
                    'numVotes': {
                        '$gt': 10000
                    }
                }
            ]
        }
    }
])

movies = list(cursor)

In [7]:
len(movies)

10379

### Get Max & Min of startYear and avgRating fields

In [8]:
cursor = movie_collection.aggregate(
    [
        {
            "$match" : {
                "$and" : [
                    {
                        "type" : "movie",
                        "startYear" : {
                            "$exists" : True
                        },
                        "avgRating" : {
                            "$exists" : True
                        },
                        "numVotes" : {
                            "$gt" : 10000
                        }
                    }
                ]
            }
        }, 
        {
            "$facet" : {
                "minYear" : [
                    {
                        "$sort" : { "startYear" : 1 }
                    },
                    { "$limit" : 1 }
                ],
                "maxYear" : [
                    {
                        "$sort" : { "startYear" : -1 }
                    },
                    { "$limit" : 1 }
                ],
                "minRating" : [
                    {
                        "$sort" : { "avgRating" : 1 }
                    },
                    { "$limit" : 1 }
                ],
                "maxRating" : [
                    {
                        "$sort" : { "avgRating" : -1 }
                    },
                    { "$limit" : 1 }
                ]
            }
        }, 
        {
            "$project" : {
                "minYear" : { "$first" : "$minYear.startYear" },
                "maxYear" : { "$first" : "$maxYear.startYear" },
                "minRating" : { "$first" : "$minRating.avgRating" },
                "maxRating" : { "$first" : "$maxRating.avgRating" }
            }
        }
    ]
)

bounds = list(cursor)

In [9]:
bounds

[{'minYear': 1915, 'maxYear': 2023, 'minRating': 1.0, 'maxRating': 9.6}]

### Creating UpdateOne Requests with kmeansNorm for the above fetched documents

In [10]:
requests = []
for data in movies:
    id = data['_id']
    normalizedStartYear = (data['startYear'] - bounds[0]['minYear']) / (bounds[0]['maxYear'] - bounds[0]['minYear'])
    normalizedRating = (data['avgRating'] - bounds[0]['minRating']) / (bounds[0]['maxRating'] - bounds[0]['minRating'])
    filter = {'_id': id}
    update = dict()
    update['kmeansNorm'] = [normalizedStartYear, normalizedRating]
    if bool(update):
        requests += [pymongo.UpdateOne(filter=filter, update={'$set': update})]

requests

[UpdateOne({'_id': 4972}, {'$set': {'kmeansNorm': [0.0, 0.6046511627906977]}}, False, None, None, None),
 UpdateOne({'_id': 6864}, {'$set': {'kmeansNorm': [0.009259259259259259, 0.7790697674418605]}}, False, None, None, None),
 UpdateOne({'_id': 9968}, {'$set': {'kmeansNorm': [0.037037037037037035, 0.7209302325581396]}}, False, None, None, None),
 UpdateOne({'_id': 10323}, {'$set': {'kmeansNorm': [0.046296296296296294, 0.813953488372093]}}, False, None, None, None),
 UpdateOne({'_id': 12349}, {'$set': {'kmeansNorm': [0.05555555555555555, 0.8488372093023256]}}, False, None, None, None),
 UpdateOne({'_id': 12364}, {'$set': {'kmeansNorm': [0.05555555555555555, 0.813953488372093]}}, False, None, None, None),
 UpdateOne({'_id': 13257}, {'$set': {'kmeansNorm': [0.06481481481481481, 0.7674418604651163]}}, False, None, None, None),
 UpdateOne({'_id': 13427}, {'$set': {'kmeansNorm': [0.06481481481481481, 0.7674418604651163]}}, False, None, None, None),
 UpdateOne({'_id': 13442}, {'$set': {'kmea

In [11]:
len(requests)

10379

### Bulk writing Update operations to Movies

In [12]:
output = movie_collection.bulk_write(requests)

In [13]:
output.bulk_api_result

{'writeErrors': [],
 'writeConcernErrors': [],
 'nInserted': 0,
 'nUpserted': 0,
 'nMatched': 10379,
 'nModified': 10379,
 'nRemoved': 0,
 'upserted': []}

In [14]:

cursor = movie_collection.aggregate([
        {
            "$match" : {
                "$and" : [
                    {
                        'kmeansNorm': { '$exists': True }
                    }
                ]
            }
        }
    ]
)
movies = list(cursor)
movies

[{'_id': 4972,
  'type': 'movie',
  'title': 'The Birth of a Nation',
  'originalTitle': 'The Birth of a Nation',
  'startYear': 1915,
  'runtime': 195,
  'avgRating': 6.2,
  'numVotes': 25191,
  'genres': ['Drama', 'War'],
  'actors': [{'actor': 1273, 'roles': ["Elsie - Stoneman's Daughter"]},
   {'actor': 178270, 'roles': ['Margaret Cameron - The Elder Sister']},
   {'actor': 550615, 'roles': ['Flora Cameron - The Pet Sister']},
   {'actor': 910400, 'roles': ['Col. Ben Cameron aka The Little Colonel']}],
  'directors': [428],
  'writers': [428, 228746, 940488],
  'kmeansNorm': [0.0, 0.6046511627906977]},
 {'_id': 6864,
  'type': 'movie',
  'title': 'Intolerance',
  'originalTitle': "Intolerance: Love's Struggle Throughout the Ages",
  'startYear': 1916,
  'runtime': 163,
  'avgRating': 7.7,
  'numVotes': 15898,
  'genres': ['Drama', 'History'],
  'actors': [{'actor': 1273,
    'roles': ['The Woman Who Rocks the Cradle', 'Eternal Mother']},
   {'actor': 366008, 'roles': ['The Boy']},
