### Import Libraries

In [1]:
import psycopg2
import psycopg2.extras
import configparser as configparser
import pandas as pd
import pymongo
import json
from decimal import Decimal

## Parsing INI File

### Funtion: To parse INI file

In [2]:
def parse_ini(section: str) -> dict:
    """
    This function parses ini file for configuration details
    :param section: section to read from ini
    :return: Dictionary of config details
    """
    config = dict()
    parser = configparser.ConfigParser()
    parser.read("imdb_database.ini")
    if parser.has_section(section):
        config_items = parser.items(section)
        for item in config_items:
            config[item[0]] = item[1]
    return config

In [3]:
sql_config = parse_ini("postgresql")
sql_config

{'host': 'localhost',
 'database': 'imdb',
 'user': 'postgres',
 'password': '1997',
 'port': '5432'}

In [4]:
mongo_config = parse_ini("mongodb")
mongo_config

{'host': 'localhost', 'database': 'imdb', 'port': '27017'}

## Fetching data from IMDB SQL DB

### Function: To connect to IMDB database and run select query to fetch from IMDB SQL Database

In [5]:
def run_sql(query: str):
    with psycopg2.connect(**sql_config) as conn:
        try:
            dataframe = pd.read_sql_query(query, conn)
            print("The query results loaded to Dataframe")
        except (Exception, psycopg2.DatabaseError) as error:
            print("SQL Exception:" + str(error))
            return None
    return dataframe

In [6]:
def fetch_sql(query: str):
    with psycopg2.connect(**sql_config) as conn:
        conn.autocommit = True
        cursor = conn.cursor()
        try:
            cursor.execute(query)
            results = cursor.fetchall()
            return results
        except (Exception, psycopg2.DatabaseError) as error:
            print("SQL Exception:" + str(error))
        finally:
            cursor.close()

### Fetching members table as JSON dictionary from IMDB SQL DB

In [7]:
query = '''SELECT id AS _id, name, birthyear, deathyear
	FROM member'''
members = fetch_sql(query)
members

[(1, 'Fred Astaire', 1899, 1987),
 (2, 'Lauren Bacall', 1924, 2014),
 (3, 'Brigitte Bardot', 1934, None),
 (4, 'John Belushi', 1949, 1982),
 (5, 'Ingmar Bergman', 1918, 2007),
 (6, 'Ingrid Bergman', 1915, 1982),
 (7, 'Humphrey Bogart', 1899, 1957),
 (8, 'Marlon Brando', 1924, 2004),
 (9, 'Richard Burton', 1925, 1984),
 (10, 'James Cagney', 1899, 1986),
 (11, 'Gary Cooper', 1901, 1961),
 (12, 'Bette Davis', 1908, 1989),
 (13, 'Doris Day', 1922, 2019),
 (14, 'Olivia de Havilland', 1916, 2020),
 (15, 'James Dean', 1931, 1955),
 (16, 'Georges Delerue', 1925, 1992),
 (17, 'Marlene Dietrich', 1901, 1992),
 (18, 'Kirk Douglas', 1916, 2020),
 (19, 'Federico Fellini', 1920, 1993),
 (20, 'Henry Fonda', 1905, 1982),
 (21, 'Joan Fontaine', 1917, 2013),
 (22, 'Clark Gable', 1901, 1960),
 (23, 'Judy Garland', 1922, 1969),
 (24, 'John Gielgud', 1904, 2000),
 (25, 'Jerry Goldsmith', 1929, 2004),
 (26, 'Cary Grant', 1904, 1986),
 (27, 'Alec Guinness', 1914, 2000),
 (28, 'Rita Hayworth', 1918, 1987),
 (

In [8]:
member_columns = ['_id', 'name', 'birthYear', 'deathYear']
members = [dict((key, value) for key, value in zip(member_columns, row) if value != None and value == value) for row in members]
members

[{'_id': 1, 'name': 'Fred Astaire', 'birthYear': 1899, 'deathYear': 1987},
 {'_id': 2, 'name': 'Lauren Bacall', 'birthYear': 1924, 'deathYear': 2014},
 {'_id': 3, 'name': 'Brigitte Bardot', 'birthYear': 1934},
 {'_id': 4, 'name': 'John Belushi', 'birthYear': 1949, 'deathYear': 1982},
 {'_id': 5, 'name': 'Ingmar Bergman', 'birthYear': 1918, 'deathYear': 2007},
 {'_id': 6, 'name': 'Ingrid Bergman', 'birthYear': 1915, 'deathYear': 1982},
 {'_id': 7, 'name': 'Humphrey Bogart', 'birthYear': 1899, 'deathYear': 1957},
 {'_id': 8, 'name': 'Marlon Brando', 'birthYear': 1924, 'deathYear': 2004},
 {'_id': 9, 'name': 'Richard Burton', 'birthYear': 1925, 'deathYear': 1984},
 {'_id': 10, 'name': 'James Cagney', 'birthYear': 1899, 'deathYear': 1986},
 {'_id': 11, 'name': 'Gary Cooper', 'birthYear': 1901, 'deathYear': 1961},
 {'_id': 12, 'name': 'Bette Davis', 'birthYear': 1908, 'deathYear': 1989},
 {'_id': 13, 'name': 'Doris Day', 'birthYear': 1922, 'deathYear': 2019},
 {'_id': 14,
  'name': 'Olivia 

### Fetching movie table, a result joins of multiple tables from IMDB SQL DB and converting to JSON dictionary

In [9]:
query = '''SELECT id AS _id, type, t.title AS title, originalTitle, startYear, endYear, runtime, avgRating, numVotes, genres, actors, directors, writers, producers
	FROM title AS t
	LEFT JOIN (SELECT title, ARRAY_AGG(g.genre) AS genres 
				FROM title_genre AS tg 
				INNER JOIN genre AS g 
				ON g.id = tg.genre 
				GROUP BY title) AS g
	ON t.id = g.title
	LEFT JOIN (SELECT title, ARRAY_AGG(writer) AS writers 
				FROM title_writer 
				GROUP BY title) AS w
	ON t.id = w.title
	LEFT JOIN (SELECT title, ARRAY_AGG(director) AS directors 
			   FROM title_director 
			   GROUP BY title) AS d
	ON t.id = d.title
	LEFT JOIN (SELECT title, ARRAY_AGG(producer) AS producers 
			   FROM title_producer 
			   GROUP BY title) AS p
	ON t.id = p.title
	LEFT JOIN (SELECT title, JSON_AGG(JSON_BUILD_OBJECT(
				   'actor', actor, 
				   'roles', characters)) AS actors 
			   FROM ( 
				   SELECT title, actor, ARRAY_AGG(c.character) AS characters 
				   FROM actor_title_character AS atc 
				   INNER JOIN character AS c 
				   ON atc.character = c.id 
				   GROUP BY title, actor) AS ta
			   GROUP BY title) AS a
	ON t.id = a.title'''
movies = fetch_sql(query)
movies[:10]

[(1,
  'short',
  'Carmencita',
  'Carmencita',
  1894,
  None,
  1,
  Decimal('5.7'),
  1949,
  ['Documentary', 'Short'],
  [{'actor': 1588970, 'roles': ['Self']}],
  [5690],
  None,
  None),
 (2,
  'short',
  'Le clown et ses chiens',
  'Le clown et ses chiens',
  1892,
  None,
  5,
  Decimal('5.8'),
  264,
  ['Short', 'Animation'],
  None,
  [721526],
  None,
  None),
 (3,
  'short',
  'Pauvre Pierrot',
  'Pauvre Pierrot',
  1892,
  None,
  4,
  Decimal('6.5'),
  1779,
  ['Animation', 'Comedy', 'Romance'],
  None,
  [721526],
  None,
  [1770680]),
 (4,
  'short',
  'Un bon bock',
  'Un bon bock',
  1892,
  None,
  12,
  Decimal('5.6'),
  179,
  ['Short', 'Animation'],
  None,
  [721526],
  None,
  None),
 (5,
  'short',
  'Blacksmith Scene',
  'Blacksmith Scene',
  1893,
  None,
  1,
  Decimal('6.2'),
  2583,
  ['Short', 'Comedy'],
  [{'actor': 443482, 'roles': ['Blacksmith']},
   {'actor': 653042, 'roles': ['Assistant']}],
  [5690],
  None,
  [249379]),
 (6,
  'short',
  'Chinese O

In [10]:
movie_columns = ['_id', 'type', 'title', 'originalTitle', 'startYear', 'endYear', 'runtime', 'avgRating', 'numVotes', 'genres', 'actors', 'directors', 'writers', 'producers']
movie_list = []
# movies = [dict((key, value) for key, value in zip(movie_columns, row) if value != None and value == value) for row in movies]
for row in movies:
    row_dict = {}
    row_zip = zip(movie_columns, row)
    for key, value in row_zip:
        if value != None and value == value:
            if isinstance(value, Decimal):
                row_dict[key] = float(str(value))
            else:
                row_dict[key] = value
    movie_list.append(row_dict)
movies = movie_list
movies

[{'_id': 1,
  'type': 'short',
  'title': 'Carmencita',
  'originalTitle': 'Carmencita',
  'startYear': 1894,
  'runtime': 1,
  'avgRating': 5.7,
  'numVotes': 1949,
  'genres': ['Documentary', 'Short'],
  'actors': [{'actor': 1588970, 'roles': ['Self']}],
  'directors': [5690]},
 {'_id': 2,
  'type': 'short',
  'title': 'Le clown et ses chiens',
  'originalTitle': 'Le clown et ses chiens',
  'startYear': 1892,
  'runtime': 5,
  'avgRating': 5.8,
  'numVotes': 264,
  'genres': ['Short', 'Animation'],
  'directors': [721526]},
 {'_id': 3,
  'type': 'short',
  'title': 'Pauvre Pierrot',
  'originalTitle': 'Pauvre Pierrot',
  'startYear': 1892,
  'runtime': 4,
  'avgRating': 6.5,
  'numVotes': 1779,
  'genres': ['Animation', 'Comedy', 'Romance'],
  'directors': [721526],
  'producers': [1770680]},
 {'_id': 4,
  'type': 'short',
  'title': 'Un bon bock',
  'originalTitle': 'Un bon bock',
  'startYear': 1892,
  'runtime': 12,
  'avgRating': 5.6,
  'numVotes': 179,
  'genres': ['Short', 'Ani

## Loading data to IMDB MongoDB

### Connecting to IMDB MongoDB

In [11]:
connection_string = "mongodb://" + mongo_config['host'] + ":" + mongo_config['port']
mongo_client = pymongo.MongoClient(connection_string)

In [12]:
imdb = mongo_client[mongo_config['database']]
imdb

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'imdb')

In [14]:
movie_collection = imdb['Movies']
movie_collection.insert_many(movies)

<pymongo.results.InsertManyResult at 0x1f109550a30>

In [None]:
member_collection = imdb['Members']
member_collection.insert_many(members)

<pymongo.results.InsertManyResult at 0x1a18e209ea0>