### Import Libraries

In [1]:
import configparser as configparser
import pandas as pd
import pymongo
import json
from decimal import Decimal

## Parsing INI File

### Funtion: To parse INI file

In [2]:
def parse_ini(section: str) -> dict:
    """
    This function parses ini file for configuration details
    :param section: section to read from ini
    :return: Dictionary of config details
    """
    config = dict()
    parser = configparser.ConfigParser()
    parser.read("imdb_database.ini")
    if parser.has_section(section):
        config_items = parser.items(section)
        for item in config_items:
            config[item[0]] = item[1]
    return config

In [3]:
mongo_config = parse_ini("mongodb")
mongo_config

{'host': 'localhost', 'database': 'imdb', 'port': '27017'}

## Processing "extra-data.json"

### Reading extra-data.json file and splitting and loading every object into list of JSON objects

In [4]:
with open('extra-data.json', 'r') as file:
    temp = file.read()  
    temp = temp.split("}\n")
    temp = [data.strip() + "}" for data in temp]
    temp = list(filter(("}").__ne__, temp))
    temp = [json.loads(data) for data in temp]

extra_data = temp
extra_data

[{'box_office_currencyLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'United States dollar'},
  'titleLabel': {'type': 'literal', 'value': 'A Good Day to Die Hard'},
  'IMDb_ID': {'type': 'literal', 'value': 'tt1606378'},
  'cost': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
   'type': 'literal',
   'value': '92000000'},
  'distributorLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'InterCom'},
  'box_office': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
   'type': 'literal',
   'value': '304654182'}},
 {'box_office_currencyLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'United States dollar'},
  'titleLabel': {'type': 'literal', 'value': "De rouille et d'os"},
  'IMDb_ID': {'type': 'literal', 'value': 'tt2053425'},
  'cost': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal',
   'type': 'literal',
   'value': '16000000'},
  'distributorLabel': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'InterCom'},
  'box

In [16]:
len(extra_data)

230825

### Counting documents with "titleLabel" field

In [5]:
count = 0
titleLabels = []
for data in extra_data:
    data1 = data
    if 'titleLabel' in data:
        titleLabels += [data['titleLabel']['value']]
        count += 1
count

114062

In [6]:
titleLabels

['A Good Day to Die Hard',
 "De rouille et d'os",
 'Fistful of Dollars',
 'Per un pugno di dollari',
 'Juno',
 'The Hobbit: An Unexpected Journey',
 'Elysium',
 'The Eagle',
 'Renaissance Man',
 'The Lord of the Rings: The Return of the King',
 'The Lord of the Rings: The Return of the King',
 '300',
 '300',
 "Dr. Seuss' How the Grinch Stole Christmas",
 'Fun with Dick and Jane',
 'The Silence of the Lambs',
 'Forrest Gump',
 'Forrest Gump',
 'The Heat',
 'Bullet to the Head',
 'The Wrestler',
 'Grown Ups',
 'Insidious',
 'Cloverfield',
 'Slumdog Millionaire',
 'Slumdog Millionaire',
 'Gran Torino',
 'Gran Torino',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Machete Kills',
 'Machete',
 'Machete',
 'Gladiator',
 'Gladiator',
 'Monsieur Ibrahim et les fleurs du Coran',
 'Duplex',
 'The Proposal',
 'A Few Good Men',
 'The Great Dictator',
 'Top Gun',
 'Collateral',
 'Vanilla Sky',
 'Dancer in the Dark',
 'True Lies',
 "A Knight's Tale",
 'Dune',
 'Dune',
 'Silver Linings Play

In [7]:
titleLabels_df = pd.DataFrame(titleLabels)
titleLabels_df.rename(columns={0: "title"}, inplace=True)
titleLabels_df = titleLabels_df.groupby(['title'])['title'].count().reset_index(name='title_count')
titleLabels_df

Unnamed: 0,title,title_count
0,!Women Art Revolution,1
1,"""Atikva"" ümid deməkdir",1
2,"""Avtoqraf""",1
3,"""Crocodile"" Dundee",1
4,"""Crocodile"" Dundee II",1
...,...,...
106206,흑표비객,1
106207,흙,2
106208,흥부,1
106209,히말라야,1


## Analyzing matches with IMDB

### Connecting to IMDB MongoDB

In [8]:
connection_string = "mongodb://" + mongo_config['host'] + ":" + mongo_config['port']
mongo_client = pymongo.MongoClient(connection_string)
imdb = mongo_client[mongo_config['database']]
imdb

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'imdb')

In [9]:
movie_collection = imdb['Movies']

### Fetching list of titles and their count from IMDB MongoDB

In [10]:
result = movie_collection.aggregate(
    [{
    "$group" : 
        {"_id" : "$title", 
         "count" : {"$sum" : 1}
         }}
    ])

output = [res for res in result]


In [11]:
titles_in_imdb = pd.DataFrame(output)
titles_in_imdb

Unnamed: 0,_id,count
0,!Next?,1
1,!Que ve el Bisbe!,1
2,!Women Art Revolution,1
3,"""#1 Fan - """"I Can Haz Mils Garage Decals?""""""",1
4,"""#173 """"The Untethered Soul""""""",1
...,...,...
4104622,Čáp - Moments of Decisions,1
4104623,Šentilj-Spielfeld - Border Crossing That Once Was,1
4104624,Špansko the Continent,1
4104625,Τhe Improvisation of Petros Mokas,1


#### IMDB title with Extra-Data titleLabel Matches

In [12]:
match_titles_df = titles_in_imdb.merge(titleLabels_df, left_on="_id", right_on="title")
match_titles_df['matches'] = match_titles_df['count'] * match_titles_df['title_count']
total_matches = match_titles_df['matches'].sum()
total_unique_matches = match_titles_df.shape[0]
title_multiple_matches = match_titles_df[match_titles_df['matches'] > 1].shape[0]
title_single_matches = match_titles_df[match_titles_df['matches'] == 1].shape[0]
output = {'Total Matches': total_matches, 'Unique Matches': total_unique_matches, 'Titles with Mutiple Matches': title_multiple_matches, 'Titles with Single Match': title_single_matches}
output

{'Total Matches': 697914,
 'Unique Matches': 69109,
 'Titles with Mutiple Matches': 31224,
 'Titles with Single Match': 37885}

### Fetching list of originalTitles and their count from IMDB MongoDB

In [13]:
result = movie_collection.aggregate(
    [{
    "$group" : 
        {"_id" : "$originalTitle", 
         "count" : {"$sum" : 1}
         }}
    ])

output = [res for res in result]

In [14]:
originaltitles_in_imdb = pd.DataFrame(output)
originaltitles_in_imdb

Unnamed: 0,_id,count
0,!Next?,1
1,!Que ve el Bisbe!,1
2,"""#1 Fan - """"I Can Haz Mils Garage Decals?""""""",1
3,"""#173 """"The Untethered Soul""""""",1
4,"""#52 Heaven Only Knows What Will Happen With T...",1
...,...,...
4125606,öregHarcos,1
4125607,öregHarcos II,1
4125608,über den Wolken,1
4125609,überRICH,1


#### IMDB originalTitle with Extra-Data titleLabel Matches

In [15]:
match_titles_df = originaltitles_in_imdb.merge(titleLabels_df, left_on="_id", right_on="title")
match_titles_df['matches'] = match_titles_df['count'] * match_titles_df['title_count']
total_matches = match_titles_df['matches'].sum()
total_unique_matches = match_titles_df.shape[0]
title_multiple_matches = match_titles_df[match_titles_df['matches'] > 1].shape[0]
title_single_matches = match_titles_df[match_titles_df['matches'] == 1].shape[0]
output = {'Total Matches': total_matches, 'Unique Matches': total_unique_matches, 'Titles with Mutiple Matches': title_multiple_matches, 'Titles with Single Match': title_single_matches}
output

{'Total Matches': 691832,
 'Unique Matches': 78411,
 'Titles with Mutiple Matches': 32517,
 'Titles with Single Match': 45894}