In [1]:
# Installing the packages that will be used. The "!" on the commands it's for use in the OS, out of Jupyter.
# This is for use SQLite
!pip install -q imdb-sqlite
# This is for use the countries name and abbreviations
!pip install -q pycountry

In [2]:
# Importing the necessary packages and ignoring some warnings
import re
import time
import sqlite3
import pycountry
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
sns.set_theme(style = "whitegrid")

In [4]:
# Jupyter command for see the execution time
%time
# Download data from IMDB
!imdb-sqlite

Wall time: 0 ns


2021-09-05 10:54:02,279 DB already exists: (imdb.db). Refusing to modify. Exiting


In [5]:
# Connecting to database
conn = sqlite3.connect("imdb.db")
# Makes a table list
tabelas = pd.read_sql_query("SELECT NAME AS 'Table_Name' FROM sqlite_master WHERE type = 'table'", conn)
# Show the result
tabelas.head()

Unnamed: 0,Table_Name
0,people
1,titles
2,akas
3,crew
4,episodes


In [6]:
# Converting dataframe to a list
tabelas = tabelas["Table_Name"].values.tolist()
# Extracting the list
for tabela in tabelas:
    consulta = "PRAGMA TABLE_INFO({})".format(tabela)
    resultado = pd.read_sql_query(consulta, conn)
    print("Esquema da tabela:", tabela)
    display(resultado)
    print("-"*100)
    print("\n")

Esquema da tabela: people


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,person_id,VARCHAR,0,,1
1,1,name,VARCHAR,0,,0
2,2,born,INTEGER,0,,0
3,3,died,INTEGER,0,,0


----------------------------------------------------------------------------------------------------


Esquema da tabela: titles


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,title_id,VARCHAR,0,,1
1,1,type,VARCHAR,0,,0
2,2,primary_title,VARCHAR,0,,0
3,3,original_title,VARCHAR,0,,0
4,4,is_adult,INTEGER,0,,0
5,5,premiered,INTEGER,0,,0
6,6,ended,INTEGER,0,,0
7,7,runtime_minutes,INTEGER,0,,0
8,8,genres,VARCHAR,0,,0


----------------------------------------------------------------------------------------------------


Esquema da tabela: akas


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,title_id,VARCHAR,0,,0
1,1,title,VARCHAR,0,,0
2,2,region,VARCHAR,0,,0
3,3,language,VARCHAR,0,,0
4,4,types,VARCHAR,0,,0
5,5,attributes,VARCHAR,0,,0
6,6,is_original_title,INTEGER,0,,0


----------------------------------------------------------------------------------------------------


Esquema da tabela: crew


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,title_id,VARCHAR,0,,0
1,1,person_id,VARCHAR,0,,0
2,2,category,VARCHAR,0,,0
3,3,job,VARCHAR,0,,0
4,4,characters,VARCHAR,0,,0


----------------------------------------------------------------------------------------------------


Esquema da tabela: episodes


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,episode_title_id,INTEGER,0,,0
1,1,show_title_id,INTEGER,0,,0
2,2,season_number,INTEGER,0,,0
3,3,eposide_number,INTEGER,0,,0


----------------------------------------------------------------------------------------------------


Esquema da tabela: ratings


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,title_id,VARCHAR,0,,1
1,1,rating,INTEGER,0,,0
2,2,votes,INTEGER,0,,0


----------------------------------------------------------------------------------------------------




### Most common movies category

In [7]:
# Makes a SQL consult
consulta1 = '''SELECT type, COUNT(*) AS COUNT FROM titles GROUP BY type'''
# Results
resultado1 = pd.read_sql_query(consulta1, conn)
# Show the result
display(resultado1)

Unnamed: 0,type,COUNT
0,movie,586428
1,radioEpisode,1
2,radioSeries,1
3,short,828461
4,tvEpisode,6027449
5,tvMiniSeries,39532
6,tvMovie,132765
7,tvPilot,1
8,tvSeries,213736
9,tvShort,10247


In [8]:
# Percentage for each type
resultado1['percentual'] = (resultado1['COUNT'] / resultado1['COUNT'].sum()) * 100
# Show the result
display(resultado1)

Unnamed: 0,type,COUNT,percentual
0,movie,586428,7.132905
1,radioEpisode,1,1.2e-05
2,radioSeries,1,1.2e-05
3,short,828461,10.076827
4,tvEpisode,6027449,73.313724
5,tvMiniSeries,39532,0.48084
6,tvMovie,132765,1.614862
7,tvPilot,1,1.2e-05
8,tvSeries,213736,2.599737
9,tvShort,10247,0.124637


In [9]:
# Choosing only the top 3 categories and sum all the rest
# Empty dictionary
others = {}
# Filtering the percentage
others['COUNT'] = resultado1[resultado1['percentual'] < 5]['COUNT'].sum()
others['percentual'] = resultado1[resultado1['percentual'] < 5]['percentual'].sum()
# The rest
others['type'] = 'others'
# Show the result
display(others)

{'COUNT': 779109, 'percentual': 9.476543484376897, 'type': 'others'}

In [None]:
# Filtra o dataframe de resultado
resultado1 = resultado1[resultado1['percentual'] > 5]
# Append com o dataframe de outras categorias
resultado1 = resultado1.append(others, ignore_index = True)
# Ordena o resultado
resultado1 = resultado1.sort_values(by = 'COUNT', ascending = False)
# Show the result
resultado1.head()