## Import des librairies

In [1]:
from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv
import os

## Création de la connexion

In [2]:
load_dotenv()  # take environment variables from .env.
database_url = os.getenv("DATABASE_URL")
engine = create_engine(database_url) 

## Analyse de données

### Table title_basics

In [3]:

query = "SELECT * FROM sebastien.title_basics LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df = pd.read_sql_query(query, engine)

df.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt1521167,tvEpisode,Leerplicht,Leerplicht,False,1982.0,,21.0,Comedy,,
1,tt15211672,tvEpisode,Episode #1.839,Episode #1.839,False,,,,Talk-Show,,
2,tt15211674,tvEpisode,Episode #1.840,Episode #1.840,False,,,,Talk-Show,,
3,tt15211676,tvEpisode,Vanderbilt vs Mizzou,Vanderbilt vs Mizzou,False,2020.0,,,Sport,,
4,tt15211678,short,Djinn,Jen,False,1970.0,,,Short,,


Les films/séries qui n'ont pas suffisemment de données (trop de nan en features) seront pénalisé par le modèle par manque de données, ce qui est souhaitable puisque cela signifie qu'ils ne sont pas regardés.

#### Différence primary title et originaltitle

In [4]:
df[df["primarytitle"] == df["originaltitle"]]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt1521167,tvEpisode,Leerplicht,Leerplicht,False,1982.0,,21.0,Comedy,,
1,tt15211672,tvEpisode,Episode #1.839,Episode #1.839,False,,,,Talk-Show,,
2,tt15211674,tvEpisode,Episode #1.840,Episode #1.840,False,,,,Talk-Show,,
3,tt15211676,tvEpisode,Vanderbilt vs Mizzou,Vanderbilt vs Mizzou,False,2020.0,,,Sport,,
5,tt1521168,tvEpisode,Lenen,Lenen,False,1981.0,,32.0,Comedy,,
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt15230100,tvEpisode,Busty girlfriends toying each other,Busty girlfriends toying each other,True,2015.0,,,Adult,,
9996,tt15230102,tvEpisode,Episode #1.389,Episode #1.389,False,,,,Talk-Show,,
9997,tt15230104,tvEpisode,Episode #1.390,Episode #1.390,False,,,,Talk-Show,,
9998,tt15230106,tvEpisode,Episode #1.392,Episode #1.392,False,,,,Talk-Show,,


Primary title est le titre le plus utilisé et original title et le titre original.

#### Analyse des genres

In [5]:
df["genres"].str.split(',').explode().value_counts()

genres
Drama          2476
Talk-Show      1683
Comedy         1598
Short          1098
News            859
Documentary     829
Family          706
Game-Show       601
Horror          593
Romance         464
Action          416
Mystery         396
Animation       347
Reality-TV      342
nan             340
Sport           329
Adult           320
Music           317
Adventure       315
Crime           216
Fantasy         180
History         161
Biography       158
Thriller        117
Sci-Fi           71
Musical          47
War               9
Western           6
Name: count, dtype: int64

In [6]:
df[df["genres"] == "nan"]["titletype"].value_counts()

titletype
tvEpisode       202
tvSpecial        64
video            31
movie            23
tvSeries          9
videoGame         6
tvMovie           4
tvMiniSeries      1
Name: count, dtype: int64

#### Analyse des notations

In [7]:
df[(df["averagerating"].notna()) & (df["numvotes"].isna())]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


Il n'y a pas de ligne ayant une note sans le nombre de vote ou l'inverse

In [8]:
df[df["averagerating"] == df["averagerating"].min()]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
8329,tt15227110,tvEpisode,Caitlyn Jenner; Nancy Pelosi; Maxine Waters; R...,Caitlyn Jenner; Nancy Pelosi; Maxine Waters; R...,False,2021.0,,,News,1.0,97.0


In [9]:

query = "SELECT min(averagerating) FROM sebastien.title_basics;"
with engine.connect() as conn, conn.begin():
    df = pd.read_sql_query(query, engine)

df.head()

: 

: 

In [21]:
df["averagerating"].describe()

count    1069.000000
mean        6.955847
std         1.447928
min         1.000000
25%         6.100000
50%         7.100000
75%         8.000000
max        10.000000
Name: averagerating, dtype: float64

#### Analyse des types

In [13]:
df["titletype"].value_counts()

titletype
tvEpisode       7605
short           1106
movie            509
video            300
tvSeries         218
tvMovie          152
tvSpecial         65
tvMiniSeries      25
videoGame         12
tvShort            8
Name: count, dtype: int64

In [18]:
df[df["titletype"] == "tvEpisode"]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
1,tt1787830,tvEpisode,Troubadours: Carole King/James Taylor & the Ri...,Troubadours: Carole King/James Taylor & the Ri...,False,2011.0,,101.0,"Biography,Documentary,History",8.1,137.0
26,tt17878548,tvEpisode,Top 10 Disaster Movies That Could Really Happen,Top 10 Disaster Movies That Could Really Happen,False,2022.0,,,"Comedy,Talk-Show",,
32,tt17878590,tvEpisode,The LIBBY SHOW LIVE from Christmas Con,The LIBBY SHOW LIVE from Christmas Con,False,2021.0,,,Talk-Show,,
48,tt17878746,tvEpisode,Episode #1.4,Episode #1.4,False,2022.0,,,Drama,,
54,tt17878790,tvEpisode,Episode #1.9276,Episode #1.9276,False,2022.0,,,"Crime,Drama,Romance",,
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt1794686,tvEpisode,Episode dated 28 April 1969,Episode dated 28 April 1969,False,1969.0,,,Comedy,,
9996,tt17946862,tvEpisode,Episode #12.5,Episode #12.5,False,2022.0,,60.0,"News,Talk-Show",,
9997,tt1794687,tvEpisode,"Bampy Battles Bots/Truth, Revision, and the Le...","Bampy Battles Bots/Truth, Revision, and the Le...",False,2011.0,,25.0,"Action,Adventure,Animation",5.1,37.0
9998,tt1794688,tvEpisode,The Malone Family,The Malone Family,False,2010.0,,,Reality-TV,,


### Lien avec title_episode

In [51]:
query = f"SELECT * FROM sebastien.title_episode LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0


In [55]:
query = f"SELECT * FROM sebastien.title_basics WHERE tconst = 'tt0048893' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


In [49]:
tvEpisode = df[df["titletype"] == "tvSeries"]["tconst"].values[1]
tvEpisode

'tt1787843'

In [50]:
query = f"SELECT * FROM sebastien.title_episode WHERE parenttconst = '{tvEpisode}' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber


In [3]:
query = "SELECT * FROM sebastien.title_basics WHERE primarytitle LIKE 'Parlement' LIMIT 10;"
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt9812666,tvSeries,Parlement,Parlement,False,2020,,25,Comedy,7.7,1206


In [4]:
query = f"SELECT * FROM sebastien.title_episode WHERE parenttconst = 'tt9812666' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber
0,tt12136640,tt9812666,1,1
1,tt12136656,tt9812666,1,2
2,tt12136660,tt9812666,1,3
3,tt12136662,tt9812666,1,4
4,tt12136666,tt9812666,1,5


In [11]:
query = f"SELECT * FROM sebastien.title_basics WHERE tconst = 'tt12136666' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


tvEpisode est bien une série qui peux avoir un tconst lié dans title_episode. tvSeries est bien une série qui peux avoir un parenttconst lié dans title_episode.

### Table title_akas

In [None]:
query = f"SELECT * FROM sebastien.title_akas LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df_akas = pd.read_sql_query(query, engine)

df_akas.head()

Unnamed: 0,identifiant,tconst,ordering,title,region,language,types,attributes,isoriginaltitle
0,1238,tt0000284,2,At the Photographer's,US,\N,dvd,\N,
1,1239,tt0000284,3,Chez le photographe,FR,\N,imdbDisplay,\N,
2,1240,tt0000284,4,У фотографа,RU,\N,imdbDisplay,\N,
3,1241,tt0000285,1,Chinese Magic,\N,\N,original,\N,
4,1242,tt0000285,2,Chinese Magic,GB,\N,imdbDisplay,\N,


In [3]:
query = """
SELECT tb.tconst, tb.primarytitle, COUNT(ta.region) 
FROM sebastien.title_basics AS tb 
JOIN sebastien.title_akas AS ta 
ON tb.tconst = ta.tconst 
GROUP BY tb.tconst, tb.primarytitle, ta.region 
LIMIT 1;
"""
with engine.connect() as conn, conn.begin():
    df_akas = pd.read_sql_query(query, conn)

df_akas.head()

OperationalError: (psycopg2.errors.DiskFull) could not write to file "base/pgsql_tmp/pgsql_tmp450294.0.sharedfileset/i27of256.p0.0": No space left on device

[SQL: 
SELECT tb.tconst, tb.primarytitle, COUNT(ta.region) 
FROM sebastien.title_basics AS tb 
JOIN sebastien.title_akas AS ta 
ON tb.tconst = ta.tconst 
GROUP BY tb.tconst, tb.primarytitle, ta.region 
LIMIT 1;
]
(Background on this error at: https://sqlalche.me/e/20/e3q8)