## Import des librairies

In [1]:
from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv
import os

## Création de la connexion

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the connection string and fail early with an explicit message when it
# is missing, rather than handing None to create_engine().
database_url = os.getenv("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "DATABASE_URL is not set (expected in the environment or in a .env file)"
    )

# Engine shared by every query cell of the notebook.
engine = create_engine(database_url)

## Analyse de données

### Table title_basics

In [13]:

# Pull a 10,000-row sample of title_basics for exploration.
# NOTE(review): LIMIT without ORDER BY does not pin which rows are returned,
# so the sample may differ from one run to the next.
query = "SELECT * FROM sebastien.title_basics LIMIT 10000;"

# Run the query on the connection opened here (not on the engine), so the
# context manager owns and closes the connection that is actually used.
with engine.connect() as conn:
    df = pd.read_sql_query(query, conn)

df.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt2626600,tvEpisode,Episode #1.5,Episode #1.5,False,2008.0,,,,,
1,tt2626602,tvEpisode,Forget Me Not,Forget Me Not,False,2013.0,,44.0,"Crime,Drama,Mystery",8.3,559.0
2,tt2626604,tvEpisode,The End,The End,False,2012.0,,,Comedy,,
3,tt2626608,tvEpisode,"Promised Land, the Impossible, Tarantino Retro...","Promised Land, the Impossible, Tarantino Retro...",False,2012.0,,,Talk-Show,,
4,tt2626610,tvEpisode,Daddy Issues,Daddy Issues,False,2013.0,,43.0,"Comedy,Crime,Drama",7.8,285.0


Les films/séries qui n'ont pas suffisamment de données (trop de NaN dans les features) seront pénalisés par le modèle faute de données, ce qui est souhaitable puisque cela signifie qu'ils ne sont pas regardés.

#### Différence entre primarytitle et originaltitle

In [14]:
# Keep only the rows whose primary and original titles are identical.
same_title = df["primarytitle"].eq(df["originaltitle"])
df[same_title]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt2626600,tvEpisode,Episode #1.5,Episode #1.5,False,2008.0,,,,,
1,tt2626602,tvEpisode,Forget Me Not,Forget Me Not,False,2013.0,,44.0,"Crime,Drama,Mystery",8.3,559.0
2,tt2626604,tvEpisode,The End,The End,False,2012.0,,,Comedy,,
3,tt2626608,tvEpisode,"Promised Land, the Impossible, Tarantino Retro...","Promised Land, the Impossible, Tarantino Retro...",False,2012.0,,,Talk-Show,,
4,tt2626610,tvEpisode,Daddy Issues,Daddy Issues,False,2013.0,,43.0,"Comedy,Crime,Drama",7.8,285.0
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt26344933,tvEpisode,Episode #1.960,Episode #1.960,False,2000.0,,30.0,Drama,,
9996,tt26344935,tvEpisode,Episode #1.961,Episode #1.961,False,2000.0,,30.0,Drama,,
9997,tt26344936,tvEpisode,Episode #1.963,Episode #1.963,False,2000.0,,30.0,Drama,,
9998,tt26344937,tvEpisode,Episode #1.962,Episode #1.962,False,2000.0,,30.0,Drama,,


Primary title est le titre le plus utilisé et original title est le titre original.

#### Analyse des genres

In [17]:
# Count titles per genre: split the comma-separated genres string into a list,
# then put one genre per row before counting.
genre_lists = df["genres"].str.split(",")
genre_lists.explode().value_counts()

genres
Drama          2753
Talk-Show      2311
Comedy         2063
News           1334
Documentary    1022
Short           956
Reality-TV      638
Romance         503
Animation       366
Horror          364
Crime           362
Family          295
Action          294
Music           260
Adventure       237
Game-Show       229
Thriller        219
Sport           198
Fantasy         170
nan             156
Mystery         151
Adult           146
History         136
Sci-Fi           86
Biography        80
Musical          76
War              36
Western           3
Name: count, dtype: int64

In [20]:
# Title types of the rows whose genres value is the literal string "nan".
df.loc[df["genres"].eq("nan"), "titletype"].value_counts()

titletype
tvEpisode    99
movie        31
tvMovie       9
tvSeries      8
videoGame     6
tvSpecial     3
Name: count, dtype: int64

#### Analyse des notations

In [12]:
# Rows where exactly one of averagerating / numvotes is present. The XOR
# covers both directions: a rating without votes and votes without a rating.
has_rating = df["averagerating"].notna()
has_votes = df["numvotes"].notna()
df[has_rating ^ has_votes]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


Il n'y a aucune ligne ayant une note sans nombre de votes, ni l'inverse.

In [None]:
# Show every row that carries the lowest averagerating found in the sample.
lowest_rating = df["averagerating"].min()
df.loc[df["averagerating"].eq(lowest_rating)]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
5325,tt2632386,tvEpisode,In Steeltown,In Steeltown,False,2012.0,,,Talk-Show,1.0,34.0
5331,tt2632394,tvEpisode,Episode #3.57,Episode #3.57,False,2012.0,,,Talk-Show,1.0,34.0
5338,tt2632404,tvEpisode,Episode #3.58,Episode #3.58,False,2012.0,,,Talk-Show,1.0,35.0
5340,tt2632414,tvEpisode,Episode #3.59,Episode #3.59,False,2012.0,,,Talk-Show,1.0,34.0
5352,tt2632432,tvEpisode,Episode #3.60,Episode #3.60,False,2012.0,,,Talk-Show,1.0,34.0
5357,tt2632442,tvEpisode,Episode #3.61,Episode #3.61,False,2012.0,,,Talk-Show,1.0,34.0
5361,tt2632448,tvEpisode,Episode #3.62,Episode #3.62,False,2012.0,,,Talk-Show,1.0,34.0
5364,tt2632458,tvEpisode,Holiday Special,Holiday Special,False,2012.0,,,Talk-Show,1.0,33.0
5371,tt2632468,tvEpisode,Great Moments,Great Moments,False,2012.0,,,Talk-Show,1.0,32.0


In [None]:

# Cross-check the sample's minimum rating against the whole table.
query = "SELECT min(averagerating) FROM sebastien.title_basics;"

with engine.connect() as conn:
    # Stored under its own name: assigning the result to `df` would replace
    # the title_basics sample that the other analysis cells read.
    df_min_rating = pd.read_sql_query(query, conn)

df_min_rating.head()

Unnamed: 0,min
0,1.0


In [21]:
# Summary statistics of the averagerating column in the sample.
rating_column = df["averagerating"]
rating_column.describe()

count    1069.000000
mean        6.955847
std         1.447928
min         1.000000
25%         6.100000
50%         7.100000
75%         8.000000
max        10.000000
Name: averagerating, dtype: float64

#### Analyse des types

In [13]:
# Number of rows per title type in the sample.
df.loc[:, "titletype"].value_counts()

titletype
tvEpisode       7605
short           1106
movie            509
video            300
tvSeries         218
tvMovie          152
tvSpecial         65
tvMiniSeries      25
videoGame         12
tvShort            8
Name: count, dtype: int64

In [18]:
# Restrict the sample to the rows typed as tvEpisode.
is_tv_episode = df["titletype"].eq("tvEpisode")
df.loc[is_tv_episode]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
1,tt1787830,tvEpisode,Troubadours: Carole King/James Taylor & the Ri...,Troubadours: Carole King/James Taylor & the Ri...,False,2011.0,,101.0,"Biography,Documentary,History",8.1,137.0
26,tt17878548,tvEpisode,Top 10 Disaster Movies That Could Really Happen,Top 10 Disaster Movies That Could Really Happen,False,2022.0,,,"Comedy,Talk-Show",,
32,tt17878590,tvEpisode,The LIBBY SHOW LIVE from Christmas Con,The LIBBY SHOW LIVE from Christmas Con,False,2021.0,,,Talk-Show,,
48,tt17878746,tvEpisode,Episode #1.4,Episode #1.4,False,2022.0,,,Drama,,
54,tt17878790,tvEpisode,Episode #1.9276,Episode #1.9276,False,2022.0,,,"Crime,Drama,Romance",,
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt1794686,tvEpisode,Episode dated 28 April 1969,Episode dated 28 April 1969,False,1969.0,,,Comedy,,
9996,tt17946862,tvEpisode,Episode #12.5,Episode #12.5,False,2022.0,,60.0,"News,Talk-Show",,
9997,tt1794687,tvEpisode,"Bampy Battles Bots/Truth, Revision, and the Le...","Bampy Battles Bots/Truth, Revision, and the Le...",False,2011.0,,25.0,"Action,Adventure,Animation",5.1,37.0
9998,tt1794688,tvEpisode,The Malone Family,The Malone Family,False,2010.0,,,Reality-TV,,


### Lien avec title_episode

In [51]:
# Peek at the first rows of title_episode to see its columns.
# Plain string: the query has no placeholders, so no f-string is needed.
query = "SELECT * FROM sebastien.title_episode LIMIT 10;"

# Run the query on the connection opened here rather than on the engine.
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0


In [55]:
# Look up one specific tconst in title_basics.
# NOTE(review): the result is stored in `df_episode` although it holds
# title_basics rows; the name is kept as in the original cell.
# Plain string: the query has no placeholders, so no f-string is needed.
query = "SELECT * FROM sebastien.title_basics WHERE tconst = 'tt0048893' LIMIT 10;"

# Run the query on the connection opened here rather than on the engine.
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


In [49]:
# Take the tconst of the second tvSeries row of the sample (position 1).
# NOTE(review): despite its name, `tvEpisode` holds a tvSeries identifier.
series_tconsts = df.loc[df["titletype"] == "tvSeries", "tconst"]
tvEpisode = series_tconsts.values[1]
tvEpisode

'tt1787843'

In [50]:
# Fetch the title_episode rows whose parent is the identifier held in
# `tvEpisode`. The value is interpolated directly into the SQL text.
query = f"SELECT * FROM sebastien.title_episode WHERE parenttconst = '{tvEpisode}' LIMIT 10;"

# Run the query on the connection opened here rather than on the engine.
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber


In [3]:
# Look up the title_basics row(s) whose primarytitle matches 'Parlement'.
query = "SELECT * FROM sebastien.title_basics WHERE primarytitle LIKE 'Parlement' LIMIT 10;"

# The query runs on the connection managed by the with-block.
with engine.connect() as db_conn:
    df_episode = pd.read_sql_query(sql=query, con=db_conn)

df_episode.head(n=5)

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt9812666,tvSeries,Parlement,Parlement,False,2020,,25,Comedy,7.7,1206


In [4]:
# List title_episode rows whose parenttconst is tt9812666.
# Plain string: the query has no placeholders, so no f-string is needed.
query = "SELECT * FROM sebastien.title_episode WHERE parenttconst = 'tt9812666' LIMIT 10;"

# Run the query on the connection opened here rather than on the engine.
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber
0,tt12136640,tt9812666,1,1
1,tt12136656,tt9812666,1,2
2,tt12136660,tt9812666,1,3
3,tt12136662,tt9812666,1,4
4,tt12136666,tt9812666,1,5


In [11]:
# Check whether tconst tt12136666 has a row of its own in title_basics.
# Plain string: the query has no placeholders, so no f-string is needed.
query = "SELECT * FROM sebastien.title_basics WHERE tconst = 'tt12136666' LIMIT 10;"

# Run the query on the connection opened here rather than on the engine.
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


tvEpisode est bien un épisode dont le tconst peut être lié dans title_episode. tvSeries est bien une série dont le tconst peut apparaître comme parenttconst dans title_episode.