## Import des librairies

In [2]:
from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv
import os

## Création de la connexion

In [3]:
load_dotenv()  # take environment variables from .env.
database_url = os.getenv("DATABASE_URL")
engine = create_engine(database_url) 

## Analyse de données

### Table title_basics

In [3]:

query = "SELECT * FROM sebastien.title_basics LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df = pd.read_sql_query(query, engine)

df.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt1521167,tvEpisode,Leerplicht,Leerplicht,False,1982.0,,21.0,Comedy,,
1,tt15211672,tvEpisode,Episode #1.839,Episode #1.839,False,,,,Talk-Show,,
2,tt15211674,tvEpisode,Episode #1.840,Episode #1.840,False,,,,Talk-Show,,
3,tt15211676,tvEpisode,Vanderbilt vs Mizzou,Vanderbilt vs Mizzou,False,2020.0,,,Sport,,
4,tt15211678,short,Djinn,Jen,False,1970.0,,,Short,,


Les films/séries qui n'ont pas suffisemment de données (trop de nan en features) seront pénalisé par le modèle par manque de données, ce qui est souhaitable puisque cela signifie qu'ils ne sont pas regardés.

#### Différence primary title et originaltitle

In [4]:
df[df["primarytitle"] == df["originaltitle"]]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt1521167,tvEpisode,Leerplicht,Leerplicht,False,1982.0,,21.0,Comedy,,
1,tt15211672,tvEpisode,Episode #1.839,Episode #1.839,False,,,,Talk-Show,,
2,tt15211674,tvEpisode,Episode #1.840,Episode #1.840,False,,,,Talk-Show,,
3,tt15211676,tvEpisode,Vanderbilt vs Mizzou,Vanderbilt vs Mizzou,False,2020.0,,,Sport,,
5,tt1521168,tvEpisode,Lenen,Lenen,False,1981.0,,32.0,Comedy,,
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt15230100,tvEpisode,Busty girlfriends toying each other,Busty girlfriends toying each other,True,2015.0,,,Adult,,
9996,tt15230102,tvEpisode,Episode #1.389,Episode #1.389,False,,,,Talk-Show,,
9997,tt15230104,tvEpisode,Episode #1.390,Episode #1.390,False,,,,Talk-Show,,
9998,tt15230106,tvEpisode,Episode #1.392,Episode #1.392,False,,,,Talk-Show,,


Primary title est le titre le plus utilisé et original title et le titre original.

#### Analyse des genres

In [5]:
df["genres"].str.split(',').explode().value_counts()

genres
Drama          2476
Talk-Show      1683
Comedy         1598
Short          1098
News            859
Documentary     829
Family          706
Game-Show       601
Horror          593
Romance         464
Action          416
Mystery         396
Animation       347
Reality-TV      342
nan             340
Sport           329
Adult           320
Music           317
Adventure       315
Crime           216
Fantasy         180
History         161
Biography       158
Thriller        117
Sci-Fi           71
Musical          47
War               9
Western           6
Name: count, dtype: int64

In [6]:
df[df["genres"] == "nan"]["titletype"].value_counts()

titletype
tvEpisode       202
tvSpecial        64
video            31
movie            23
tvSeries          9
videoGame         6
tvMovie           4
tvMiniSeries      1
Name: count, dtype: int64

#### Analyse des notations

In [7]:
df[(df["averagerating"].notna()) & (df["numvotes"].isna())]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


Il n'y a pas de ligne ayant une note sans le nombre de vote ou l'inverse

In [8]:
df[df["averagerating"] == df["averagerating"].min()]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
8329,tt15227110,tvEpisode,Caitlyn Jenner; Nancy Pelosi; Maxine Waters; R...,Caitlyn Jenner; Nancy Pelosi; Maxine Waters; R...,False,2021.0,,,News,1.0,97.0


In [9]:

query = "SELECT min(averagerating) FROM sebastien.title_basics;"
with engine.connect() as conn, conn.begin():
    df = pd.read_sql_query(query, engine)

df.head()

: 

: 

In [21]:
df["averagerating"].describe()

count    1069.000000
mean        6.955847
std         1.447928
min         1.000000
25%         6.100000
50%         7.100000
75%         8.000000
max        10.000000
Name: averagerating, dtype: float64

#### Analyse des types

In [13]:
df["titletype"].value_counts()

titletype
tvEpisode       7605
short           1106
movie            509
video            300
tvSeries         218
tvMovie          152
tvSpecial         65
tvMiniSeries      25
videoGame         12
tvShort            8
Name: count, dtype: int64

In [18]:
df[df["titletype"] == "tvEpisode"]

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
1,tt1787830,tvEpisode,Troubadours: Carole King/James Taylor & the Ri...,Troubadours: Carole King/James Taylor & the Ri...,False,2011.0,,101.0,"Biography,Documentary,History",8.1,137.0
26,tt17878548,tvEpisode,Top 10 Disaster Movies That Could Really Happen,Top 10 Disaster Movies That Could Really Happen,False,2022.0,,,"Comedy,Talk-Show",,
32,tt17878590,tvEpisode,The LIBBY SHOW LIVE from Christmas Con,The LIBBY SHOW LIVE from Christmas Con,False,2021.0,,,Talk-Show,,
48,tt17878746,tvEpisode,Episode #1.4,Episode #1.4,False,2022.0,,,Drama,,
54,tt17878790,tvEpisode,Episode #1.9276,Episode #1.9276,False,2022.0,,,"Crime,Drama,Romance",,
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt1794686,tvEpisode,Episode dated 28 April 1969,Episode dated 28 April 1969,False,1969.0,,,Comedy,,
9996,tt17946862,tvEpisode,Episode #12.5,Episode #12.5,False,2022.0,,60.0,"News,Talk-Show",,
9997,tt1794687,tvEpisode,"Bampy Battles Bots/Truth, Revision, and the Le...","Bampy Battles Bots/Truth, Revision, and the Le...",False,2011.0,,25.0,"Action,Adventure,Animation",5.1,37.0
9998,tt1794688,tvEpisode,The Malone Family,The Malone Family,False,2010.0,,,Reality-TV,,


### Lien avec title_episode

In [51]:
query = f"SELECT * FROM sebastien.title_episode LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0


In [55]:
query = f"SELECT * FROM sebastien.title_basics WHERE tconst = 'tt0048893' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


In [49]:
tvEpisode = df[df["titletype"] == "tvSeries"]["tconst"].values[1]
tvEpisode

'tt1787843'

In [50]:
query = f"SELECT * FROM sebastien.title_episode WHERE parenttconst = '{tvEpisode}' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber


In [3]:
query = "SELECT * FROM sebastien.title_basics WHERE primarytitle LIKE 'Parlement' LIMIT 10;"
with engine.connect() as conn:
    df_episode = pd.read_sql_query(query, conn)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes
0,tt9812666,tvSeries,Parlement,Parlement,False,2020,,25,Comedy,7.7,1206


In [4]:
query = f"SELECT * FROM sebastien.title_episode WHERE parenttconst = 'tt9812666' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,parenttconst,seasonnumber,episodenumber
0,tt12136640,tt9812666,1,1
1,tt12136656,tt9812666,1,2
2,tt12136660,tt9812666,1,3
3,tt12136662,tt9812666,1,4
4,tt12136666,tt9812666,1,5


In [11]:
query = f"SELECT * FROM sebastien.title_basics WHERE tconst = 'tt12136666' LIMIT 10;"
with engine.connect() as conn, conn.begin():
    df_episode = pd.read_sql_query(query, engine)

df_episode.head()

Unnamed: 0,tconst,titletype,primarytitle,originaltitle,isadult,startyear,endyear,runtimeminutes,genres,averagerating,numvotes


tvEpisode est bien une série qui peux avoir un tconst lié dans title_episode. tvSeries est bien une série qui peux avoir un parenttconst lié dans title_episode.

### Table title_akas

In [None]:
query = f"SELECT * FROM sebastien.title_akas LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df_akas = pd.read_sql_query(query, engine)

df_akas.head()

Unnamed: 0,identifiant,tconst,ordering,title,region,language,types,attributes,isoriginaltitle
0,1238,tt0000284,2,At the Photographer's,US,\N,dvd,\N,
1,1239,tt0000284,3,Chez le photographe,FR,\N,imdbDisplay,\N,
2,1240,tt0000284,4,У фотографа,RU,\N,imdbDisplay,\N,
3,1241,tt0000285,1,Chinese Magic,\N,\N,original,\N,
4,1242,tt0000285,2,Chinese Magic,GB,\N,imdbDisplay,\N,


In [18]:
query = """
SELECT tb.tconst, tb.primaryTitle, ta.region
FROM sebastien.title_basics AS tb 
JOIN sebastien.title_akas AS ta 
ON tb.tconst = ta.tconst
LIMIT 200;
"""
with engine.connect() as conn, conn.begin():
    df_akas = pd.read_sql_query(query, conn)

df_akas.head()

Unnamed: 0,tconst,primarytitle,region
0,tt33889296,Episode #1.220,IT
1,tt33889296,Episode #1.220,DE
2,tt33889296,Episode #1.220,JP
3,tt33889296,Episode #1.220,ES
4,tt33889297,Episode #1.221,\N


In [5]:
df_akas.groupby(["tconst", "primarytitle"])["region"].count()

tconst      primarytitle                         
tt33889296  Episode #1.220                           4
tt33889297  Episode #1.221                           8
tt33889299  Episode #1.222                           8
tt3388930   Episode #1.36                            8
tt33889300  Episode #1.223                           8
tt33889301  Denise Darcel                            1
tt33889302  Episode #1.224                           8
tt33889303  Episode #1.225                           8
tt33889304  Episode #1.226                           8
tt33889305  The Rats of Venice                       2
tt33889306  Episode #1.227                           8
tt33889307  Episode #1.228                           8
tt33889308  Episode #1.229                           8
tt33889309  Episode #1.230                           8
tt33889310  Episode #1.231                           8
tt33889311  Episode #1.232                           8
tt33889312  Episode #1.233                           8
tt33889313  Epi

On peux utiliser le nombre de régions par film dans les features, mais en utilisant pandas et non sql pour calculer le nombre de régions par film.

In [20]:
import numpy as np


df_akas["region"] = df_akas["region"].replace("\\N", np.nan)
df_akas.dropna(subset=["region"], inplace=True)
df_akas.groupby(["tconst", "primarytitle"])['region'].agg(list)

tconst      primarytitle                         
tt33889296  Episode #1.220                                       [IT, DE, JP, ES]
tt33889297  Episode #1.221                           [PT, IN, FR, IT, DE, JP, ES]
tt33889299  Episode #1.222                           [PT, IN, FR, IT, DE, JP, ES]
tt3388930   Episode #1.36                            [PT, IN, FR, IT, DE, JP, ES]
tt33889300  Episode #1.223                           [PT, IN, FR, IT, DE, JP, ES]
tt33889302  Episode #1.224                           [PT, IN, FR, IT, DE, JP, ES]
tt33889303  Episode #1.225                           [PT, IN, FR, IT, DE, JP, ES]
tt33889304  Episode #1.226                           [PT, IN, FR, IT, DE, JP, ES]
tt33889305  The Rats of Venice                                               [GB]
tt33889306  Episode #1.227                           [PT, IN, FR, IT, DE, JP, ES]
tt33889307  Episode #1.228                           [PT, IN, FR, IT, DE, JP, ES]
tt33889308  Episode #1.229                      

On peux aussi utiliser la liste des régions par films comme features.

In [23]:
df_akas["region"].unique()

array(['IT', 'DE', 'JP', 'ES', 'PT', 'IN', 'FR', 'GB', 'BR'], dtype=object)

### Analyse de title_principals

In [28]:
query = f"SELECT * FROM sebastien.title_principals LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df_principals = pd.read_sql_query(query, engine)

df_principals.head()

Unnamed: 0,identifiant,tconst,ordering,nconst,category,job,characters
0,33562099,tt15312350,3,nm6004811,actress,\N,"[""Head Constable Pushpa Singh""]"
1,33562100,tt15312350,4,nm8941959,actress,\N,"[""Constable Santosh""]"
2,33562101,tt15312350,5,nm10176104,actor,\N,"[""Constable Cheetah Chaturvedi""]"
3,33562102,tt15312350,6,nm4861794,director,\N,\N
4,33562103,tt15312350,7,nm14167000,writer,dialogue,\N


In [50]:
query = f"SELECT * FROM sebastien.title_principals as tp JOIN sebastien.name_basics as nb ON nb.nconst = tp.nconst LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df_principals = pd.read_sql_query(query, engine)

df_principals.head(25)

Unnamed: 0,identifiant,tconst,ordering,nconst,category,job,characters,nconst.1,primaryname,birthyear,deathyear,primaryprofession,knownfortitles
0,33599337,tt15322122,26,nm0892064,casting_director,\N,\N,nm0892064,René Veilleux,,,"casting_director,casting_department,miscellaneous","tt12681586,tt13090810,tt13352178,tt3485722"
1,33599338,tt15322124,1,nm1930205,self,\N,"[""Self - Presenter""]",nm1930205,Jules Hudson,1970.0,,"actor,archive_footage","tt0165064,tt0445887,tt13051960,tt16972146"
2,33599339,tt15322126,1,nm4050901,self,\N,"[""Self - Presenter""]",nm4050901,Steve Brown,,,,"tt30485397,tt3770700,tt6377480,tt15317252"
3,33599340,tt15322128,1,nm1478339,actor,\N,"[""Niko Schuurmans""]",nm1478339,Jo Hens,,,actor,"tt0162076,tt9756106,tt0217205,tt2981372"
4,33599341,tt15322128,2,nm0535434,actress,\N,"[""Mieke Van den Bossche""]",nm0535434,Caroline Maes,1976.0,,"actress,archive_footage","tt0243733,tt0219394,tt0217221,tt0287269"
5,33599342,tt15322128,3,nm0029239,actress,\N,"[""Veronique Van den Bossche""]",nm0029239,Sandrine André,1973.0,,"actress,archive_footage","tt0830854,tt0162076,tt1648168,tt3641514"
6,33599343,tt15322128,4,nm0505447,actor,\N,"[""Peter Van den Bossche""]",nm0505447,Gunther Levi,1976.0,,"actor,archive_footage","tt0227974,tt0162076,tt8736020,tt32749395"
7,33599344,tt15322128,5,nm0888863,actor,\N,"[""Benny Coppens""]",nm0888863,Roel Vanderstukken,1976.0,,"actor,archive_footage","tt1688211,tt1298297,tt0217205,tt3543082"
8,33599345,tt15322128,6,nm5365532,actress,\N,"[""Stefanie Coppens""]",nm5365532,Jasmijn Van Hoof,,,"actress,script_department","tt5203748,tt2981372,tt13850522,tt4047050"
9,33599346,tt15322128,7,nm0737332,actor,\N,"[""Lars De Wulf""]",nm0737332,Kürt Rogiers,1971.0,,"actor,writer,archive_footage","tt0382365,tt4733540,tt1467394,tt2276275"


In [27]:
df_principals["category"].value_counts()

category
actor                  2014
actress                1686
writer                 1452
self                   1191
director               1161
producer                775
editor                  676
cinematographer         424
composer                336
production_designer     138
casting_director         81
archive_footage          66
Name: count, dtype: int64

La catégorie peux être une feature, en gardant les acteur / actrices / self (personne qui se représente elle même type célébrité ou emission de télé ou autobiographie) / producteur

In [60]:
df_principals["knownfortitles2"] = df_principals["knownfortitles"].apply(lambda x: len(x.split(",")) if x else 0)
df_principals

Unnamed: 0,identifiant,tconst,ordering,nconst,category,job,characters,nconst.1,primaryname,birthyear,deathyear,primaryprofession,knownfortitles,knownfortitles2
0,33599337,tt15322122,26,nm0892064,casting_director,\N,\N,nm0892064,René Veilleux,,,"casting_director,casting_department,miscellaneous","tt12681586,tt13090810,tt13352178,tt3485722",4
1,33599338,tt15322124,1,nm1930205,self,\N,"[""Self - Presenter""]",nm1930205,Jules Hudson,1970.0,,"actor,archive_footage","tt0165064,tt0445887,tt13051960,tt16972146",4
2,33599339,tt15322126,1,nm4050901,self,\N,"[""Self - Presenter""]",nm4050901,Steve Brown,,,,"tt30485397,tt3770700,tt6377480,tt15317252",4
3,33599340,tt15322128,1,nm1478339,actor,\N,"[""Niko Schuurmans""]",nm1478339,Jo Hens,,,actor,"tt0162076,tt9756106,tt0217205,tt2981372",4
4,33599341,tt15322128,2,nm0535434,actress,\N,"[""Mieke Van den Bossche""]",nm0535434,Caroline Maes,1976.0,,"actress,archive_footage","tt0243733,tt0219394,tt0217221,tt0287269",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,33609332,tt15324358,15,nm11011614,cinematographer,cinematographer,\N,nm11011614,Manish V Soni,,,cinematographer,"tt15464420,tt11867650,tt9760930,tt15458502",4
9996,33609333,tt15324358,16,nm9948754,cinematographer,\N,\N,nm9948754,Rahul Soni,,,cinematographer,"tt27727527,tt32536908,tt5929848,tt11867650",4
9997,33609334,tt15324358,17,nm7477080,editor,\N,\N,nm7477080,Pankaj Katpal,,,editor,"tt15234990,tt4886424,tt11867650,tt33047479",4
9998,33609335,tt15324358,18,nm12270003,editor,\N,\N,nm12270003,Rahul Mathur,,,"editor,miscellaneous","tt14071150,tt11867650,tt7209352,tt12651104",4


In [62]:
df_principals.groupby("category")["knownfortitles2"].mean()

category
actor                  3.214835
actress                3.576488
archive_footage        3.285714
casting_director       3.650000
cinematographer        3.454777
composer               3.167920
director               2.673729
editor                 3.032644
producer               3.047847
production_designer    3.863636
self                   3.035011
writer                 2.673077
Name: knownfortitles2, dtype: float64

On est pas sûre de la pertinence de cette features, on ne sait pas.

### Analyse de title_crew

In [63]:
query = f"SELECT * FROM sebastien.title_crew LIMIT 10000;"
with engine.connect() as conn, conn.begin():
    df_crew = pd.read_sql_query(query, engine)

df_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0008746,nm0454535,"nm0166777,nm0795082"
1,tt0008747,\N,\N
2,tt0008748,nm0911959,nm0518844
3,tt0008749,nm0204677,\N
4,tt0008750,nm0912817,nm0551361


In [64]:
df_crew.head(35)

Unnamed: 0,tconst,directors,writers
0,tt0008746,nm0454535,"nm0166777,nm0795082"
1,tt0008747,\N,\N
2,tt0008748,nm0911959,nm0518844
3,tt0008749,nm0204677,\N
4,tt0008750,nm0912817,nm0551361
5,tt0008752,\N,nm0907778
6,tt0008753,nm0912817,"nm0167424,nm0398464"
7,tt0008754,nm0154352,"nm0547966,nm0791014"
8,tt0008755,nm0916113,\N
9,tt0008756,nm0588056,nm0370271


Si on a le temps, ca peux être une features d'amélioration future.