In [11]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import polars as pl
import pandas as pd
pd.set_option('display.float_format', lambda x: f'{x :.1f}')
from cleaner import DataCleaner
import numpy as np
from datetime import datetime
from tools import (
    order_and_rename,
    import_datasets,
    bins_generator
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
movies = pd.read_csv('clean_datasets/movies_clean.csv')
ratings = pd.read_csv(
    'movies_datasets/title_ratings.tsv',
    sep = "\t",
)
joined = pd.merge(
    movies,
    ratings,
    left_on = "titre_id",
    right_on = "tconst"
)

In [13]:
joined = joined.rename(
    columns={
        "averageRating" : "rating_avg",
        "numVotes" : "rating_votes"
    }
).drop("tconst", axis=1)

rating = joined

In [14]:
bins, names = bins_generator(rating["titre_date_sortie"].max())

rating["cuts"] = pd.cut(
    rating["titre_date_sortie"],
    bins=bins,
    labels=names
)

In [15]:
rating

Unnamed: 0,titre_id,titre_str,titre_type,titre_date_sortie,titre_date_fin,titre_duree,titre_genres,person_id,person_name,person_birthdate,person_job,person_role,person_index,rating_avg,rating_votes,cuts
0,tt0000009,Miss Jerry,movie,1894,0,45,Romance,nm0063086,Blanche Bayliss,1878,actress,"[""Miss Geraldine Holbrook (Miss Jerry)""]",1,5.3,207,<1900
1,tt0000009,Miss Jerry,movie,1894,0,45,Romance,nm0183823,William Courtenay,1875,actor,"[""Mr. Hamilton""]",2,5.3,207,<1900
2,tt0000009,Miss Jerry,movie,1894,0,45,Romance,nm1309758,Chauncey Depew,1834,actor,"[""Chauncey Depew - the Director of the New Yor...",3,5.3,207,<1900
3,tt0000009,Miss Jerry,movie,1894,0,45,Romance,nm0085156,Alexander Black,1859,director,0,4,5.3,207,<1900
4,tt0000147,The Corbett-Fitzsimmons Fight,movie,1897,0,100,"Documentary,News,Sport",nm0179163,James J. Corbett,1866,self,"[""Self""]",1,5.3,483,<1900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2527381,tt9916730,6 Gunn,movie,2017,0,116,Drama,nm10538612,Kiran Gawade,0,director,0,5,7.6,11,2010-2020
2527382,tt9916730,6 Gunn,movie,2017,0,116,Drama,nm10538614,Ujjwala Gawde,0,producer,0,6,7.6,11,2010-2020
2527383,tt9916730,6 Gunn,movie,2017,0,116,Drama,nm10538613,Abhishek Jathar,0,producer,0,7,7.6,11,2010-2020
2527384,tt9916730,6 Gunn,movie,2017,0,116,Drama,nm1957275,Suresh Deshmane,0,cinematographer,0,8,7.6,11,2010-2020


In [16]:
rating["rating_avg"].median()

6.2

In [17]:
rating["rating_avg"].describe()

count   2527386.0
mean          6.1
std           1.3
min           1.0
25%           5.3
50%           6.2
75%           7.0
max          10.0
Name: rating_avg, dtype: float64

In [18]:
condi = (
    rating["rating_avg"] == 10
)

best_movies = rating[condi]

best_movies

Unnamed: 0,titre_id,titre_str,titre_type,titre_date_sortie,titre_date_fin,titre_duree,titre_genres,person_id,person_name,person_birthdate,person_job,person_role,person_index,rating_avg,rating_votes,cuts
1311947,tt10449358,Kaputol,movie,2019,0,120,Drama,nm2921912,Popo Diaz,0,production_designer,0,10,10.0,7,2010-2020
1311948,tt10449358,Kaputol,movie,2019,0,120,Drama,nm0317737,Cherie Gil,1963,actress,"[""Kiki"",""Rina""]",1,10.0,7,2010-2020
1311949,tt10449358,Kaputol,movie,2019,0,120,Drama,nm1368756,Alfred Vargas,1981,actor,"[""Caloy"",""Robert""]",2,10.0,7,2010-2020
1311950,tt10449358,Kaputol,movie,2019,0,120,Drama,nm6651713,Ronwaldo Martin,0,actor,"[""Conrad""]",3,10.0,7,2010-2020
1311951,tt10449358,Kaputol,movie,2019,0,120,Drama,nm0032629,Angel Aquino,1973,actress,"[""Maybelle""]",4,10.0,7,2010-2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490337,tt9080346,Poets Are the Destroyers,movie,2021,0,0,Drama,nm5138377,Nancy Pop,0,director,0,5,10.0,6,2010-2020
2490338,tt9080346,Poets Are the Destroyers,movie,2021,0,0,Drama,nm3530768,J.C. Hopkins,0,writer,0,6,10.0,6,2010-2020
2490339,tt9080346,Poets Are the Destroyers,movie,2021,0,0,Drama,nm9126120,Christopher Reza Tabassi,0,producer,0,7,10.0,6,2010-2020
2490340,tt9080346,Poets Are the Destroyers,movie,2021,0,0,Drama,nm8279090,Sam T. Wilson,0,cinematographer,0,8,10.0,6,2010-2020


In [19]:
for date in names[1:]:
    condi = (
        (rating["cuts"] == date)
    )


    print(f"Average note for {date}, {rating['rating_avg'][condi].median()}")

Average note for 1900-1920, 6.0
Average note for 1920-1940, 6.2
Average note for 1940-1960, 6.3
Average note for 1960-1980, 6.1
Average note for 1980-1990, 6.1
Average note for 1990-2000, 6.1
Average note for 2000-2010, 6.3
Average note for 2010-2020, 6.3
Average note for >2021, 6.6


In [21]:
condi = rating["rating_avg"] > rating["rating_avg"].median()

rating["titre_genres"][condi].mode()

0    Drama
Name: titre_genres, dtype: object

Avg rating par genre, avg rating par acteur, avg par director, correlation entre durée et note?

In [43]:
# le film avec le plus de votes ayant un rating > à la moyenne

nodup = rating[~rating["titre_str"].duplicated(keep="first")]

condi = (
    (nodup["rating_avg"] > nodup["rating_avg"].median()) &
    (nodup["rating_votes"] > 413)
)
nodup[condi].reset_index(drop='index')

Unnamed: 0,titre_id,titre_str,titre_type,titre_date_sortie,titre_date_fin,titre_duree,titre_genres,person_id,person_name,person_birthdate,person_job,person_role,person_index,rating_avg,rating_votes,cuts
0,tt0002130,Dante's Inferno,movie,1911,0,71,"Adventure,Drama,Fantasy",nm1376180,Sandro Properzi,0,production_designer,0,10,7.0,3267,1900-1920
1,tt0002423,Passion,movie,1919,0,85,"Biography,Drama,Romance",nm0913298,Kurt Waschneck,1882,cinematographer,0,10,6.6,968,1900-1920
2,tt0002646,Atlantis,movie,1913,0,121,Drama,nm0004592,Robert Israel,1963,composer,0,10,6.5,459,1900-1920
3,tt0002844,Fantômas: In the Shadow of the Guillotine,movie,1913,0,54,"Crime,Drama",nm1666136,Georges Guérin,0,cinematographer,0,10,6.9,2426,1900-1920
4,tt0003014,Ingeborg Holm,movie,1913,0,96,Drama,nm0526234,Richard Lund,1885,actor,"[""Doctor at Poorhouse""]",10,7.0,1368,1900-1920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26512,tt9900782,Kaithi,movie,2019,0,145,"Action,Adventure,Crime",nm4237148,Sam C.S.,0,composer,0,10,8.4,38145,2010-2020
26513,tt9902160,Herself,movie,2020,0,97,Drama,nm2570742,Natalie Holt,0,composer,0,10,7.0,4436,2010-2020
26514,tt9904844,Ott Tänak: The Movie,movie,2019,0,125,"Documentary,Sport",nm2761473,Margus Malm,0,cinematographer,0,10,8.1,498,2010-2020
26515,tt9905462,Pengalila,movie,2019,0,111,Drama,nm4143605,V.T. Sreejith,0,editor,0,10,7.6,679,2010-2020


In [44]:
# le film avec le plus de votes

condi = rating["rating_votes"].max()

rating[condi]

KeyError: 2809141