In [2]:
# List comprehension & list flattening

l = [
    [1, 2, 3],
    [4, 5, 6]
]


[
    sublist 
    for sublist in l 
]


[
    item
    for sublist in l 
    for item  in sublist
]


for sublist in l:
    for item in sublist:
        print(item)

#[1, 2, 3, 4, 5, 6]

1
2
3
4
5
6


In [101]:
import pandas as pd

df = pd.read_csv("../data/imdb/movie_metadata.csv")



# Question 1
score_mean_per_dir = df.groupby("director_name").imdb_score.mean()

# Example d'accès à une valeur sur la base de l'id de la ligne
print(score_mean_per_dir.loc["Quentin Tarantino"])

score_mean_per_dir.sort_values(ascending=False)[:10]


# Question 2
score_mean_per_country = df.groupby("country").imdb_score.mean()
# exemple
print(score_mean_per_country.loc["France"])
score_mean_per_country.sort_values(ascending=False)[:10]


# Question 2 (pour un seul genre)

# Méthode 1 avec une lambda fonction
df["Action"] = df["genres"].map(lambda string: "Action" in string.split("|"))


# Méthode 2 avec une fonction définie préalablement
def has_action_in_genres(string):
    return "Action" in string.split("|")

df["Action"] = df["genres"].map(has_action_in_genres)

# Question 2 (pour tous les genres)

# On crée la liste avec les sous listes de genres
genres = [ 
    genres.split("|")
    for genres in df["genres"].tolist()
]

# On flatten la liste
genres = [
    genre 
    for genres_sublist in genres
    for genre in genres_sublist
]

# On récupère les valeurs
genres = list(set(genres))

for genre in genres:
    df[genre] = df["genres"].map(lambda string: genre in string.split("|"))
    
df[genres].sum()

8.2
6.678571428571429


News              3
War             213
Romance        1107
Horror          565
Reality-TV        2
Film-Noir         6
Sport           182
Adventure       923
Sci-Fi          616
Fantasy         610
Family          546
Mystery         500
History         207
Drama          2594
Animation       242
Western          97
Game-Show         1
Thriller       1411
Short             5
Biography       293
Action         1153
Comedy         1872
Music           214
Documentary     121
Crime           889
Musical         132
dtype: int64

In [49]:
df[genres].sum()
df.sum()

  df.sum()


num_critic_for_reviews                                                699990.0
duration                                                              539007.0
director_facebook_likes                                              3390669.0
actor_3_facebook_likes                                               3237949.0
actor_1_facebook_likes                                              33036397.0
gross                                                           201580106904.0
genres                       Action|Adventure|Fantasy|Sci-FiAction|Adventur...
movie_title                  Avatar Pirates of the Caribbean: At World's En...
num_voted_users                                                      421938535
cast_total_facebook_likes                                             48912379
facenumber_in_poster                                                    6897.0
movie_imdb_link              http://www.imdb.com/title/tt0499549/?ref_=fn_t...
num_user_for_reviews                                

In [50]:
print(pd.Series([True, True, False]) & pd.Series([True, False, False]))
print(pd.Series([True, True, False]) | pd.Series([True, False, False]))

0     True
1    False
2    False
dtype: bool
0     True
1     True
2    False
dtype: bool


In [51]:
test = pd.DataFrame(
    [
        ["A", 1],
        ["B", 1],
        ["B", 2],
        ["C", 2]
    ]
)
test.columns = ["col_1", "col_2"]
# test[test.col_1 == "B"]
# test.col_1 == "B"

test.col_1[test.col_2 ==1]

0    A
1    B
Name: col_1, dtype: object

In [52]:
for genre in genres:
    movies_2010 = df[(df.title_year==2010) & df[genre]]
    movies_2000 = df[(df.title_year==2000) & df[genre]]

    count_movies_2010 = len(movies_2010)
    count_movies_2000 = len(movies_2000)
    
    if count_movies_2000 ==0:
        print(f"No movies for genre {genre} in 2000")
        continue

    # Autre solution
    # count_action_movies_2010 = action_movies_2010.shape[0]
    # count_action_movies_2000 = action_movies_2000.shape[0]

    # Evolution en pourcent : (nouveau - ancien) / ancien
    # (moyen mnémotechnique : nouveau = (1+evol%) * ancien)
    count_evolution = (count_movies_2010 - count_movies_2000) / count_movies_2000
    print(genre, round(count_evolution * 100, 2), "%")

No movies for genre News in 2000
War 250.0 %
Romance 40.91 %
Horror 62.5 %
No movies for genre Reality-TV in 2000
No movies for genre Film-Noir in 2000
Sport -44.44 %
Adventure 19.35 %
Sci-Fi 10.53 %
Fantasy 89.47 %
Family 78.95 %
Mystery 28.57 %
History 250.0 %
Drama 30.34 %
Animation 30.0 %
Western 50.0 %
No movies for genre Game-Show in 2000
Thriller 27.66 %
No movies for genre Short in 2000
Biography 57.14 %
Action 64.71 %
Comedy 26.67 %
Music -30.0 %
Documentary 250.0 %
Crime -16.67 %
Musical 100.0 %


In [53]:
for genre in genres:
    for year_delta in range(10):
        movies_n = df[(df.title_year==2000 + year_delta) & df[genre]]
        movies_nplus1 = df[(df.title_year== 2000 + year_delta + 1) & df[genre]]

        count_movies_n = len(movies_n)
        count_movies_nplus1 = len(movies_nplus1)

        if count_movies_n ==0:
            print(f"No movies for genre {genre} in 2000")
            continue

        # Autre solution
        # count_action_movies_2010 = action_movies_2010.shape[0]
        # count_action_movies_2000 = action_movies_2000.shape[0]

        # Evolution en pourcent : (nouveau - ancien) / ancien
        # (moyen mnémotechnique : nouveau = (1+evol%) * ancien)
        count_evolution = (count_movies_nplus1 - count_movies_n) / count_movies_n
        print(
            genre, 
            2000+year_delta, 
            "to" , 
            2001+year_delta+1, 
            round(count_evolution * 100, 2), 
            "%"
        )

No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
No movies for genre News in 2000
News 2009 to 2011 -100.0 %
War 2000 to 2002 300.0 %
War 2001 to 2003 25.0 %
War 2002 to 2004 -20.0 %
War 2003 to 2005 0.0 %
War 2004 to 2006 25.0 %
War 2005 to 2007 10.0 %
War 2006 to 2008 18.18 %
War 2007 to 2009 38.46 %
War 2008 to 2010 -77.78 %
War 2009 to 2011 75.0 %
Romance 2000 to 2002 15.91 %
Romance 2001 to 2003 0.0 %
Romance 2002 to 2004 -25.49 %
Romance 2003 to 2005 86.84 %
Romance 2004 to 2006 -21.13 %
Romance 2005 to 2007 -3.57 %
Romance 2006 to 2008 1.85 %
Romance 2007 to 2009 5.45 %
Romance 2008 to 2010 5.17 %
Romance 2009 to 2011 1.64 %
Horror 2000 to 2002 -25.0 %
Horror 2001 to 2003 25.0 %
Horror 2002 to 2004 0.0 %
Horror 2003 to 2005 20.0 %
Horror 2004 to 2006 44.44 %
Horror 20

In [54]:
df["country"]

0          USA
1          USA
2           UK
3          USA
4          NaN
         ...  
5038    Canada
5039       USA
5040       USA
5041       USA
5042       USA
Name: country, Length: 5043, dtype: object

In [55]:
# set(df["country"])
for country in df["country"].unique():
    for year_delta in range(10):
        movies_n = df[(df.title_year==2000 + year_delta) & (df["country"]==country)]
        movies_nplus1 = df[(df.title_year== 2000 + year_delta + 1) & (df["country"]==country)]

        count_movies_n = len(movies_n)
        count_movies_nplus1 = len(movies_nplus1)

        if count_movies_n ==0:
            print(f"No movies for country {country} in 2000")
            continue

        # Autre solution
        # count_action_movies_2010 = action_movies_2010.shape[0]
        # count_action_movies_2000 = action_movies_2000.shape[0]

        # Evolution en pourcent : (nouveau - ancien) / ancien
        # (moyen mnémotechnique : nouveau = (1+evol%) * ancien)
        count_evolution = (count_movies_nplus1 - count_movies_n) / count_movies_n
        print(
            country, 
            2000+year_delta, 
            "to" , 
            2000+year_delta+1, 
            round(count_evolution * 100, 2), 
            "%"
        )

USA 2000 to 2001 9.56 %
USA 2001 to 2002 5.37 %
USA 2002 to 2003 -14.01 %
USA 2003 to 2004 15.56 %
USA 2004 to 2005 -1.28 %
USA 2005 to 2006 7.14 %
USA 2006 to 2007 -8.48 %
USA 2007 to 2008 7.95 %
USA 2008 to 2009 12.88 %
USA 2009 to 2010 -9.24 %
UK 2000 to 2001 -25.0 %
UK 2001 to 2002 83.33 %
UK 2002 to 2003 -72.73 %
UK 2003 to 2004 250.0 %
UK 2004 to 2005 9.52 %
UK 2005 to 2006 -17.39 %
UK 2006 to 2007 0.0 %
UK 2007 to 2008 21.05 %
UK 2008 to 2009 -8.7 %
UK 2009 to 2010 0.0 %
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country nan in 2000
No movies for country New Zealand in 2000
New Zealand 2001 to 2002 0.0 %
New Zealand 2002 to 2003 -100.0 %
No movies for country New Zealand in 2000
No movies for country New Ze

In [None]:
       print(
            genre, 
            2000+year_delta, 
            "to" , 
            2001+year_delta+1, 
            round(count_evolution * 100, 2), 
            "%"
        )

In [100]:
for genre in genres:
    for year_delta in range(10):
        movies_n = df[(df.title_year==2000 + year_delta) & df[genre]]
        movies_nplus1 = df[(df.title_year== 2000 + year_delta + 1) & df[genre]]

        count_movies_n = len(movies_n)
        count_movies_nplus1 = len(movies_nplus1)

        if count_movies_n ==0:
            print(f"No movies for genre {genre} in 2000")
            continue

        # Autre solution
        # count_action_movies_2010 = action_movies_2010.shape[0]
        # count_action_movies_2000 = action_movies_2000.shape[0]

        # Evolution en pourcent : (nouveau - ancien) / ancien
        # (moyen mnémotechnique : nouveau = (1+evol%) * ancien)
        count_evolution = (count_movies_nplus1 - count_movies_n) / count_movies_n
 

KeyError: 'News'

In [57]:
df[df["movie_title"].str.lower().str.contains("chain saw")]
df.drop_duplicates(["movie_title", "director_name", "title_year"])[df["movie_title"].str.lower().str.contains("chain saw")]

  df.drop_duplicates(["movie_title", "director_name", "title_year"])[df["movie_title"].str.lower().str.contains("chain saw")]


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,Western,Game-Show,Thriller,Short,Biography,Comedy,Music,Documentary,Crime,Musical
3278,Color,Tobe Hooper,277.0,88.0,365.0,177.0,Edwin Neal,383.0,30859000.0,Horror|Thriller,...,False,False,True,False,False,False,False,False,False,False


In [65]:
# Project Blair-Witch look-up
# df.sort_values("profitability", ascending=False)[["movie_title", "profitability"]]

In [73]:
df = df[df.title_year >= 2010]

df["profitability"] = df.gross / df.budget
# df[df.profitability<100].profitability.hist(bins=100)



profitable_directors = df.sort_values("profitability", ascending=False)\
                         .director_name[:50]\
                         .unique()

print(len(profitable_directors))

44


## Sur un seul réalisateur

In [83]:
director_name = "Ethan Coen"

subset = df[df.director_name==director_name][["movie_title", "profitability"]].copy()

# A la main
subset.profitability.sum() / len(subset)

# Direct avec pandas
subset.profitability.mean()

2.35520838277512

In [86]:
mean_profitabilities = {}

for director_name in df.director_name.unique():
    # Subset
    subset = df[df.director_name==director_name][["movie_title", "profitability"]].copy()
    # Moyenne
    mean_profitabilities[director_name] = subset.profitability.mean()

alamano = pd.Series(mean_profitabilities)

Unnamed: 0,0,profitability,budget
Sam Mendes,1.286744,1.286744,2.150000e+08
Christopher Nolan,1.586807,1.586807,1.916667e+08
Andrew Stanton,0.277052,0.277052,2.637000e+08
Nathan Greno,0.772336,0.772336,2.600000e+08
Joss Whedon,2.500715,2.500715,2.300000e+08
...,...,...,...
Thomas L. Phillips,,,2.000000e+05
Edward Burns,0.509333,0.509333,9.000000e+03
Scott Smith,,,
Benjamin Roberds,,,1.400000e+03


In [96]:
import sklearn
import matplotlib.pyplot as plt

In [78]:
df.director_name.value_counts().head(50)

Ridley Scott             6
Steven Soderbergh        5
Steven Spielberg         5
Shawn Levy               5
Tyler Perry              5
David Ayer               5
James Wan                5
Jaume Collet-Serra       5
Tim Burton               5
Joe Wright               5
David O. Russell         5
Nicholas Stoller         4
Dennis Dugan             4
Jon M. Chu               4
Woody Allen              4
Jonathan Liebesman       4
Kenneth Branagh          4
Robert Schwentke         4
Francis Lawrence         4
Simon West               4
Roland Emmerich          4
Kevin Smith              4
Timur Bekmambetov        4
Zack Snyder              4
Tim Story                4
Paul Feig                4
Paul W.S. Anderson       4
Joseph Kosinski          4
Frank Coraci             4
Clint Eastwood           4
Angelina Jolie Pitt      4
Michael Bay              4
David Gordon Green       4
Danny Boyle              4
Antoine Fuqua            3
Ethan Coen               3
Rob Letterman            3
B

In [99]:
df = pd.read_csv("../data/imdb/movie_metadata.csv")

mask_genres = df["Adventure"] | df["Action"] | df["Fantasy"]
mask_countries = (df["country"] == "USA") | (df["country"] == "United Kingdom") | (df["country"] == "Australia")
mask_directors = (df["director_name"] == "Ethan Coen")
mask = mask_genres & mask_countries & mask_directors

df[mask].groupby("title_year").imdb_score.mean()

KeyError: 'Adventure'

In [None]:
def generate_random_price():
    return np.random.choice(
        [12, 14, 15, 16, 17, 18], 
        p=[0.4,0.05,0.1,0.05,0.2, 0.2]
    )



import datetime
import random

start_date = datetime.datetime(2022, 7, 1)
end_date = datetime.datetime(2022, 12, 31)

start_timestamp = start_date.timestamp()
end_timestamp = end_date.timestamp()

difference = end_timestamp - start_timestamp

random_integer = random.randint(0, int(difference))
random_date = datetime.datetime.fromtimestamp(start_timestamp + random_integer)

print(random_date)