# Équipe 7
## Notebook de Jean-Christophe

In [70]:
import numpy as np
import pandas as pd
import datetime

import dash
import dash_html_components as html
import dash_core_components as dcc
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

## Load data

In [2]:
def load_master_film_file():

    # Read .csv
    temp_df = pd.read_csv(
        filepath_or_buffer="../Src/Assets/Data/Jc/film_vsMaitre.csv",
        sep=',',
        header=0,
        low_memory=False,
        dtype={
            'FilmoId': int,
            'titreOriginal': str,
            'anneeSortie': int,
            'GeneriqueId': float,
            'fonction': str,
            'nom': str,
            'prenom': str,
            'nomComplet': str,
            'langue': str,
            'pays': str,
            'continent': str,
            'capitale': str,
            'planete': str,
            'genreIdentifiant': str,
            'genre': str,
            'sousGenre0': str,
            'sousGenre1': str
        }
    )

    # Make sure missing value are None
    temp_df = temp_df.replace(r'^\s*$', None, regex=True)

    return temp_df

## Preprocess

In [3]:
def clean_duplicates_for_scatterplot_matrix(df):
    # Drop duplicates due to irrelevant fields
    temp_df = df.drop_duplicates(
        subset=['titreOriginal','anneeSortie', 'pays', 'genre'], 
        keep='first'
    )

    return temp_df

In [44]:
def clean_country_column_for_scatterplot_matrix(df): 
    
    def extract_country(x):
        temp = x.split(':')
        if len(temp)==2:
            return temp[0][:-1]
        else:
            return x
        
    # Explicit copy
    temp_df = df.copy()
    
    # Set missing values
    temp_df['pays'] = temp_df['pays'].fillna("n.d.")
    
    # Extract country 
    temp_df['pays'] = temp_df['pays'].apply(lambda x: extract_country(x))
    
    return temp_df

In [40]:
def clean_genre_column_for_scatterplot_matrix(df):  
    
    # Explicit copy
    temp_df = df.copy()
    
    # Replace value
    temp_df['genre'] = temp_df['genre'].replace("FICTION SPÉCULATIVE","SCIENCE FICTION")
    
    # Set missing values
    temp_df['genre'] = temp_df['genre'].fillna("n.d.")
    
    return temp_df

In [41]:
def clean_film_df_for_scatterplot_matrix(film_df):
    
    # Explicit copy
    temp_df = film_df.copy()
    
    # Clean duplicates, year, title and genre
    temp_df = clean_duplicates_for_scatterplot_matrix(df=temp_df)
    temp_df = clean_country_column_for_scatterplot_matrix(df=temp_df)
    temp_df = clean_genre_column_for_scatterplot_matrix(df=temp_df)

    return temp_df

In [50]:
def prepare_data_for_scatterplot_matrix(raw_scatterplot_matrix_df, top_n):
    
    # Replace function to use with .apply()
    def replace_string(x):
        if x in top_n_country:
            return x
        else:
                return 'Autres'
            
    # Aggregate by country, genre, year
    temp_df = (raw_scatterplot_matrix_df
        .groupby(['pays', 'genre', 'anneeSortie'])
        .agg(nombreDeFilms=pd.NamedAgg(column='titreOriginal', aggfunc='count'))
        .reset_index()
        .sort_values(by=['pays', 'genre', 'anneeSortie'], ascending=True)
    )
    
    # Get count of movies for each country
    top_n_df = (temp_df
        .groupby('pays')
        .agg(nombreDeFilms=pd.NamedAgg(column='nombreDeFilms', aggfunc='count'))
        .reset_index()
        .sort_values(['nombreDeFilms'], ascending=[False])
    )
    
    # handles cases where there is more than n countries
    if len(top_n_df) > top_n:
        # create a list with top n countries
        top_n_country = top_n_df.head(top_n)['pays'].tolist()

        # replace non top n languages by "autres"
        temp_df['pays'] = temp_df['pays'].apply(lambda x: replace_string(x))

        # aggregate to have top n + autres languages
        temp_df = (temp_df
            .groupby(['pays', 'genre', 'anneeSortie'])
            .agg(nombreDeFilms=pd.NamedAgg(column='nombreDeFilms', aggfunc='sum'))
            .reset_index()
            .sort_values(['pays', 'genre', 'anneeSortie'], ascending=True)
        )
    # else: no need to do anything

    return temp_df

In [8]:
film_df = load_master_film_file()

In [51]:
raw_scatterplot_matrix_df = clean_film_df_for_scatterplot_matrix(film_df=film_df)
scatterplot_matrix_df = prepare_data_for_scatterplot_matrix(
    raw_scatterplot_matrix_df=raw_scatterplot_matrix_df,
    top_n=20
)

In [52]:
scatterplot_matrix_df.head(5)

Unnamed: 0,pays,genre,anneeSortie,nombreDeFilms
0,Allemagne,Action,1929,1
1,Allemagne,Action,1976,1
2,Allemagne,Action,1990,1
3,Allemagne,Action,1997,1
4,Allemagne,Action,1998,1


In [59]:
temp_df = scatterplot_matrix_df.set_index(keys=['pays', 'genre'])

In [60]:
temp_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,anneeSortie,nombreDeFilms
pays,genre,Unnamed: 2_level_1,Unnamed: 3_level_1
Allemagne,Action,1929,1
Allemagne,Action,1976,1
Allemagne,Action,1990,1
Allemagne,Action,1997,1
Allemagne,Action,1998,1


In [67]:
temp_df.loc[("Allemagne", "Action"), :]['anneeSortie'].values

array([1929, 1976, 1990, 1997, 1998, 2000, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017])

In [72]:
def get_list_of_genres(df):
    genres = list(set(df['genre'].values))
    return genres

In [73]:
def get_list_of_countries(df):
    pays = list(set(df['pays'].values))
    return pays

In [74]:
genres = get_list_of_genres(df=scatterplot_matrix_df)
countries = get_list_of_countries(df=scatterplot_matrix_df)

In [90]:
print("genres : ", genres)
print("countries : ", countries)

genres :  ['n.d.', 'Historique', 'Crime', 'Amour', 'Sf', 'Comédie', 'Animation', 'Musical', 'Autres', 'Documentaire', 'Action', 'Aventure', 'Drame']
countries :  ['Belgique', 'Hong-kong', 'Danemark', 'Italie', 'Canada', 'Australie', 'Allemagne', 'Autres', 'n.d.', 'États-unis', 'Espagne', 'Pays-bas', 'Japon', 'Suisse', "Allemagne, république fédérale d'", 'Pologne', 'Urss', 'Royaume-uni', 'Mexique', 'Suède', 'France']


In [125]:
type(len(genres))

int

In [98]:
def get_min_and_max_year(df):
    year_max = df['anneeSortie'].max()
    year_min = df['anneeSortie'].min()
    
    return year_min, year_max

In [None]:
create_scatterplot_fig()
raw_scatterplot_matrix_df = clean_film_df_for_scatterplot_matrix(film_df=film_df)
scatterplot_matrix_df = prepare_data_for_scatterplot_matrix(
    raw_scatterplot_matrix_df=raw_scatterplot_matrix_df,
    top_n=20
)

In [194]:
print(len(genres))
    
vertical_spacing=0.02
horizontal_spacing=0.02

shapes = []
height = 5000
width = 2500
#shape_x_increment = width / len(genres)
#shape_y_increment = height / len(countries)
shape_x_increment = 1 / len(genres)
shape_y_increment = 1 / len(countries)

#for country_idx, country in enumerate(countries):
for genre_idx, genre in enumerate(genres):

    #x0 = genre_idx * shape_x_increment - genre_idx * (vertical_spacing / 2)
    #x1 = (genre_idx + 1) * shape_x_increment - (genre_idx + 1) * (vertical_spacing / 2)
    x0 = genre_idx * shape_x_increment
    x1 = (genre_idx + 1) * shape_x_increment
    y0 = country_idx * shape_y_increment - country_idx * (horizontal_spacing / 2)
    y1 = (country_idx + 1) * shape_y_increment - (country_idx + 1) * (horizontal_spacing / 2)
    
    print("genre_idx, x0, x1 : ", genre_idx, x0, x1)
    #print("x1 : ", x1)
    #print("y0 : ", y0)
    #print("y1 : ", y1)

13
genre_idx, x0, x1 :  0 0.0 0.07692307692307693
genre_idx, x0, x1 :  1 0.07692307692307693 0.15384615384615385
genre_idx, x0, x1 :  2 0.15384615384615385 0.23076923076923078
genre_idx, x0, x1 :  3 0.23076923076923078 0.3076923076923077
genre_idx, x0, x1 :  4 0.3076923076923077 0.38461538461538464
genre_idx, x0, x1 :  5 0.38461538461538464 0.46153846153846156
genre_idx, x0, x1 :  6 0.46153846153846156 0.5384615384615385
genre_idx, x0, x1 :  7 0.5384615384615385 0.6153846153846154
genre_idx, x0, x1 :  8 0.6153846153846154 0.6923076923076923
genre_idx, x0, x1 :  9 0.6923076923076923 0.7692307692307693
genre_idx, x0, x1 :  10 0.7692307692307693 0.8461538461538463
genre_idx, x0, x1 :  11 0.8461538461538463 0.9230769230769231
genre_idx, x0, x1 :  12 0.9230769230769231 1.0


In [229]:
print(fig['layout']['xaxis7']['domain'])
print(fig['layout']['yaxis']['domain'])


(0.4707692307692308, 0.5292307692307693)
(0.9714285714285715, 1.0)


In [239]:
def create_scatterplot_fig(scatterplot_matrix_df):

    year_min, year_max = get_min_and_max_year(df=scatterplot_matrix_df)
    film_max = max(scatterplot_matrix_df['nombreDeFilms'])
    
    # Create titles
    subplot_titles = []
    for country in countries:
        for genre in genres:
            subplot_title = country + " - " + genre
            subplot_titles.append(subplot_title)

    vertical_spacing=0.015
    horizontal_spacing=0.015
    
    fig = make_subplots(
        rows=len(countries), 
        cols=len(genres),
        subplot_titles=subplot_titles,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=horizontal_spacing,
    )

    # Create components used to create subplot borders
    shapes = []
    height = 7000
    width = 3000
    #shape_x_increment = width / len(genres)
    #shape_y_increment = height / len(countries)
    shape_x_increment = 1 / len(genres)
    shape_y_increment = 1 / len(countries)

    for country_idx, country in enumerate(countries):
        for genre_idx, genre in enumerate(genres):

            temp_df = scatterplot_matrix_df[scatterplot_matrix_df['pays']==country]
            temp_df = temp_df[temp_df['genre']==genre]

            fig.add_trace(
                go.Scatter(
                    x=temp_df['anneeSortie'].values,
                    y=temp_df['nombreDeFilms'].values,
                    marker_color='blue',
                    mode='markers',
                ),
                row=country_idx+1, 
                col=genre_idx+1,
            )

            fig.update_xaxes(
                row=country_idx+1, 
                col=genre_idx+1,
                tickmode = 'array',
                tickvals= [2000],
                ticktext= [2000],
                tickangle = 0,
                ticks="outside",
                fixedrange=True,
                range=[year_min, year_max],
            )

            # Compute tickval
            tickval = (film_max - (film_max%2)) / 2
            fig.update_yaxes( 
                row=country_idx+1, 
                col=genre_idx+1,
                tickmode = 'array',
                tickvals= [0, tickval],
                ticktext= [0, tickval],
                tickangle = 0,
                ticks="inside",
                fixedrange=True,
                range=[0, film_max]
            )

            # Compute subplot border reference
            x0 = genre_idx * shape_x_increment + 0.005
            x1 = (genre_idx + 1) * shape_x_increment - 0.005
            y0 = country_idx * shape_y_increment + 0.005
            y1 = (country_idx + 1) * shape_y_increment - 0.005
            #print("x0 : ", x0)
            #print("x1 : ", x1)
            #print("y0 : ", y0)
            #print("y1 : ", y1)
            
            # Handling special case
            #if x0==0:
            #    x0 = -0.1
            #
            #if x1==1:
            #    x1=1.05

            # Create subplot border
            shape = go.layout.Shape(
                type="rect",
                xref="paper",
                yref="paper",
                x0=x0,
                y0=y0,
                x1=x1,
                y1=y1,
                line={'width': 1, 'color': 'black'},
            )

            # Add subplot border to the list
            shapes.append(shape)

    # Set the font for all subplots' title 
    for i in fig['layout']['annotations']:
        i['font'] = dict(size=10,color='#ff0000')

    # Set layout
    fig.update_layout(
        height=height, 
        width=width, 
        title_text="Side By Side Subplots",
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        showlegend=False,
        shapes=shapes,
        margin_l=40,
        margin_r=40,
        margin_t=40,
        margin_b=40,
    )
    
    return fig

In [240]:
fig = create_scatterplot_fig(scatterplot_matrix_df)

scatterplot_graph = dcc.Graph(
    figure=fig, 
    id='scatterplot-graph',
    config=dict(
        showTips=False,
        showAxisDragHandles=False,
        displayModeBar=False
    )
)

In [241]:
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
nom_de_la_page = "Notes sur la librairie Dash de Plotly"
app = JupyterDash(nom_de_la_page, external_stylesheets=external_stylesheets)
app.layout = html.Div([
    scatterplot_graph
])

app.run_server(mode='external', port=8060, dev_tools_ui=True, #debug=True, 
               dev_tools_hot_reload =True, threaded=True)

Dash app running on http://127.0.0.1:8060/


In [None]:
fig.update_layout( shapes=[
# unfilled rectange
go.layout.Shape(
type=“rect”,
xref=“paper”,
yref=“paper”,
x0=0,
y0=-0.1,
x1=1.05,
y1=1,
line={‘width’: 1, ‘color’: ‘black’}),
# line
go.layout.Shape(
type=“line”,
xref=“paper”,
yref=“paper”,
x0=0.5,
y0=-0.1,
x1=0.5,
y1=1,
line={‘width’: 1, ‘color’: ‘black’}), ])