# Équipe 7
## Notebook de Jean-Christophe

In [1]:
import datetime
import json
import os

import numpy as np
import pandas as pd

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from jupyter_dash import JupyterDash

import plotly.express as px
import plotly.graph_objects as go

## Treemap 

### Functions

In [2]:
def clean_year_column(df):
    """Keep only the rows that carry a usable release year.

    Rows with a missing 'annee' are dropped, the column is cast to int,
    and rows dated after the current calendar year are removed.
    The input frame is left untouched; a new DataFrame is returned.
    """
    current_year = datetime.datetime.now().year
    with_year = df.dropna(subset=['annee']).astype({'annee': 'int'})
    not_in_future = with_year['annee'] <= current_year
    return with_year[not_in_future]

In [3]:
def clean_genre_column(df):
    """Drop rows without a genre and normalise the genre's letter case.

    The 'genre' column is cast to str and every value is passed through
    str.capitalize (first letter upper-case, the rest lower-case).
    Returns a new DataFrame.
    """
    with_genre = df.dropna(subset=['genre']).astype({'genre': 'str'})
    with_genre['genre'] = with_genre['genre'].map(str.capitalize)
    return with_genre

In [4]:
def clean_title_column(df):
    """Drop rows without a title and normalise the title's letter case.

    The 'titre' column is cast to str and every value is passed through
    str.capitalize (first letter upper-case, the rest lower-case).
    Returns a new DataFrame.
    """
    titled = df.dropna(subset=['titre'])
    titled = titled.astype({'titre': 'str'})
    capitalized_titles = titled['titre'].map(str.capitalize)
    titled['titre'] = capitalized_titles
    return titled

In [5]:
def _read_treemap_csv(data_dir, file_name):
    """Read one comma-separated data file located in `data_dir`."""
    return pd.read_csv(
        filepath_or_buffer=os.path.join(data_dir, file_name),
        sep=',',
        header=0,
        index_col=None,
        parse_dates=True,
    )


def _left_merge_on(left, right, key):
    """Left-join `right` onto `left` on the shared column `key`, sorted by that key."""
    return pd.merge(
        left=left,
        right=right,
        how="left",
        left_on=key,
        right_on=key,
        sort=True,
        # With both suffixes empty, pandas refuses to merge frames that share
        # a non-key column instead of silently renaming them.
        suffixes=("", ""),
    )


def load_data_for_treemap(data_dir="../Src/Assets/Data/"):
    """Load and join the film CSV files into a single DataFrame.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding `_film_genres.csv`, `_film_pays.csv`,
        `_film_langue.csv` and `pays_vsParContinent_vsFrancais.csv`.
        Defaults to the project's relative data folder.

    Returns
    -------
    pandas.DataFrame
        Columns 'annee', 'continent', 'pays', 'capitale', 'langue', 'id',
        'titre' and 'genre' (renamed from the source column names).

    Raises
    ------
    FileNotFoundError
        If one of the CSV files is missing (raised by pandas.read_csv).
    KeyError
        If an expected column is absent after the joins.
    """
    df_film_genres = _read_treemap_csv(data_dir, "_film_genres.csv")
    df_film_pays = _read_treemap_csv(data_dir, "_film_pays.csv")
    df_film_langues = _read_treemap_csv(data_dir, "_film_langue.csv")
    df_pays = _read_treemap_csv(data_dir, "pays_vsParContinent_vsFrancais.csv")

    # Country details first, then the per-film language and genre tables
    merged = _left_merge_on(df_film_pays, df_pays, key="pays")
    merged = _left_merge_on(merged, df_film_langues, key="filmoId")
    merged = _left_merge_on(merged, df_film_genres, key="filmoId")

    merged = merged[[
        "anneeSortie", "continent", "pays", "capitale",
        "langue", "filmoId", "titreOriginal", "genres_categorized",
    ]]
    treemap_df = merged.rename(columns={
        "anneeSortie": "annee",
        "filmoId": "id",
        "titreOriginal": "titre",
        "genres_categorized": "genre",
    })

    return treemap_df

In [6]:
def get_min_and_max_year(df):
    """Return the (earliest, latest) values found in the 'annee' column."""
    years = df['annee']
    return years.min(), years.max()

In [7]:
def get_list_of_genres(df):
    """Return the distinct genres of `df` as an alphabetically sorted list.

    Sorting makes the order reproducible between runs (iterating a set of
    strings gives an arbitrary order), so the dropdown built from this
    list is stable. Missing values are skipped.
    """
    genres = sorted(df['genre'].dropna().unique().tolist())

    return genres

In [8]:
def add_constant_col(df, col_name, const_name):
    """Return a copy of `df` with column `col_name` set to `const_name` on every row.

    The original frame is not modified.
    """
    with_constant = df.copy()
    with_constant[col_name] = const_name
    return with_constant

In [9]:
def query_data_for_treemap(df, genre, from_year, to_year):
    """Count films per (planete, continent, pays) for a genre and a period.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'genre', 'annee', 'planete', 'continent'
        and 'pays'.
    genre : str or None
        Genre to keep. "All" (or None) keeps every genre.
    from_year, to_year : int
        Inclusive bounds applied to the 'annee' column.

    Returns
    -------
    pandas.DataFrame
        One row per (planete, continent, pays) group with the film count in
        'nombreDeFilms'.
    """
    # Boolean masks rather than string-built query expressions: a genre
    # containing an apostrophe (e.g. "Film d'art") would break the parser.
    if genre is None or genre == "All":
        films = df
    else:
        films = df[df['genre'] == genre]

    # Keep only the films released within the inclusive period
    in_period = (films['annee'] >= from_year) & (films['annee'] <= to_year)
    films = films[in_period]

    # Aggregate across years
    counts = (films
        .groupby(['planete', 'continent', 'pays'])
        .agg(nombreDeFilms=pd.NamedAgg(column='genre', aggfunc='count'))
        .reset_index()
    )
    return counts

In [10]:
def create_treemap_dropdown_menu(genres):
    """Build the genre dropdown: an "All" entry followed by one entry per genre.

    "All" is the initially selected value.
    """
    options = [{'label': "All", 'value': "All"}]
    options.extend({'label': genre, 'value': genre} for genre in genres)

    return dcc.Dropdown(
        id='treemap-dropdown-menu',
        options=options,
        value='All',
    )

In [11]:
def create_treemap_title(year_min_displayed, year_max_displayed, genre_displayed):
    """Build the treemap heading for the displayed period and genre.

    Returns an html.Header wrapping an H2 whose id is 'treemap-title'.
    The wording differs depending on whether `genre_displayed` is "All".
    """
    if genre_displayed == "All":
        opening = 'Nombre de films de tous genres produits entre les années '
    else:
        opening = 'Nombre de films de genre ' + genre_displayed + ' produit entre les années '

    period = str(year_min_displayed) + ' et ' + str(year_max_displayed) + '.'

    heading = html.H2(id='treemap-title', children=opening + period)
    return html.Header(children=[heading])

In [12]:
def create_treemap_range_slider(year_min, year_max):
    """Build the year range slider with one labelled mark per decade.

    Parameters
    ----------
    year_min, year_max : int
        Bounds of the slider; both handles start at these bounds.

    Returns
    -------
    dcc.RangeSlider
        Component with id 'treemap-range-slider'.
    """
    # First multiple of 10 that is >= year_min. The previous formula
    # (10 - year_min % 10 + year_min) skipped the first mark whenever
    # year_min was itself a multiple of 10 (1900 became 1910).
    first_decade = year_min + (-year_min) % 10
    # Last multiple of 10 that is <= year_max
    last_decade = year_max - (year_max % 10)
    years = np.arange(start=first_decade, stop=last_decade+1, step=10)

    rs = dcc.RangeSlider(
        id='treemap-range-slider',
        min=year_min,
        max=year_max,
        value=[year_min, year_max],
        pushable=False,
        allowCross=True,
        dots=False,
        updatemode='mouseup',
        marks={str(year): str(year) for year in years},
    )

    return rs

In [13]:
def init_treemap_fig(df, year_min, year_max):
    """Build the initial treemap figure: every genre, over the given period."""
    hierarchy = ['planete', 'continent', 'pays']

    # Film counts for all genres between year_min and year_max
    counts = query_data_for_treemap(df, 'All', year_min, year_max)

    return px.treemap(counts, path=hierarchy, values='nombreDeFilms')

In [14]:
def callback_treemap(df, slider_min_year, slider_max_year, ddm_genre):
    """Recompute the treemap title and figure for the current selection.

    Returns a (title component, treemap figure) tuple.
    """
    # Heading reflecting the selected period and genre
    new_title = create_treemap_title(slider_min_year, slider_max_year, ddm_genre)

    # Film counts matching the selection
    counts = query_data_for_treemap(df, ddm_genre, slider_min_year, slider_max_year)

    # Treemap drawn from those counts
    new_fig = px.treemap(
        counts,
        path=['planete', 'continent', 'pays'],
        values='nombreDeFilms',
    )

    return new_title, new_fig

In [15]:
def create_treemap_graph(treemap_fig):
    """Wrap the treemap figure in a dcc.Graph (id 'treemap-graph').

    Tips, axis drag handles and the mode bar are all switched off.
    """
    graph_config = {
        'showTips': False,
        'showAxisDragHandles': False,
        'displayModeBar': False,
    }
    return dcc.Graph(figure=treemap_fig, id='treemap-graph', config=graph_config)

In [16]:
def create_treemap(treemap_title, treemap_rs, treemap_ddm, treemap_graph):
    """Assemble the treemap section: title, dropdown, graph, then range slider."""
    section_children = [treemap_title, treemap_ddm, treemap_graph, treemap_rs]
    return html.Div(children=section_children)

## Table
### Test - With real data

In [25]:
# Slider ticks: one mark per integer from 1 to max_top
max_top = 10
top_n = np.arange(start=1, stop=max_top+1, step=1)

# Single-handle slider (a RangeSlider given one value), so the callback
# receives a one-element list
rs = dcc.RangeSlider(
    id='bar-range-slider',
    min=1,
    max=max_top,
    value=[5],
    pushable=False,
    allowCross=False,
    dots=False,
    updatemode='mouseup',
    marks={str(i): str(i) for i in top_n},
)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
nom_de_la_page = "Notes sur la librairie Dash de Plotly"
app = JupyterDash(nom_de_la_page, external_stylesheets=external_stylesheets)
app.layout = html.Div([
    rs,
    html.Div(id='output-range-slider')
])

# The Input id must match the slider's id ('bar-range-slider'); the previous
# 'range-slider' id did not exist in the layout, so the callback never fired.
@app.callback(
    Output('output-range-slider', 'children'),
    [Input('bar-range-slider', 'value')])
def update_output(value):
    """Echo the slider's current value in the output Div."""
    return 'You have selected "{}"'.format(value)

app.run_server(mode='inline', port=8030)

### Functions

In [17]:
def round_decimals(df, n):
    """Return a copy of `df` rounded to `n` decimal places."""
    return df.round(n)

In [18]:
def get_list_of_languages(df):
    """Return the distinct languages of `df` as an alphabetically sorted list.

    Missing values are skipped, and sorting makes the order reproducible
    between runs (iterating a set of strings gives an arbitrary order).
    """
    languages = sorted(df['langue'].dropna().unique().tolist())

    return languages

In [19]:
def drop_duplicated_movies(df):
    """Keep the first row of every film, ignoring the 'genre' column.

    Two rows count as the same film when they agree on year, continent,
    country, capital, language, id and title.
    """
    movie_key = ['annee', 'continent', 'pays', 'capitale', 'langue', 'id', 'titre']
    return df.drop_duplicates(keep='first', subset=movie_key)

In [20]:
def clean_language_column(df):
    """Capitalise every string in 'langue'; non-string values are left as they are.

    Works on a copy, so the input frame is not modified.
    """
    def _capitalize_if_text(value):
        # Non-string entries (e.g. NaN) pass through unchanged
        if isinstance(value, str):
            return str.capitalize(value)
        return value

    cleaned = df.copy()
    cleaned['langue'] = cleaned['langue'].apply(_capitalize_if_text)
    return cleaned

In [131]:
def query_data_for_table(df, genre, from_year, to_year, top_n):
    """Compute the language distribution and the oldest/latest films of a selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'genre', 'annee' and 'langue'.
    genre : str or None
        Genre to keep. "All" (or None) keeps every genre.
    from_year, to_year : int
        Inclusive bounds applied to the 'annee' column.
    top_n : int
        Number of languages listed individually; the remaining ones are
        summed into a single 'autres' row.

    Returns
    -------
    tuple of pandas.DataFrame
        (top_n_df, oldest_df, latest_df). top_n_df has the columns 'langue',
        'nombreDeFilms' and 'pourcentageDeFilms' (rounded to 2 decimals),
        sorted by decreasing count. oldest_df and latest_df hold the selected
        rows whose year is the minimum / maximum of the selection. All three
        are empty when no film matches.
    """
    # Boolean masks rather than string-built query expressions: a genre
    # containing an apostrophe would break the parser, and an empty selection
    # would produce "annee==nan", which query cannot evaluate.
    if genre is None or genre == "All":
        films = df
    else:
        films = df[df['genre'] == genre]

    # Keep only the films released within the inclusive period
    in_period = (films['annee'] >= from_year) & (films['annee'] <= to_year)
    films = films[in_period]

    # Films released in the first and last year of the selection
    oldest_df = films[films['annee'] == films['annee'].min()]
    latest_df = films[films['annee'] == films['annee'].max()]

    # Number of films per language, most frequent first
    top_n_df = (films
        .groupby('langue')
        .agg(nombreDeFilms=pd.NamedAgg(column='annee', aggfunc='count'))
        .reset_index()
        .sort_values(['nombreDeFilms'], ascending=[False])
    )

    # When there are more than top_n languages, fold the tail into "autres"
    if len(top_n_df) > top_n:
        top_n_languages = top_n_df.head(top_n)['langue'].tolist()

        is_top = top_n_df['langue'].isin(top_n_languages)
        top_n_df['langue'] = top_n_df['langue'].where(is_top, 'autres')

        # Re-aggregate so that "autres" becomes a single row
        top_n_df = (top_n_df
            .groupby(['langue'])
            .agg(nombreDeFilms=pd.NamedAgg(column='nombreDeFilms', aggfunc='sum'))
            .reset_index()
            .sort_values(['nombreDeFilms'], ascending=[False])
        )

    # Share of each language, in percent of the counted films
    n_films = top_n_df['nombreDeFilms'].sum()
    top_n_df['pourcentageDeFilms'] = 100 * top_n_df['nombreDeFilms'] / n_films

    top_n_df = top_n_df.round(decimals=2)

    return top_n_df, oldest_df, latest_df

In [121]:
def create_table_title(year_min_displayed, year_max_displayed, genre_displayed, top_n):
    """Build the H4 heading (id 'table-title') of the language table.

    The wording differs depending on whether `genre_displayed` is "All".
    """
    opening = 'Top ' + str(top_n)
    if genre_displayed == "All":
        opening += ': Distribution des langues dans les films de tous genres produits entre les années '
    else:
        opening += (
            ': Distribution des langues dans les films de genre ' +
            genre_displayed +
            ' produit entre les années '
        )

    period = str(year_min_displayed) + ' et ' + str(year_max_displayed) + '.'

    return html.H4(id='table-title', children=opening + period)

In [122]:
## Create the "top N" slider
def create_table_slider(initial_value, max_top=10):
    """Build the slider that selects how many languages the table lists.

    Parameters
    ----------
    initial_value : int
        Value selected when the slider is first shown.
    max_top : int, optional
        Largest selectable value (default 10). The slider runs from 1 to
        `max_top` with one mark per integer.

    Returns
    -------
    dcc.RangeSlider
        Component with id 'table-slider'. It is given a single value, so
        callbacks receive a one-element list.
    """
    # One mark per selectable value
    top_n = np.arange(start=1, stop=max_top+1, step=1)

    slider = dcc.RangeSlider(
        id='table-slider',
        min=1,
        max=max_top,
        value=[initial_value],
        pushable=False,
        allowCross=False,
        dots=False,
        updatemode='mouseup',
        marks={str(i): str(i) for i in top_n},
    )

    return slider

In [123]:
def create_table_top_n_fig(df, genre, from_year, to_year, top_n):
    """Build the table figure listing the top-n languages of the selection.

    Columns: language, number of films, and share in percent.
    """
    # Only the language distribution is needed here
    top_n_df, _, _ = query_data_for_table(
        df=df,
        genre=genre,
        from_year=from_year,
        to_year=to_year,
        top_n=top_n
    )

    share_labels = top_n_df['pourcentageDeFilms'].astype(str) + " %"

    language_table = go.Table(
        header=dict(
            values=["Langue", "Nombre de films", "Proportion"],
            fill_color='paleturquoise',
            align='left',
        ),
        cells=dict(
            values=[top_n_df['langue'], top_n_df['nombreDeFilms'], share_labels],
            fill_color='lavender',
            align='left',
        ),
    )

    return go.Figure(data=[language_table])

In [145]:
def create_table_latest_fig(df, genre, from_year, to_year, top_n):
    """Build the table figure listing films from the latest year of the selection.

    At most five films are shown (DataFrame.head), and films whose language
    is missing or marked "n.d." are left out.
    """
    top_n_df, oldest_df, latest_df = query_data_for_table(
        df=df,
        genre=genre,
        from_year=from_year,
        to_year=to_year,
        top_n=top_n
    )

    # Filter out missing / "n.d." languages. The comparison ignores case
    # because clean_language_column capitalizes the column, turning "n.d."
    # into "N.d.", which an exact match on "n.d." would let through.
    langue = latest_df['langue']
    language_unknown = langue.isna() | langue.astype(str).str.lower().eq('n.d.')
    latest_df = latest_df[~language_unknown].head()

    fig = go.Figure(
        data=[
            go.Table(
                header_values=["Titre", "Année de sortie", "Langue"],
                header_fill_color='paleturquoise',
                header_align='left',
                cells_values=[latest_df.titre, latest_df.annee, latest_df.langue],
                cells_fill_color='lavender',
                cells_align='left',
            )
        ]
    )

    return fig

In [146]:
def create_table_oldest_fig(df, genre, from_year, to_year, top_n):
    """Build the table figure listing films from the oldest year of the selection.

    At most five films are shown (DataFrame.head), and films whose language
    is missing or marked "n.d." are left out.
    """
    top_n_df, oldest_df, latest_df = query_data_for_table(
        df=df,
        genre=genre,
        from_year=from_year,
        to_year=to_year,
        top_n=top_n
    )

    # Filter out missing / "n.d." languages. The comparison ignores case
    # because clean_language_column capitalizes the column, turning "n.d."
    # into "N.d.", which an exact match on "n.d." would let through.
    langue = oldest_df['langue']
    language_unknown = langue.isna() | langue.astype(str).str.lower().eq('n.d.')
    oldest_df = oldest_df[~language_unknown].head()

    fig = go.Figure(
        data=[
            go.Table(
                header_values=["Titre", "Année de sortie", "Langue"],
                header_fill_color='paleturquoise',
                header_align='left',
                cells_values=[oldest_df.titre, oldest_df.annee, oldest_df.langue],
                cells_fill_color='lavender',
                cells_align='left',
            )
        ]
    )

    return fig

In [147]:
def create_table_top_n_graph(table_top_n_fig):
    """Wrap the top-n language table figure in a dcc.Graph (id 'table-top-n-graph').

    Tips, axis drag handles and the mode bar are all switched off.
    """
    graph_config = {
        'showTips': False,
        'showAxisDragHandles': False,
        'displayModeBar': False,
    }
    return dcc.Graph(figure=table_top_n_fig, id='table-top-n-graph', config=graph_config)

In [148]:
def create_table_latest_graph(table_latest_fig):
    """Wrap the latest-films table figure in a dcc.Graph (id 'table-latest-graph').

    Tips, axis drag handles and the mode bar are all switched off.
    """
    graph_config = {
        'showTips': False,
        'showAxisDragHandles': False,
        'displayModeBar': False,
    }
    return dcc.Graph(figure=table_latest_fig, id='table-latest-graph', config=graph_config)

In [149]:
def create_table_oldest_graph(table_oldest_fig):
    """Wrap the oldest-films table figure in a dcc.Graph (id 'table-oldest-graph').

    Tips, axis drag handles and the mode bar are all switched off.
    """
    graph_config = {
        'showTips': False,
        'showAxisDragHandles': False,
        'displayModeBar': False,
    }
    return dcc.Graph(figure=table_oldest_fig, id='table-oldest-graph', config=graph_config)

In [150]:
def callback_table(df, slider_min_year, slider_max_year, ddm_genre, top_n):
    """Recompute the table title and its three figures for the current selection.

    Returns, in this order: the title component, the top-n languages figure,
    the latest-films figure, and the oldest-films figure.
    """
    # Heading reflecting the selected period, genre and top-n size
    table_title = create_table_title(
        year_min_displayed=slider_min_year,
        year_max_displayed=slider_max_year,
        genre_displayed=ddm_genre,
        top_n=top_n
    )

    # The three figure builders take the same selection arguments
    selection = dict(
        df=df,
        genre=ddm_genre,
        from_year=slider_min_year,
        to_year=slider_max_year,
        top_n=top_n,
    )
    table_top_n_fig = create_table_top_n_fig(**selection)
    table_latest_fig = create_table_latest_fig(**selection)
    table_oldest_fig = create_table_oldest_fig(**selection)

    return table_title, table_top_n_fig, table_latest_fig, table_oldest_fig

In [151]:
def create_table(table_title, table_top_n_graph, table_oldest_graph, table_latest_graph, table_slider):
    """Assemble the table section: title, top-n table, oldest films, latest films, slider."""
    section_children = [
        table_title,
        table_top_n_graph,
        table_oldest_graph,
        table_latest_graph,
        table_slider,
    ]
    return html.Div(children=section_children)

In [152]:
# Load the merged film data and normalise the columns used by both views
my_df = load_data_for_treemap()
my_df = clean_year_column(my_df)
my_df = clean_genre_column(my_df)
my_df = clean_title_column(my_df)
my_df = clean_language_column(my_df)
my_df = add_constant_col(my_df, "planete", "Terre")
# NOTE(review): drop_duplicated_movies is defined above but never applied here.
# Presumably a film with several genres appears on several rows and is counted
# more than once when the genre is "All" — confirm against the data.
year_min, year_max = get_min_and_max_year(df=my_df)
genres = get_list_of_genres(df=my_df)

# Treemap components, initialised on every genre and the full period
treemap_fig = init_treemap_fig(df=my_df, year_min=year_min, year_max=year_max)

treemap_title = create_treemap_title(
    year_min_displayed=year_min,
    year_max_displayed=year_max,
    genre_displayed="All"
)
treemap_graph = create_treemap_graph(treemap_fig=treemap_fig)
treemap_ddm = create_treemap_dropdown_menu(genres=genres)
treemap_rs = create_treemap_range_slider(year_min=year_min, year_max=year_max)

# Table components, initialised on the same selection with a top 5
top_n = 5
table_title = create_table_title(
    year_min_displayed=year_min,
    year_max_displayed=year_max,
    genre_displayed="All",
    top_n=top_n
)
table_slider = create_table_slider(initial_value=top_n)
table_top_n_fig = create_table_top_n_fig(df=my_df, genre="All", from_year=year_min, to_year=year_max, top_n=top_n)
table_latest_fig = create_table_latest_fig(df=my_df, genre="All", from_year=year_min, to_year=year_max, top_n=top_n)
table_oldest_fig = create_table_oldest_fig(df=my_df, genre="All", from_year=year_min, to_year=year_max, top_n=top_n)
table_top_n_graph = create_table_top_n_graph(table_top_n_fig=table_top_n_fig)
table_latest_graph = create_table_latest_graph(table_latest_fig=table_latest_fig)
table_oldest_graph = create_table_oldest_graph(table_oldest_fig=table_oldest_fig)


external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
nom_de_la_page = "Projet INF8808 - Cinemathèque québécoise"
app = JupyterDash(nom_de_la_page, external_stylesheets=external_stylesheets)
app.layout = html.Div([
    create_treemap(
        treemap_title=treemap_title,
        treemap_rs=treemap_rs,
        treemap_ddm=treemap_ddm,
        treemap_graph=treemap_graph
    ),
    create_table(
        table_title=table_title,
        table_top_n_graph=table_top_n_graph,
        table_oldest_graph=table_oldest_graph,
        table_latest_graph=table_latest_graph,
        table_slider=table_slider
    )
])
@app.callback([Output('treemap-title', 'children'),
               Output('treemap-graph', 'figure'),
               Output('table-title', 'children'),
               Output('table-top-n-graph', 'figure'),
               Output('table-latest-graph', 'figure'),
               Output('table-oldest-graph', 'figure')],
              [Input('treemap-dropdown-menu', 'value'),
               Input('treemap-range-slider', 'value'),
               Input('table-slider', 'value')])
def update_treemap(ddm_value, rs_value, s_value):
    """Refresh the treemap and the three tables when a control changes.

    ddm_value is the selected genre, rs_value the [from_year, to_year] pair
    of the range slider, and s_value the one-element list holding the top-n
    size. The returned tuple follows the order of the Outputs above.
    """
    # Clearing the dropdown sends None; fall back to every genre instead of
    # letting the title builders fail on `str + None`.
    if ddm_value is None:
        ddm_value = "All"

    treemap_title, treemap_fig = callback_treemap(
        df=my_df,
        slider_min_year=rs_value[0],
        slider_max_year=rs_value[1],
        ddm_genre=ddm_value
    )

    # callback_table returns the latest-films figure before the oldest-films one
    table_title, table_top_n_fig, table_latest_fig, table_oldest_fig = callback_table(
        df=my_df,
        slider_min_year=rs_value[0],
        slider_max_year=rs_value[1],
        ddm_genre=ddm_value,
        top_n=s_value[0]
    )

    return treemap_title, treemap_fig, table_title, table_top_n_fig, table_latest_fig, table_oldest_fig

#app.run_server(mode='inline', port=8010)
app.run_server(mode='jupyterlab', port = 8020, dev_tools_ui=True, #debug=True,
               dev_tools_hot_reload =True, threaded=True)

### Extra tools

In [40]:
# `fig` only exists as a local inside the helper functions, so a fresh kernel
# raises NameError here; show the treemap figure built in the app cell instead.
treemap_fig.show()

In [None]:
# `fig` only exists as a local inside the helper functions, so a fresh kernel
# raises NameError here; dump the fully-resolved treemap figure instead.
treemap_fig.full_figure_for_development(warn=False).show('json')