# Équipe 7
## Notebook de Jean-Christophe

In [1]:
import numpy as np
import pandas as pd
import datetime

## Load data

### Sunburst

In [6]:
def load_genre_hierarchy():
    """Read ``genres_hierarchy.csv`` and return it as a tidy DataFrame.

    The file is split on both double quotes and semicolons. Because the
    separator is a regular expression, the python parsing engine is used.
    The first and the last parsed columns are discarded and every missing
    value is replaced by an empty string.

    Returns:
        pandas.DataFrame: The parsed hierarchy without its two edge columns.
    """
    data_dir = "../Src/Assets/Data/"
    csv_name = "genres_hierarchy.csv"

    hierarchy = pd.read_csv(
        data_dir + csv_name,
        sep=r'[";]',
        engine='python',
        header=0,
    )

    # NOTE(review): the edge columns are presumably empty artefacts of the
    # quote/semicolon split - confirm against the raw file.
    edge_columns = hierarchy.columns[[0, -1]]
    hierarchy = hierarchy.drop(columns=edge_columns)

    return hierarchy.fillna("")

### Treemap 

In [7]:
def load_film_genres():
    """Load ``_film_genres.csv`` from the data folder.

    Returns:
        pandas.DataFrame: The file content, with the first row used as the
        header and a default integer index.
    """
    data_dir = "../Src/Assets/Data/"
    csv_name = "_film_genres.csv"

    read_options = {
        "sep": ',',
        "header": 0,
        "index_col": None,
        # With parse_dates=True pandas only tries to parse the index.
        "parse_dates": True,
    }

    return pd.read_csv(data_dir + csv_name, **read_options)

In [8]:
def load_film_countries():
    """Load ``_film_pays.csv`` from the data folder.

    Returns:
        pandas.DataFrame: The file content, with the first row used as the
        header and a default integer index.
    """
    data_dir = "../Src/Assets/Data/"
    csv_name = "_film_pays.csv"

    read_options = {
        "sep": ',',
        "header": 0,
        "index_col": None,
        # With parse_dates=True pandas only tries to parse the index.
        "parse_dates": True,
    }

    return pd.read_csv(data_dir + csv_name, **read_options)

In [9]:
def load_film_languages():
    """Load ``_film_langue.csv`` from the data folder.

    Returns:
        pandas.DataFrame: The file content, with the first row used as the
        header and a default integer index.
    """
    data_dir = "../Src/Assets/Data/"
    csv_name = "_film_langue.csv"

    read_options = {
        "sep": ',',
        "header": 0,
        "index_col": None,
        # With parse_dates=True pandas only tries to parse the index.
        "parse_dates": True,
    }

    return pd.read_csv(data_dir + csv_name, **read_options)

In [10]:
def load_capital_by_countries_by_continent():
    """Load ``pays_vsParContinent_vsFrancais.csv`` from the data folder.

    Returns:
        pandas.DataFrame: The file content, with the first row used as the
        header and a default integer index.
    """
    data_dir = "../Src/Assets/Data/"
    csv_name = "pays_vsParContinent_vsFrancais.csv"

    read_options = {
        "sep": ',',
        "header": 0,
        "index_col": None,
        # With parse_dates=True pandas only tries to parse the index.
        "parse_dates": True,
    }

    return pd.read_csv(data_dir + csv_name, **read_options)

In [11]:
def merge_on_countries(df1, df2):
    """Left-join ``df2`` onto ``df1`` using the ``pays`` column of both.

    Args:
        df1: Left DataFrame; every one of its rows is kept.
        df2: Right DataFrame providing the extra columns.

    Returns:
        pandas.DataFrame: The merged frame, ordered by the join key.
    """
    join_options = {
        "how": "left",
        "left_on": "pays",
        "right_on": "pays",
        "sort": True,
        # Empty suffixes: overlapping non-key columns are not renamed.
        "suffixes": ("", ""),
    }

    return pd.merge(df1, df2, **join_options)

In [12]:
def merge_on_film_id(df1, df2):
    """Left-join ``df2`` onto ``df1`` using the ``filmoId`` column of both.

    The function merges the two frames it receives as arguments; it does
    not rely on any notebook-level variable.

    Args:
        df1: Left DataFrame; every one of its rows is kept.
        df2: Right DataFrame providing the extra columns.

    Returns:
        pandas.DataFrame: The merged frame, ordered by the join key.
    """
    temp_df = pd.merge(
        left=df1,
        right=df2,
        how="left",
        left_on="filmoId",
        right_on="filmoId",
        sort=True,
        # Empty suffixes: overlapping non-key columns are not renamed.
        suffixes=("", ""),
    )

    return temp_df

## Clean data

In [None]:
def clean_common_df(df):
    """Keep the columns needed by the visualisations and rename them.

    Args:
        df: DataFrame holding at least the columns ``anneeSortie``,
            ``continent``, ``pays``, ``capitale``, ``langue``, ``filmoId``,
            ``titreOriginal`` and ``genres_categorized``.

    Returns:
        pandas.DataFrame: A frame restricted to those columns, in a stable
        order, where ``anneeSortie``, ``filmoId``, ``titreOriginal`` and
        ``genres_categorized`` are renamed to ``annee``, ``id``, ``titre``
        and ``genre``.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    # Specify the columns relevant for each viz - duplicates are fine here
    col_for_sunburst = []
    col_for_treemap = [
        "anneeSortie", "continent", "pays", "capitale", "langue",
        "filmoId", "titreOriginal", "genres_categorized",
    ]
    # Source names of the columns used to de-duplicate movies for the table
    col_for_table = [
        "anneeSortie", "continent", "pays", "capitale", "langue",
        "filmoId", "titreOriginal",
    ]

    # dict.fromkeys drops the duplicates while keeping first-seen order,
    # so the resulting column order is the same on every run.
    cols = list(dict.fromkeys(col_for_sunburst + col_for_treemap + col_for_table))

    # Mapping from the source column names to the short names
    new_col_names = {
        "anneeSortie": "annee",
        "filmoId": "id",
        "titreOriginal": "titre",
        "genres_categorized": "genre",
    }

    # Select the relevant columns, then rename them
    return df[cols].rename(columns=new_col_names)

In [None]:
def capitalize_first_letter_first_word(df, col_name):
    """Return a copy of ``df`` with ``str.capitalize`` applied to a column.

    Each value gets its first character upper-cased and the remaining
    characters lower-cased. The input frame is left untouched.

    Args:
        df: Source DataFrame.
        col_name: Name of the column to transform; it must hold strings.

    Returns:
        pandas.DataFrame: The modified copy.
    """
    result = df.copy()
    capitalized = result[col_name].map(str.capitalize)
    result[col_name] = capitalized

    return result

In [None]:
def capitalize_first_letter_all_words(df, col_name):
    """Return a copy of ``df`` with ``str.title`` applied to a column.

    Every word of each value is title-cased. The input frame is left
    untouched.

    Args:
        df: Source DataFrame.
        col_name: Name of the column to transform; it must hold strings.

    Returns:
        pandas.DataFrame: The modified copy.
    """
    result = df.copy()
    title_cased = result[col_name].map(str.title)
    result[col_name] = title_cased

    return result

### Sunburst

In [5]:
def clean_genre_hierarchy(df):
    """Title-case the ``subgenre_0`` and ``subgenre_1`` columns.

    Args:
        df: Genre hierarchy DataFrame containing both sub-genre columns
            with string values.

    Returns:
        pandas.DataFrame: A cleaned copy; the input frame is not modified.
    """
    temp_df = df
    for col_name in ('subgenre_0', 'subgenre_1'):
        temp_df = capitalize_first_letter_all_words(df=temp_df, col_name=col_name)

    # Hand the cleaned frame back to the caller
    return temp_df

### Treemap

In [2]:
def clean_year_column(df):
    """Drop movies without a year or with a year in the future.

    Args:
        df: DataFrame with an ``annee`` column.

    Returns:
        tuple: ``(cleaned_df, count_dropped_no_year,
        count_filtered_year_greater_now)`` where ``cleaned_df`` has
        ``annee`` cast to int, the first count is the number of rows
        removed for a missing year and the second the number of rows whose
        year is later than the current year.
    """
    rows_at_start = len(df)

    # Movies without any release year cannot be used
    with_year = df.dropna(subset=['annee'])
    rows_with_year = len(with_year)

    # Years are compared as integers
    with_year = with_year.astype({'annee': 'int'})

    # Keep only the movies released up to the current year
    current_year = datetime.datetime.now().year
    not_in_future = with_year['annee'] <= current_year
    kept = with_year[not_in_future]

    missing_year_count = rows_at_start - rows_with_year
    future_year_count = rows_with_year - len(kept)

    return kept, missing_year_count, future_year_count

In [3]:
def clean_genre_column(df):
    """Drop movies without a genre and normalise the genre text.

    Args:
        df: DataFrame with a ``genre`` column.

    Returns:
        tuple: ``(cleaned_df, count_dropped_no_genre)`` where ``genre`` is
        cast to str and passed through
        ``capitalize_first_letter_first_word``, and the count is the number
        of rows removed for a missing genre.
    """
    rows_at_start = len(df)

    # Movies without any genre cannot be used
    with_genre = df.dropna(subset=['genre'])
    missing_genre_count = rows_at_start - len(with_genre)

    # Make sure every genre is a string before capitalizing it
    with_genre = with_genre.astype({'genre': 'str'})
    with_genre = capitalize_first_letter_first_word(df=with_genre, col_name='genre')

    return with_genre, missing_genre_count

In [4]:
def clean_title_column(df):
    """Drop movies without a title and normalise the title text.

    Args:
        df: DataFrame with a ``titre`` column.

    Returns:
        tuple: ``(cleaned_df, count_dropped_no_title)`` where ``titre`` is
        cast to str and passed through
        ``capitalize_first_letter_first_word``, and the count is the number
        of rows removed for a missing title.
    """
    # Capture the length before dropping
    initial_length = len(df)

    # Drop movies for which no title is available
    temp_df = df.dropna(subset=['titre'])

    # Capture the length after dropping
    after_drop_length = len(temp_df)

    # Cast titles to type str
    temp_df = temp_df.astype({'titre': 'str'})

    # Capitalize the first letter of the titles (the remaining characters
    # are lower-cased by str.capitalize)
    temp_df = capitalize_first_letter_first_word(df=temp_df, col_name='titre')

    # Compute length deltas
    count_dropped_no_title = initial_length - after_drop_length

    # Return the cleaned frame built above
    return temp_df, count_dropped_no_title

In [8]:
def add_constant_column(df, col_name, const_name):
    """Return a copy of ``df`` with one column set to a single value.

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Name of the column to create (or overwrite).
        const_name: Value assigned to that column.

    Returns:
        pandas.DataFrame: The copy holding the extra column.
    """
    with_constant = df.copy()
    with_constant[col_name] = const_name
    return with_constant

### Table

In [21]:
def clean_language_column(df):
    """Return a copy of ``df`` whose ``langue`` strings are capitalized.

    Values that are not strings (missing values for instance) are kept
    as they are.

    Args:
        df: DataFrame with a ``langue`` column; it is not modified.

    Returns:
        pandas.DataFrame: The modified copy.
    """
    def _capitalize_if_text(value):
        # Only strings can be capitalized; anything else passes through
        if isinstance(value, str):
            return str.capitalize(value)
        return value

    result = df.copy()
    result['langue'] = result['langue'].map(_capitalize_if_text)

    return result

In [18]:
def round_decimals(df, n):
    """Round ``df`` to ``n`` decimal places.

    Args:
        df: DataFrame to round.
        n: Number of decimals, forwarded to ``DataFrame.round``.

    Returns:
        pandas.DataFrame: The rounded frame.
    """
    return df.round(decimals=n)

In [20]:
def drop_duplicated_movies(df):
    """Remove rows that describe the same movie more than once.

    Two rows are duplicates when they share the same ``annee``,
    ``continent``, ``pays``, ``capitale``, ``langue``, ``id`` and
    ``titre``; only the first occurrence is kept.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        pandas.DataFrame: The de-duplicated frame.
    """
    movie_key = [
        'annee',
        'continent',
        'pays',
        'capitale',
        'langue',
        'id',
        'titre',
    ]

    return df.drop_duplicates(subset=movie_key, keep='first')

In [2]:
def replace_non_top_languages(x, top_languages=None, other_label='autres'):
    """Return ``x`` when it is a top language, a catch-all label otherwise.

    Args:
        x: Language value to check.
        top_languages: Collection of languages to keep as they are. When
            omitted, the module-level ``top_n_languages`` is used; that
            variable must then be defined before the call.
        other_label: Value returned for any language outside the
            collection. Defaults to ``'autres'``.

    Returns:
        ``x`` itself or ``other_label``.
    """
    if top_languages is None:
        # Backward-compatible path: rely on the notebook-level variable
        top_languages = top_n_languages

    return x if x in top_languages else other_label

## Extract

### Treemap

In [6]:
def get_min_and_max_year(df):
    """Return the smallest and the largest value of the ``annee`` column.

    Args:
        df: DataFrame with an ``annee`` column.

    Returns:
        tuple: ``(year_min, year_max)``.
    """
    years = df['annee']
    return years.min(), years.max()

In [7]:
def get_list_of_genres(df):
    """Return the distinct values of the ``genre`` column as a list.

    The order of the returned list is not defined because the values go
    through a set.

    Args:
        df: DataFrame with a ``genre`` column.

    Returns:
        list: One entry per distinct genre.
    """
    distinct_genres = {*df['genre'].values}
    return list(distinct_genres)

### Table

In [19]:
def get_list_of_languages(df):
    """Return the distinct values of the ``langue`` column as a list.

    The order of the returned list is not defined because the values go
    through a set.

    Args:
        df: DataFrame with a ``langue`` column.

    Returns:
        list: One entry per distinct language.
    """
    distinct_languages = {*df['langue'].values}
    return list(distinct_languages)