# Data Visualization Exam

In [None]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np

## Data acquisition
CSV file creation from raw data and loading the CSV file into the program.

In [None]:
# Location of the raw CSV export, relative to the notebook
dataFolderName = "data"
fileName = f'{dataFolderName}/DCSC_RACLI_01092021113430630.csv'

# Read the whole file into a dataframe and preview its first rows
df = pd.read_csv(fileName)
df.head()

## Data parsing

In [None]:
# TODO: Check for errors in data
#       is the total equal to the arithmetic mean of the parts? it does not seem so
#       find missing data
#       check that value apprendista < operaio < dirigente (apprentice < worker < executive), per territory

# drop_duplicates() returns a new dataframe: the result must be assigned back,
# otherwise the duplicates stay in `df` and both counts below are identical.
print('\nBefore remove duplicates: ' + str(len(df)) + ' rows')
df = df.drop_duplicates()
print('After remove duplicates:  ' + str(len(df)) + ' rows')

# are values reasonable?
print('\nMin value is ' + str(df['Value'].min()))
print('Max value is ' + str(df['Value'].max()))

# TODO: Change type

# TODO: Choose the level for hierarchical data



In [None]:
# TODO: Transform the data
# Territory names use '/' without surrounding blanks
df['Territorio'] = df['Territorio'].str.replace(' / ','/')

# TODO: rename and translate df fields
# Distinct Italian sector names of the rows whose ATECO code compares between "A" and "Z"
# (the "TOTALE" aggregate is excluded)
it_sec_names = (
    df.query('`Ateco 2007`!="TOTALE" & `ATECO_2007`>="A" & `ATECO_2007`<="Z"')['Ateco 2007']
      .drop_duplicates()
      .reset_index(drop=True)
)

import googletrans #--->pip install googletrans==4.0.0-rc1
from googletrans import Translator,constants
translator = Translator()

# One English name per Italian name, in the same order
en_sec_names = [
    translator.translate(sector, src="it", dest="en").text
    for sector in it_sec_names
]

# Overwrite each Italian sector name with its translation
for it_name, en_name in zip(it_sec_names, en_sec_names):
    df.loc[df['Ateco 2007']==it_name,"Ateco 2007"] = en_name


## Data filtering

In [None]:
# Work on a copy so that `df` keeps every original column
df2 = df.copy()

df2 = df2.drop(
    [
        # columns holding one single value
        'TIPO_DATO7',   # always the same (HOUWAG_ENTEMP_AV_MI)
        'Tipo dato',    # always the same (Retribuzione lorda oraria per ora retribuita delle posizioni lavorative dipendenti in euro (media).)
        # redundant information
        'SEXISTAT1', 'ETA1_A', 'PROFILO_PROF', 'CLLVT', 'Seleziona periodo',
    ],
    axis=1,
)

# Rows flagged 'c' are incomplete data: discard them, then the flag columns are no longer needed
df2 = df2[df2['Flag Codes'] != 'c']
df2 = df2.drop(['Flags', 'Flag Codes'], axis=1)

df2.head()

## Data mining

In [None]:
# Sector-level data. Granularity of sectors exists only for entire Italy (no territorial
# granularity), hence the territory column is dropped together with the sector code.
df_sectors = (
    df2.query('`Ateco 2007`!="TOTALE"')                         # leave out the aggregate over all sectors
       .query('`ATECO_2007`>="A" & `ATECO_2007`<="Z"')          # chosen granularity of sectors
       .drop(['Territorio', 'ATECO_2007'], axis=1)
)

In [None]:
# Territory-level data: keep only the aggregate over all sectors, then drop the sector columns
df_territory = (
    df2.query('`Ateco 2007`=="TOTALE"')
       .drop(['Ateco 2007', 'ATECO_2007'], axis=1)
)

In [None]:
# TODO: start with univariate analysis (one variable at a time), continue with multivariate analysis

## Parameters for questions

In [None]:
import geopandas as gpd
import json
import numpy as np
import os
import pandas as pd
import plotly.express as px
import shutil as sh

# PARAMETERS                    # TODO : all common parameters should be declared at the beginning
dataFolderName = 'data'
geoJsonFolder = dataFolderName+'/geoJson/'
figureOutputFolder = 'exported_figures'
dataFileName = dataFolderName + '/DCSC_RACLI_01092021113430630.csv' # for data loading (salaries)
outputWidthImage = 10000
outputHeightImage = 7000

colors_palette = ['#003a2b','#249e89','#f5f5f5','#d86e58','#6a0000']

exportFigure = False    # set to true if you want to export the figure

# The output folder is wiped and recreated only when figures are going to be exported:
# doing it unconditionally would delete previously exported figures on every run.
if exportFigure:
    if os.path.exists(figureOutputFolder):
        sh.rmtree(figureOutputFolder)
    os.makedirs(f'{figureOutputFolder}/question 1')
    os.makedirs(f'{figureOutputFolder}/question 2')
    os.makedirs(f'{figureOutputFolder}/question 3')

## Question 1
In private companies, are salaries higher in northern Italy than in the south? (Where do people earn more? Possibly break the answer down by job level: executive/worker/apprentice.)

### Utility functions

In [None]:
# Utility functions to read data from CSV and shape files, remove useless columns from the dataframe and transform the data for the program

def loadDataFromCSV(forceUpdate=False):
    '''
    Build and return the loader of the salaries dataset.
    The returned function keeps the dataframe in a private cache, so the CSV file
    is parsed only on the first call (or whenever a reload is explicitly requested).
    Note: the forceUpdate parameter of this outer function is not used; the one that
    matters is the parameter of the returned function.
    '''

    cache = {}  # holds the loaded dataframe under key 0

    uselessColumns = [
        'TIPO_DATO7',   # always the same (HOUWAG_ENTEMP_AV_MI)
        'Tipo dato',    # always the same (Retribuzione lorda oraria per ora retribuita delle posizioni lavorative dipendenti in euro (media).)
        'SEXISTAT1', 'ETA1_A', 'PROFILO_PROF', 'CLLVT', 'Seleziona periodo',    # redundant information
    ]

    def closureFun(forceUpdate=False):
        '''
        Load data about salaries into the program and return a Pandas Dataframe.
        With forceUpdate=True the dataframe is read again from the file even if
        it was already loaded (useful when the file may have changed).
        '''
        if forceUpdate == True:
            cache.clear()

        # Already loaded: hand back the cached dataframe
        if len(cache) != 0:
            return cache[0]

        salaries = pd.read_csv(dataFileName).drop_duplicates()

        # Transform the data and remove useless columns
        salaries['Territorio'] = salaries['Territorio'].str.replace(' / ','/')
        salaries = salaries.drop(uselessColumns, axis=1)

        # Delete incomplete data ('c' is the flag for hidden data), then the flag columns themselves
        salaries = salaries[salaries['Flag Codes'] != 'c']
        salaries = salaries.drop(['Flags','Flag Codes'], axis=1)

        # Transform data for consistency with datasets of geocoords
        salaries.loc[salaries['Territorio']=="Forlì-Cesena", "Territorio"] = "Forli'-Cesena"

        cache[0] = salaries
        return cache[0]

    return closureFun

loadDataFromCSV = loadDataFromCSV() # from now on the name refers to the cached loader
    


def getDataAboutTerritory():
    '''
    Returns the salary rows that refer to territories: only the rows aggregated over
    all sectors ("TOTALE") are kept and the two sector columns are removed.
    '''
    allSalaries = loadDataFromCSV()
    territoryRows = allSalaries.query('`Ateco 2007`=="TOTALE"')
    return territoryRows.drop(['Ateco 2007', 'ATECO_2007'], axis=1)


def getDataAboutProvinces():
    '''
    Returns data about salaries in provinces (data about sectors, regions, entire Italy are excluded).
    The returned dataframe has the extra column "TerritorioAnno" (territory name followed by
    the year) and no longer has the "ITTER107" code column.
    '''
    df_territory = getDataAboutTerritory()

    # Note: this column is present also in geo-data and can be used to join the datasets
    df_territory["TerritorioAnno"] = df_territory["Territorio"] + df_territory['TIME'].astype(str)

    # For provinces the 'ITTER107' code is 5 chars long.
    # NOTE(review): the pattern matches any code containing at least 5 characters, so it
    # relies on no code being longer than that — confirm against the dataset.
    isProvince = df_territory['ITTER107'].str.contains(r'.{5}')
    return df_territory[isProvince].drop('ITTER107', axis=1)


def getDataAboutProvincesInDictHavingYearsAsKey(years=-1):
    '''
    Returns data about salaries in provinces (data about sectors, regions, entire Italy are excluded),
    organized in a dictionary having years (the parameters) as keys.
    Params: years, an iterable of years, e.g.: years=range(2014,2018).
    If the parameter years is not specified (or is None, or is -1), all the years found
    in the data are considered.
    '''
    dataProvinces = getDataAboutProvinces()

    # The "not specified" sentinel is tested only on scalars: comparing a NumPy array or a
    # Pandas Series with -1 yields an element-wise result, whose truth value is ambiguous.
    allYearsRequested = years is None or (np.isscalar(years) and years == -1)
    if allYearsRequested:
        years = dataProvinces['TIME'].drop_duplicates()

    return {year: dataProvinces.query(f'TIME=={year}').drop_duplicates() for year in years}


def getProvinceSalaryvalue(year=-1):        # TODO: take a list as input parameter
    '''
    Returns a Pandas Dataframe with the overall salary value of each province.
    Only the rows that are totals over sex, age class, contractual qualification and
    class of employees are kept, and those four columns are removed. Among the returned
    columns there are the province name ("Territorio"), the year ("TIME"), the salary
    value ("Value") and "SalaryCategory", a text label describing the salary range the
    value falls in, together with the number of provinces in that range for that year.
    Params: year, the year the returned data refer to; if it is not specified, the data
    of all the available years are returned.
    Rows are sorted by ascending year and, within the same year, by descending value.
    '''
    df_years = getDataAboutProvincesInDictHavingYearsAsKey([year]) if year!=-1 \
                                                             else getDataAboutProvincesInDictHavingYearsAsKey()

    years = sorted(df_years.keys())

    onlyTotalsQuery = "Sesso=='totale' & `Classe di età`=='totale' & `Qualifica contrattuale`=='totale' & `Classe di dipendenti`=='totale'"
    breakdownColumns = ['Sesso', 'Classe di età', 'Qualifica contrattuale', 'Classe di dipendenti']

    # NOTE: This part should be part of data transforming? But ranges should adapt to the context?
    salaryCategoryBorders = range(9,20,2)   # same category subdivision for all years
    lowestBorder = salaryCategoryBorders[0]
    highestBorder = salaryCategoryBorders[-1]

    categorizedFrames = []
    for thisYear in years:
        df = df_years[thisYear].query(onlyTotalsQuery).drop(breakdownColumns, axis=1)

        oldCategory = 0
        for category in salaryCategoryBorders:
            # Each category covers oldCategory <= Value < category; the one ending at the
            # highest border is open-ended, i.e. it takes every value from oldCategory upwards.
            inThisCategory = df['Value'] >= oldCategory
            if category < highestBorder:
                inThisCategory = inThisCategory & (df['Value'] < category)

            # Counted on the very rows that receive the label, so that the number shown
            # is right also for the open-ended category.
            numberProvinceInThisCategory = int(inThisCategory.sum())

            df.loc[inThisCategory, "SalaryCategory"] =                                          \
                (f"{oldCategory} ≤ " if oldCategory > lowestBorder else "        ")             \
                + ".."                                                                          \
                + (f" < {category}"  if category < highestBorder else "        ")               \
                + f"  €/hr\t({numberProvinceInThisCategory} provinces)"
            oldCategory = category

        categorizedFrames.append(df)

    df = pd.concat(categorizedFrames)

    # sort (needed to respect the range-scale in plots if categorization is used):
    # ascending wrt 'TIME' (oldest first), then descending wrt 'Value'
    return df.sort_values(by=['TIME', 'Value'], ascending=[True, False])


def avgSalary(territory='Italia', year=-1):
    '''
    Returns the average salary value in a given territory for a given year (parameters),
    rounded to two decimal digits.
    The default value for the territory is entire Italy.
    If the year is not specified, the average value is computed over all the years which
    are available.
    Only the rows that are totals over sex, age class, contractual qualification and
    class of employees are taken into account.
    If no row matches, the mean is not defined and NaN is returned.
    '''
    # `@name` lets the query read the local variables, so territory names containing
    # quotes (e.g. "Forli'-Cesena") need no escaping
    query = "Territorio==@territory & Sesso=='totale' & `Classe di età`=='totale' & `Qualifica contrattuale`=='totale' & `Classe di dipendenti`=='totale'"
    if year != -1:
        query += " & TIME==@year"

    meanValue = getDataAboutTerritory().query(query)['Value'].mean()
    return round(float(meanValue), 2)

# Utility functions for geo-data
def readGeoDataToDictHavingYearAsKey():
    '''
    Import Geo-data (coordinates) and returns the dictionary having as key
    the year (2014 to 2017) and as values the dataframe with geodata loaded from shape files.
    Province names in the column "DEN_PCM" are adjusted, and the column "TerritorioAnno"
    (province name followed by the year) is added to every dataframe.
    '''
    shapesFolder = f'{dataFolderName}/province_shapes'
    map_df = {} # dictionary, year as key

    map_df[2014] = gpd.read_file(f'{shapesFolder}/Prov01012014_g/Prov01012014_g_WGS84.shp')
    map_df[2014]['DEN_PCM'] = map_df[2014]['DEN_PROV']  # duplicate this column to make the dataframe compliant with those of subsequent years
    map_df[2014].loc[ map_df[2014].DEN_PCM=="Forlì-Cesena","DEN_PCM" ] = "Forli'-Cesena"

    # Data updated to 1st Jan 2017 work for our purposes for all the years from 2015 on:
    # the file is read and fixed once, then every year gets its own independent copy.
    shapes2017 = gpd.read_file(f'{shapesFolder}/ProvCM01012017_g/ProvCM01012017_g_WGS84.shp')
    shapes2017['DEN_PCM'] = shapes2017['DEN_PCM'].replace({
        "Aosta": "Valle d'Aosta/Vallée d'Aoste",
        "Massa Carrara": "Massa-Carrara",
        "Bolzano": "Bolzano/Bozen",
    })
    for year in range(2015,2018):
        map_df[year] = shapes2017.copy()

    # Note: territories coords change over the year, hence we save the year near the territory names
    for year, shapes in map_df.items():
        shapes["TerritorioAnno"] = shapes["DEN_PCM"] + str(year)

    return map_df


# Function to convert (project) coordinates to latitude/longitude
def convertCrsToLatLong(inputGeopandasDf, inplace=False):
    '''
    Convert the geo-coordinates of the input GeoPandas Dataframe to EPSG:4326 (latitude and longitude).
    By default the input is left untouched and a new GeoPandas dataframe having the data in
    the new coordinates system is returned.
    You can specify the parameter inplace=True if you want to change the coordinate system "inplace",
    i.e., directly in the input GeoPandas Dataframe; in that case the input dataframe itself
    is returned.
    '''
    if inplace:
        # The inplace variants of these methods return None, so the (modified) input
        # is what has to be handed back to the caller.
        inputGeopandasDf.set_geometry("geometry", inplace=True)
        inputGeopandasDf.to_crs("EPSG:4326", inplace=True)
        return inputGeopandasDf

    # The original geometry column is replaced with "geometry" (if it was different).
    outputGeopandasDf = inputGeopandasDf.set_geometry("geometry")
    return outputGeopandasDf.to_crs("EPSG:4326")
    

def createGeoJsonFromFile(geoJsonFolder, shapeDataDictYears, convertCrsToLatLongFlag=True):
    '''
    Creates GeoJson files in the folder whose path is specified as parameter as string,
    from the given dictionary having years as keys and the corresponding shape file data
    (GeoPandas dataframe) as values. The folder is created if it does not exist; each
    file is named after its key ("<year>.json").
    The parameter shapeDataDictYears can also be the shape file data directly, i.e. the
    value of only one record of a dictionary (the file is then named ".json").
    Specify the parameter convertCrsToLatLongFlag=False if you do NOT want to convert the
    geo-coordinate system to EPSG:4326; default is True.
    The input dictionary is not modified.
    Returns a dictionary having as key the years (the same as the input dictionary) and
    the corresponding GeoJson data as values; if the shape file data were given directly,
    the GeoJson data are returned directly too.
    '''
    os.makedirs(geoJsonFolder, exist_ok=True)

    isInputShapeDataAsDict = isinstance(shapeDataDictYears, dict)
    # A single dataframe is wrapped in a one-record dictionary to use the same code
    shapeDataByYear = shapeDataDictYears if isInputShapeDataAsDict else {'': shapeDataDictYears}

    geoJsonData = {}
    for year, shapeData in shapeDataByYear.items():
        if convertCrsToLatLongFlag:
            # Bound to a local name: the converted data must not replace the caller's data
            shapeData = convertCrsToLatLong(shapeData)

        geoJsonPathThisYear = os.path.join(geoJsonFolder, f'{year}.json')
        shapeData.to_file(geoJsonPathThisYear, driver="GeoJSON")

        # Read back what was just written, to get the GeoJson as plain Python objects
        with open(geoJsonPathThisYear, encoding="utf-8") as geofile:
            geoJsonData[year] = json.load(geofile)

    return geoJsonData if isInputShapeDataAsDict else geoJsonData['']


def loadDataMultipleYears(provinceNames=None, years=None):
    '''
    Returns the GeoJson data and the dataframe of provinces (only with territories, economic sectors
    excluded). The two results (geoJsonData, df_province) have to be unpacked.
    This function can be used to rapidly load both geo-data and data about salaries in provinces, over
    all the years (province granularity only).
    If the parameter provinceNames is specified, only data about the desired provinces will be loaded
    (a list is expected).
    If the parameter years (a list is expected) is specified, only data about selected years will be
    returned; otherwise all the years for which geo-data are available are used.
    '''
    # None stands for "no filter" (avoids mutable default arguments)
    provinceNames = [] if provinceNames is None else provinceNames
    years = [] if years is None else years

    # Read geo-data
    map_df = readGeoDataToDictHavingYearAsKey() # dictionary, year as key

    # Load data about salaries for each province
    df_province = getProvinceSalaryvalue()

    if len(years) > 0:   # filter according to years
        df_province = df_province[df_province['TIME'].isin(list(years))]
        map_df = {year: map_df[year] for year in years}
    else:
        years = map_df.keys()

    if len(provinceNames) > 0:   # filter according to province names
        # isin() compares the values directly, so names containing quotes are handled too
        wantedProvinces = list(provinceNames)
        df_province = df_province[df_province['Territorio'].isin(wantedProvinces)]
        map_df = {year: map_df[year][map_df[year]['DEN_PCM'].isin(wantedProvinces)] for year in years}

    # Create GeoJson from SHP dataframe (union over years of shp files); each year is
    # converted to latitude/longitude before the union
    allYearsShapes = pd.concat(tuple(convertCrsToLatLong(map_df[year]) for year in years))
    geoJsonData = createGeoJsonFromFile(geoJsonFolder, allYearsShapes)

    return geoJsonData, df_province

### Plot maps

In [None]:

# For each year: load both geodata and data about salaries, report the provinces with the
# highest and the lowest salary value, then draw (and optionally export) the choropleth map

for year in range(2014,2018):
    geoJsonData, df_province = loadDataMultipleYears(years=[year])

    print(f"\n\nYear: {year}")

    # Rows whose value equals the maximum / the minimum of this year
    maxSalary = max(df_province['Value'])
    minSalary = min(df_province['Value'])
    best_province  = df_province.query(f"Value=={maxSalary}")
    worst_province = df_province.query(f"Value=={minSalary}")

    severalBest = len(best_province)>1
    severalWorst = len(worst_province)>1
    if severalBest or severalWorst:
        print("WARNING: query returned more than one result, only the first result is showed")

    bestSuffix = 's' if severalBest else ''
    worstSuffix = 's' if severalWorst else ''
    print(f"\tBest province{bestSuffix}:\t{str(best_province.to_dict('records'))}")
    print(f"\tWorst province{worstSuffix}:\t{str(worst_province.to_dict('records'))}")

    # Choropleth by categories

    hoverColumns = {'Value':True, 'SalaryCategory':False, 'Territorio': False}          # TODO: improve this (see "hovertemplate")
    fig = px.choropleth(
        data_frame=df_province,
        geojson=geoJsonData,
        locations='Territorio',              # name of dataframe column
        featureidkey='properties.DEN_PCM',   # path to field in GeoJSON feature object with which to match the values passed in to locations
        color='SalaryCategory',
        color_discrete_sequence=colors_palette,      # for discrete scale of colors
        center={"lat": 42, "lon": 13},
        projection='mercator',
        labels={'SalaryCategory': 'Average hourly gross salary'},
        hover_name='Territorio',
        hover_data=hoverColumns,
    )

    # Thin black border around every area       # TODO: look for "hovertemplate, https://plotly.com/python/reference/choropleth/#choropleth-hovertemplate"
    borderStyle = dict(color='black', width=0.1)
    fig.update_traces(marker=dict(opacity=1, line=borderStyle))

    noMargins = {"r":0,"t":0,"l":0,"b":0}
    fig.update_layout(
        plot_bgcolor='white',
        font=dict(color='dimgray'),
        title='Salaries in private companies',
        margin=noMargins,
        legend_itemsizing='trace'               # Determines if the legend items symbols scale with their corresponding "trace" attributes or remain "constant" independent of the symbol size on the graph. # TODO: NOT working
    )
    fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
    fig.show("notebook")

    # Export the figure
    if exportFigure:
        exportPathNoExtension = f"{figureOutputFolder}/question 1/geoMap{year}"
        fig.write_image(exportPathNoExtension + ".svg")
        fig.write_image(exportPathNoExtension + ".png", width=outputWidthImage, height=outputHeightImage)

## Question 2
Do women earn less than men in private companies in Italy? Where is the difference largest?

In [None]:
# Salary values by sex for entire Italy.
# The other breakdown dimensions are pinned to their total, as done for the other
# questions, so that the rows are not mixed with those of a single age class,
# contractual qualification or class of employees.
df_sex = df_territory.query(
    'Sesso!="totale" & Territorio=="Italia"'
    ' & `Classe di età`=="totale" & `Qualifica contrattuale`=="totale" & `Classe di dipendenti`=="totale"'
)[['Sesso','TIME','Value']]

### Plot line chart

In [None]:
labels = ['Male','Female','Gap']
colors = ['#5b8592','#cf6651','#171717']
first_year = df_sex.TIME.min()
last_year = df_sex.TIME.max()
n_years = last_year - first_year + 1

# Same x values (the years) for each of the three lines
x_year = np.arange(first_year,last_year+1)
x_data = np.vstack((x_year,)*3)

# sort_values() returns a new dataframe: it has to be assigned back, otherwise the
# values below are not guaranteed to be in chronological order
df_sex = df_sex.sort_values(by='TIME')
df_mal = df_sex.query('Sesso=="maschi"')['Value'].to_list()
df_fem = df_sex.query('Sesso=="femmine"')['Value'].to_list()

# Gap: difference between male and female value of the same year.
# Male/Female lines: change of the value with respect to the previous year (0 for the first year).
gap = [round(df_mal[i] - df_fem[i],2) for i in range(n_years)]
y_mal = [0] + [round(df_mal[i] - df_mal[i-1],2) for i in range(1,n_years)]
y_fem = [0] + [round(df_fem[i] - df_fem[i-1],2) for i in range(1,n_years)]
y_data = np.array([y_mal,y_fem,gap])

fig = go.Figure()

annotations = []
labelFont = dict(family="Bahnschrift",size=16)

for i in range(0, len(labels)):
    fig.add_trace(go.Scatter(x=x_data[i], y=y_data[i], mode='lines',
        name=labels[i], line=dict(color=colors[i]), connectgaps=True ))

    # Points with their values; the values of the second line are written above the
    # points, the others below
    fig.add_trace(go.Scatter(x=x_data[i], y=y_data[i],
        mode='markers+text', marker=dict(color=colors[i]),
        text=y_data[i] , textposition="top center" if i==1 else "bottom center"))

    # Name of lines, placed near the second point of each line (the first line uses a
    # different vertical position)
    label_y = y_data[i,1]+0.1 if i != 0 else -y_data[i,1]+0.15
    annotations.append(dict(text=labels[i],showarrow=False,
        xref='x', x=x_data[i,1]-0.1, y=label_y, xanchor='right', yanchor='middle',
        font=dict(color=colors[i], **labelFont)))

fig.update_layout(annotations=annotations)

fig.update_layout(
    xaxis_title="year",
    yaxis_title="€/h",
    xaxis=dict(showline=True, showticklabels=True, ticks='outside',
        linecolor='rgb(204, 204, 204)', linewidth=2, dtick = 1),
    yaxis=dict(showline=True, showticklabels=True, ticks='outside',
        linecolor='rgb(204, 204, 204)', linewidth=2, dtick = 1),
    showlegend=False,
    plot_bgcolor='white',
    font=dict(family="Bahnschrift",size=10,color="grey"),
    width=800, height=500
)

fig.show()

# Export the figure
# The chart spans all the years, so the file name carries the range of years
# (no `year` variable is defined in this cell)
if exportFigure:
    fig.write_image(f"{figureOutputFolder}/question 2/genderGapLine{first_year}-{last_year}.svg")
    fig.write_image(f"{figureOutputFolder}/question 2/genderGapLine{first_year}-{last_year}.png", width=outputWidthImage, height=outputHeightImage)

## Question 3
What are the sectors for which the salaries in private companies are highest in Italy?

In [None]:
import plotly.graph_objects as go

In [None]:
# Sector rows that are totals over sex, age class, class of employees and contractual
# qualification; only sector name, year and value are kept.
# The query is built from adjacent string literals (a backslash continuation inside a
# literal would embed the indentation in the query), and an explicit copy is taken
# because the sector names of this dataframe are modified later on.
df_sectors_tot = df_sectors.query(
    'Sesso=="totale" & `Classe di età`=="totale" & '
    '`Classe di dipendenti`=="totale" & `Qualifica contrattuale`=="totale"'
)[['Ateco 2007','TIME','Value']].copy()

### Plot horizontal bar chart for sectors

In [None]:
# Sector names as they appear in the data ...
long_names = [
    'Supply of electricity, gas, steam and air conditioning',
    'Activities of accommodation and catering services',
    'Other services activities',
    # NOTE(review): this pair looks inverted with respect to the others (here the
    # "<br>" is in the name to be searched, not in its replacement) — confirm
    'Financial and insurance<br> activities',
    'Rental, travel agencies, business support services'
    ]
# ... and, position by position, the name that replaces them ("<br>" is a line break in the plots)
br_names = [
    'Supply of electricity, gas,<br> steam and air conditioning',
    'Activities of accommodation<br> and catering services',
    'Other services activities',
    'Financial and insurance activities',
    'Rental, travel agencies,<br> business support services'
    ]
for long_name, br_name in zip(long_names, br_names):
    hasThisName = df_sectors_tot['Ateco 2007']==long_name
    df_sectors_tot.loc[hasThisName,"Ateco 2007"] = br_name

In [None]:
# One horizontal bar chart per year showing the `howManyEls` sectors with the lowest
# value, the `howManyEls` sectors with the highest value and, between them, a single
# "Others" bar with the average value of all the remaining sectors.
howManyEls=2
val_x_axis = max(df_sectors_tot['Value'])   # same x-axis range for all the years
framesPerYear = []

for year in range(2014,2018,1):
    tmp = df_sectors_tot.query(f'TIME=={year}').sort_values(by='Value')

    # Sectors that are neither in the first nor in the last `howManyEls` positions
    middleValues = tmp.head(-howManyEls).tail(-howManyEls)["Value"]
    others = {'Ateco 2007':['Others'],'TIME':[year],'Value':[round(np.average(middleValues),2)]}
    tmp_others = pd.DataFrame(others,columns=['Ateco 2007','TIME','Value'])

    # pd.concat is used because DataFrame.append is no longer available in pandas 2
    df_year = pd.concat([tmp.head(howManyEls), tmp_others, tmp.tail(howManyEls)])
    framesPerYear.append(df_year)

    fig = px.bar(df_year, x="Value", y="Ateco 2007", text="Value")

    fig.update_traces(texttemplate='%{text:.2f} ', textposition='inside')

    fig.update_traces(marker_color=colors_palette[4])

    fig.update_layout(
        title_text=f'{year}',
        yaxis_title=None,
        xaxis_title="€/h",
        xaxis=dict(showline=True, showticklabels=True, ticks='outside',
            linecolor='rgb(204, 204, 204)', linewidth=2, dtick = 5,
            range = [0, val_x_axis]),
        yaxis=dict( showgrid=False, showline=False, ),
        paper_bgcolor='white',
        plot_bgcolor='white',
        showlegend=False,
        width=800, height=350
        )

    fig.show()

    # Export the figure
    if exportFigure:
        fig.write_image(f"{figureOutputFolder}/question 3/barChartSectors{year}.svg")
        fig.write_image(f"{figureOutputFolder}/question 3/barChartSectors{year}.png", width=outputWidthImage, height=outputHeightImage)

# Rows of all the years together
df_new = pd.concat(framesPerYear)

### Plot with slider

In [None]:
# Animated horizontal bar chart (one frame per year) showing the `howManyEls` sectors
# with the lowest value, the `howManyEls` sectors with the highest value and a single
# "Others" bar with the average value of all the remaining sectors.
howManyEls=3
framesPerYear = []

for year in range(2014,2018,1):
    tmp = df_sectors_tot.query(f'TIME=={year}').sort_values(by='Value')

    # Sectors that are neither in the first nor in the last `howManyEls` positions
    middleValues = tmp.head(-howManyEls).tail(-howManyEls)["Value"]
    others = {'Ateco 2007':['Others'],'TIME':[year],'Value':[round(np.average(middleValues),2)]}
    tmp_others = pd.DataFrame(others,columns=['Ateco 2007','TIME','Value'])

    framesPerYear.extend([tmp.head(howManyEls), tmp_others, tmp.tail(howManyEls)])

# pd.concat is used because DataFrame.append is no longer available in pandas 2
df_new = pd.concat(framesPerYear)
df_new = df_new.sort_values(by='Value').reset_index()

fig = px.bar(df_new, x="Value", y="Ateco 2007", text="Value",
    animation_frame="TIME", range_x=[0,df_new['Value'].max()*1.1])

fig.update_traces(texttemplate='%{text:.2f} ', textposition='inside')

fig.update_layout(
    xaxis=dict( showgrid=False, showline=False ),
    yaxis=dict( showgrid=False, showline=False, ),
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
)

# Duration of each frame of the animation started by the first button of the menu
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1000

fig.show("notebook")