# 00 - Initialization

## Module installation

In [None]:
# Install modules
! pip install plotly
! pip install matplotlib
! pip install --upgrade nbformat
! pip install kaleido
! pip install geopandas # works on Linux, not on Windows

## Parameters

In [None]:
import geopandas as gpd
import json
import numpy as np
import os
import pandas as pd
import plotly.express as px
import re
import sys
from shapely.wkt import loads
import shutil as sh


import warnings
# warnings.filterwarnings('always')


# PARAMETERS
# Root folder holding the input data (salary CSV and province shape files).
dataFolderName = 'data'
# Folder where the generated GeoJson files are written (note the trailing '/').
geoJsonFolder = dataFolderName+'/geoJson/'
# Root folder for exported figures; it is only used when exportFigure is True.
figureOutputFolder = 'exported_figures/Test_geoMap'

# Five-colour palette; the bar charts in this notebook apply it in reversed order.
colors_palette = ['#003a2b','#249e89','#f5f5f5','#d86e58','#6a0000']

dataFileName = dataFolderName + '/DCSC_RACLI_01092021113430630.csv' # for data loading (salaries)

exportFigure = False    # Set to True to export figures

# TODO : define number of categories and corresponding colors (discrete colormap) to use in maps
# TODO : parametrize properties of figures

## Utility functions

### First part of functions

In [None]:
# Utility functions to read data from the CSV file and remove useless columns from the dataframe

def loadDataFromCSV(forceUpdate=False):
    '''
    Factory of the cached loader for the salary dataset.

    Calling this function returns a closure. The closure reads the CSV file
    at dataFileName the first time it is invoked, cleans the data and keeps
    the resulting Pandas Dataframe in a private cache; later invocations
    return the cached dataframe.
    Invoke the closure with forceUpdate=True to discard the cache and read the
    file again (to be used when the data on the file may have changed).
    Note: the forceUpdate parameter of this outer factory is not used.
    '''

    cache = {}  # once loaded, the cleaned dataframe is stored under key 0

    def closureFun(forceUpdate=False):

        if forceUpdate==True:
            cache.clear()   # forget the previously loaded dataframe

        if len(cache) != 0:
            return cache[0]

        rawData = pd.read_csv(dataFileName)
        df = rawData.drop_duplicates()

        # Normalise the separator used in bilingual territory names
        df['Territorio'] = df['Territorio'].str.replace(' / ','/')

        # Remove constant columns ('TIPO_DATO7' is always HOUWAG_ENTEMP_AV_MI and
        # 'Tipo dato' is its textual description) and redundant coded columns
        uselessColumns = ['TIPO_DATO7', 'Tipo dato',
                          'SEXISTAT1', 'ETA1_A', 'PROFILO_PROF', 'CLLVT', 'Seleziona periodo']
        df = df.drop(uselessColumns, axis=1)

        # Discard incomplete data ('c' is the flag for hidden data), then the flag columns
        isVisible = df['Flag Codes'] != 'c'
        df = df[isVisible]
        df = df.drop(['Flags','Flag Codes'], axis=1)

        # Rename for consistency with the datasets of geo-coordinates
        isForliCesena = df['Territorio']=="Forlì-Cesena"
        df.loc[isForliCesena, "Territorio"] = "Forli'-Cesena"

        cache[0] = df
        return cache[0]

    return closureFun

loadDataFromCSV = loadDataFromCSV() # replace the factory with the closure it builds
    


def getDataAboutTerritory():
    '''
    Returns the salary data that refer to territories as a whole, i.e. the
    rows whose 'Ateco 2007' value is "TOTALE" (data split by economic sector
    are excluded). Both Ateco columns are dropped from the result.
    '''
    allData = loadDataFromCSV()
    allSectorsTogether = allData['Ateco 2007'] == "TOTALE"
    territoryRows = allData[allSectorsTogether]
    return territoryRows.drop(['Ateco 2007', 'ATECO_2007'], axis=1)


def getDataAboutProvinces():
    '''
    Returns data about salaries in provinces (data about sectors, regions, entire Italy are excluded).

    A "TerritorioAnno" column (territory name followed by the year) is added:
    the same column is present in the geo-data and can be used to join the
    datasets. The 'ITTER107' code column is dropped from the result.
    '''
    df_territory = getDataAboutTerritory()

    # Note: this column is present also in geo-data and can be used to join the datasets
    df_territory["TerritorioAnno"] = df_territory["Territorio"] + df_territory['TIME'].astype(str)

    # For provinces the 'ITTER107' code is 5 chars long. The pattern matches any
    # code with at least 5 characters; rows without a code are never provinces
    # (na=False keeps the boolean filter valid in that case).
    isProvince = df_territory['ITTER107'].str.contains('.{5}', na=False)
    return df_territory[isProvince].drop('ITTER107', axis=1)


def getDataAboutProvincesInDictHavingYearsAsKey(years=-1):
    '''
    Returns data about salaries in provinces (data about sectors, regions, entire Italy are excluded),
    organized in a dictionary having years (the parameters) as keys.
    Params: years, any iterable of years, e.g.: years=range(2014,2018).
    If the parameter years is not specified (or is -1 or None), all the years
    found in the data are considered.
    '''
    dataProvinces = getDataAboutProvinces()

    # Compare with the -1 sentinel only when a scalar is given: an element-wise
    # comparison of a Pandas Series or NumPy array has no single truth value.
    useAllYears = years is None or (np.isscalar(years) and years == -1)
    if useAllYears:
        years = dataProvinces['TIME'].drop_duplicates()

    return {year: dataProvinces.query(f'TIME=={year}').drop_duplicates() for year in years}


def getProvinceSalaryvalue(year=-1):        # TODO: take a list as input parameter
    '''
    Returns a Pandas Dataframe with the salary value of each province.

    Only the rows where sex, age class, contractual qualification and class of
    employees are all 'totale' are kept, and those four columns are dropped.
    Among the remaining columns there are the province name ("Territorio"),
    the year ("TIME") and the salary value ("Value"). A "SalaryCategory"
    column is added: it holds a text label describing the salary range the
    value falls in, followed by the number of provinces counted for that range.
    If the year parameter is specified, only data of that year are returned;
    otherwise data of all the available years are returned.
    Rows are sorted by year (oldest first) and, within a year, by descending
    salary value.
    '''
    # -1 is the "all years" sentinel; a single year is wrapped in a list
    df_years = getDataAboutProvincesInDictHavingYearsAsKey([year]) if year!=-1 \
                                                             else getDataAboutProvincesInDictHavingYearsAsKey()
    
    years = sorted(df_years.keys())

    # Keep only the overall totals and drop the columns used for the filter
    df_years = {year: df_years[year].query("Sesso=='totale' & `Classe di età`=='totale' & `Qualifica contrattuale`=='totale' & `Classe di dipendenti`=='totale'")   \
                                    .drop(['Sesso', 'Classe di età', 'Qualifica contrattuale', 'Classe di dipendenti'], axis=1)                                     \
                for year in years}

    # Categorization of Salary values (grouping in categories):
    # for each year, count how many provinces have each integer part of the salary
    valueCountedData = {year: np.floor(df_years[year]["Value"]).astype(int).value_counts() for year in years}

    # NOTE: This part should be part of data transforming? But ranges should adapt to the context?

    salaryCategoryBorders = range(9,20,2)   # same category subdivion for all years
    for year in years:
        oldCategory=0   # lower border of the current category
        df = df_years[year]
        for category in salaryCategoryBorders:
            # Provinces whose floored salary is in [oldCategory, category)
            # NOTE(review): for the last border the label below is open-ended, but this
            # count only covers [oldCategory, category) - confirm that no value reaches the last border
            numberProvinceInThisCategory = sum([valueCountedData[year][key] for key in np.intersect1d(valueCountedData[year].keys().tolist(), range(oldCategory,category))])
            # Label the rows in [oldCategory, category). Rows at or above the last border
            # are matched at every iteration, so they end up with the label of the last category.
            # The lower bound is omitted for the first category and the upper bound for the last one.
            df.loc[(oldCategory<=df['Value']) & ( (df['Value']<category) | (df['Value']>=salaryCategoryBorders[-1]) ), "SalaryCategory"] =                      \
                (f"{oldCategory} ≤ " if oldCategory >= salaryCategoryBorders[0] else "        ")                                                                 \
                + ".."                                                                                                                                          \
                + (f" < {category}"  if category < salaryCategoryBorders[-1] else "        ")                                                                   \
                + f"  €/hr\t({numberProvinceInThisCategory} provinces)"
            oldCategory = category
        
        # sort (needed to respect the range-scale in plots if categorization is used)
        df.sort_values(by=['Value'], ascending=True, inplace=True)
        
        df_years[year] = df

    # Union of the dataframes of all the years
    df = pd.concat(tuple(df_years[year] for year in years))

    # sort (needed to respect the range-scale in plots if categorization is used)
    #   Sort (first) ascending wrt 'TIME' (oldest first) then descending wrt 'Value'
    df['Value'] = -df['Value']  # invert sign, so 'Value' can be sorted descending
    df.sort_values(by=['TIME', 'Value'], ascending=True, inplace=True)
    df['Value'] = -df['Value']  # restore the correct sign
    
    return  df


def categorization(salaryCategoryBorders = range(9,20,2)):
    '''
    Returns a Pandas Dataframe with three columns: one for the year ('Year'),
    the second for the salary category ('Gross salary  [€/hr]', a text label)
    and the third for the corresponding number of provinces ('#Provinces')
    where people earn as much as declared in the category.
    You can specify the borders of the categories as parameter (integers in
    increasing order). The first category has no lower bound and the last one
    has no upper bound: values at or above the last border are counted in it.
    Rows are ordered by year and, within a year, by increasing category.
    '''

    # TODO : REFACTORING (code duplication with getProvinceSalaryvalue)

    columnNames = ('Year', 'Gross salary  [€/hr]', '#Provinces')
    borders = list(salaryCategoryBorders)

    df_years = getDataAboutProvincesInDictHavingYearsAsKey()
    onlyTotals = "Sesso=='totale' & `Classe di età`=='totale' & `Qualifica contrattuale`=='totale' & `Classe di dipendenti`=='totale'"

    # Rows are collected in a list and the dataframe is built once at the end
    # (DataFrame.append is not available any more in recent versions of Pandas).
    rows = []
    for year in sorted(df_years.keys()):
        # Integer part of the salary of each province (overall totals only)
        flooredValues = np.floor(df_years[year].query(onlyTotals)["Value"])

        oldCategory = 0     # lower border of the current category
        for category in borders:
            isLastCategory = category >= borders[-1]

            inCategory = flooredValues >= oldCategory
            if not isLastCategory:  # the last category is open-ended
                inCategory = inCategory & (flooredValues < category)

            categoryStr = (f"{oldCategory} ≤ " if oldCategory >= borders[0] else "        ") + ".."    \
                            + ("        " if isLastCategory else f" < {category}")

            rows.append({columnNames[0]: year,
                         columnNames[1]: categoryStr,
                         columnNames[2]: int(inCategory.sum())})
            oldCategory = category

    # Years are visited in increasing order, hence rows are already sorted by year
    return pd.DataFrame(rows, columns=list(columnNames))

   
def avgSalary(territory='Italia', year=-1):
    '''
    Returns the average salary value in a given territory for a given year (parameters),
    rounded to two decimal digits.
    The default value for the territory is entire Italy.
    If the year is not specified, the average value is computed over all the years which
    are available.
    Only the rows where sex, age class, contractual qualification and class of
    employees are all 'totale' are considered.
    If no data match the given territory and year, NaN is returned.
    '''
    df = getDataAboutTerritory()

    # Boolean masks are used instead of a query string, so that territory names
    # containing quotes (e.g. "Valle d'Aosta/Vallée d'Aoste") need no escaping
    selected = (df['Territorio'] == territory)                  \
               & (df['Sesso'] == 'totale')                      \
               & (df['Classe di età'] == 'totale')              \
               & (df['Qualifica contrattuale'] == 'totale')     \
               & (df['Classe di dipendenti'] == 'totale')
    if year != -1:
        selected = selected & (df['TIME'] == year)

    meanValue = df.loc[selected, 'Value'].mean()
    if pd.isna(meanValue):  # empty selection: nothing to average
        return float('nan')

    return round(100*meanValue)/100  # round(100*..)/100 is used to have two decimal digits

### See categorization

In [None]:
# Show the number of provinces per salary category and year (default category borders)
#getProvinceSalaryvalue()
categorization()

### Second part of functions

In [None]:
# Utility functions for geo-data
def readGeoDataToDictHavingYearAsKey():
    '''
    Imports the geo-data (coordinates) and returns a dictionary having the
    year (2014..2017) as key and, as value, the GeoPandas dataframe loaded from
    the shape files, reduced to the columns 'DEN_PCM' (province name),
    'TerritorioAnno' (province name followed by the year) and 'geometry'.
    Some province names are changed to match the ones used in the salary data.
    '''
    shapesFolder = f'{dataFolderName}/province_shapes'
    map_df = {} # dictionary, year as key

    # 2014 has its own shape file
    map_df[2014] = gpd.read_file(f'{shapesFolder}/Prov01012014_g/Prov01012014_g_WGS84.shp')
    map_df[2014]['DEN_PCM'] = map_df[2014]['DEN_PROV']  # duplicate this column to make the dataframe compliant with those of subsequent years
    map_df[2014].loc[ map_df[2014].DEN_PCM=="Forlì-Cesena","DEN_PCM" ] = "Forli'-Cesena"

    # Data updated to 1st Jan 2017 work for our purposes for the years 2015-2017:
    # the file is read and corrected only once, then each year gets its own copy
    shapes2017 = gpd.read_file(f'{shapesFolder}/ProvCM01012017_g/ProvCM01012017_g_WGS84.shp')
    correctedNames = {
        "Aosta": "Valle d'Aosta/Vallée d'Aoste",
        "Massa Carrara": "Massa-Carrara",
        "Bolzano": "Bolzano/Bozen",
    }
    for oldName, newName in correctedNames.items():
        shapes2017.loc[ shapes2017.DEN_PCM==oldName,"DEN_PCM" ] = newName
    for year in range(2015,2018):
        map_df[year] = shapes2017.copy()   # independent copy: a per-year column is added below

    # Note: territories coords change over the year, hence we save the year near the territory names
    for year in map_df:
        map_df[year]["TerritorioAnno"] = map_df[year]["DEN_PCM"] + str(year)
        map_df[year] = map_df[year][['DEN_PCM','TerritorioAnno','geometry']]

    return map_df


# Function to convert (project) coordinates to latitude/longitude
def convertCrsToLatLong(inputGeopandasDf, inplace=False):
    '''
    Converts the geo-coordinates of the input GeoPandas Dataframe to EPSG:4326 (latitude and longitude)
    and returns a new GeoPandas dataframe having the data in the new coordinates system.
    The column named "geometry" is used as the geometry column.
    You can specify the parameter inplace=True if you want to change the coordinate system "inplace",
    i.e., directly in the input GeoPandas Dataframe; in this case the input dataframe itself is returned.
    '''
    if inplace:
        # Work on the input object itself: operating on the result of a
        # non-inplace set_geometry would only convert a temporary copy
        inputGeopandasDf.set_geometry("geometry", inplace=True)
        inputGeopandasDf.to_crs("EPSG:4326", inplace=True)  # returns None when inplace
        return inputGeopandasDf

    # set_geometry returns a new dataframe, hence the input is left untouched
    return inputGeopandasDf.set_geometry("geometry").to_crs("EPSG:4326")
    

def createGeoJsonFromFile(geoJsonFolder, shapeDataDictYears, convertCrsToLatLongFlag=True):
    '''
    Creates GeoJson files in the folder whose path is specified as parameter as string,
    from the given dictionary having years as keys and the corresponding shape file data
    (GeoPandas dataframe) as values. One file named '<year>.json' is written for each year
    (the folder is created if it does not exist).
    The parameter shapeDataDictYears can also be the shape file data directly, i.e. the
    value of only one record of a dictionary; the file is then named '.json'.
    Specify the parameter convertCrsToLatLongFlag=False if you do NOT want to convert the
    geo-coordinate system to EPSG:4326; default is True.
    Note: when a dictionary is given and the conversion is enabled, the converted
    dataframes replace the original ones in the given dictionary.
    Returns a dictionary having as key the years (the same as the input dictionary) and
    the corresponding GeoJson data as values; if the shape file data were given directly,
    the corresponding GeoJson data are returned directly.
    '''
    os.makedirs(geoJsonFolder, exist_ok=True)

    isInputShapeDataAsDict = isinstance(shapeDataDictYears, dict)   # True if a dictionary is given as input parameter
    if not isInputShapeDataAsDict:
        shapeDataDictYears = {'': shapeDataDictYears}    # converted to dict to use the same code

    geoJsonData = {}
    for year in list(shapeDataDictYears):
        if convertCrsToLatLongFlag:
            shapeDataDictYears[year] = convertCrsToLatLong(shapeDataDictYears[year])

        # os.path.join works with or without a trailing separator in the folder path
        geoJsonPathThisYear = os.path.join(geoJsonFolder, str(year) + '.json')

        # Write the GeoJson file, then read it back as plain Python data
        shapeDataDictYears[year].to_file(geoJsonPathThisYear, driver="GeoJSON")
        with open(geoJsonPathThisYear, encoding="utf-8") as geofile:
            geoJsonData[year] = json.load(geofile)

    return geoJsonData if isInputShapeDataAsDict else geoJsonData['']


def loadDataMultipleYears(provinceNames=None, years=None, compress=-1, simplify=-1):
    '''
    Returns the GeoJson data and the dataframe of provinces (only with territories, economic sectors
    excluded) for all the years. The two results (geoJsonData, df_province) have to be unpacked.
    This function can be used to rapidly load both geo-data and data about salaries in provinces, over
    all the years (province granularity only).
    If the parameter provinceNames is specified, only data about the desired provinces will be loaded
    (a list is expected).
    If the parameter years (a list is expected) is specified, only data about selected years will be
    returned; a KeyError is raised if geo-data are not available for one of them.
    If provinceNames or years is omitted, None or empty, no filter is applied on it.
    If the parameter 'compress' is specified and set to a non-negative value, then a compressed version
    of the GeoJson data will be provided. The compression is given by rounding the precision of the geo
    coordinates to the specified number of decimal digits.
    Similarly, you can specify a (positive) tolerance value for the parameter 'simplify'.
    See: https://geopandas.org/docs/user_guide/geometric_manipulations.html#GeoSeries.simplify
    As a side effect, the GeoJson data are written to a file in geoJsonFolder.
    '''
    # None replaces the mutable default arguments; empty lists are still accepted
    provinceNames = [] if provinceNames is None else list(provinceNames)
    years = [] if years is None else list(years)

    # Read geo-data
    map_df = readGeoDataToDictHavingYearAsKey() # dictionary, year as key

    # Load data about salaries for each province
    df_province = getProvinceSalaryvalue()

    if years:   # filter according to years
        df_province = df_province[df_province['TIME'].isin(years)]
        map_df = {year: map_df[year] for year in years}
    else:
        years = list(map_df.keys())

    if provinceNames:   # filter according to province names
        # isin is used instead of a query string built from the names, which
        # would break on names containing a double quote
        df_province = df_province[df_province['Territorio'].isin(provinceNames)]
        map_df = {year: map_df[year][map_df[year]['DEN_PCM'].isin(provinceNames)] for year in years}

    # Union over years of geodata and conversion of coordinates
    geoData = pd.concat(tuple(convertCrsToLatLong(map_df[year]) for year in years))

    # Compression of geo data (from: https://gis.stackexchange.com/a/321531)
    if compress>=0:
        # Round coordinates to the specified number of decimal digits. Topology may not be preserved
        simpledec = re.compile(r"\d*\.\d+")
        geoData['geometry'] = geoData['geometry'].apply(lambda x: loads(re.sub(simpledec, lambda match: f"{float(match.group()):.{compress}f}", x.wkt)))    \
                                                 .simplify(0) # 0 means no tolerance
    if simplify>0:
        geoData['geometry'] = geoData['geometry'].simplify(simplify)

    # Create GeoJson from SHP dataframe (union over years of shp files)
    geoJsonData = createGeoJsonFromFile(geoJsonFolder, geoData)

    return geoJsonData, df_province


# 01 - Other stuff

## Create geo-data etc.

In [None]:
# Create geo-data, year by year, correcting input data to make them compatible with our data

map_df = readGeoDataToDictHavingYearAsKey() # dictionary, year as key

years = tuple(map_df.keys())
print('Input Coordinate Reference System: \t ' + str(map_df[years[0]].crs)) # print the Coordinate Reference System (CRS), EPSG:32632 is WGS 84 / UTM zone 32N (WGS=World Geodetic System, UTM=Universal Transverse Mercator)

# Create a GeoJson file from the *.shp and read it
# (one GeoJson per year; the converted dataframes are stored back in map_df)
geoJsonData = createGeoJsonFromFile(geoJsonFolder, map_df)
# map_df now holds the converted dataframes, hence this is the CRS written to the GeoJson files
print('GeoJson Coordinate Reference System: \t ' + str(map_df[years[0]].crs) + '\n') # print the Coordinate Reference System (CRS)

# Plot the map (unnecessary, just to show that it works)
map_df[years[0]].plot()

print(map_df[years[0]].head())  # NOTE: 'DEN_PCM' column contains distinct values
# For each year: number of rows and number of distinct province names (expected to be equal)
for year in map_df:
    print(f'Year {year}:\t' + str(len(map_df[year])) + ' ' + str(len(map_df[year]['DEN_PCM'].drop_duplicates())))
    # # Print duplicates:
    # print(map_df[year] [map_df[year].duplicated('DEN_PCM')])

## Load data etc.

In [None]:
# Load data about provinces (all years, every breakdown by sex, age, qualification, company size)
df_province = getDataAboutProvinces()
# print(df_province["Territorio"].drop_duplicates().sort_values().tolist())
print(df_province.head())

# Create a dictionary having years as keys
# ('years' comes from the geo-data cell above: the years for which shapes are available)
df_province_years = getDataAboutProvincesInDictHavingYearsAsKey(years)
# print(pd.DataFrame(df_province_years[years[0]]).head())

# Average over all the available years for the default territory (entire Italy)
print(f"Average salary in Italy over years: " + str(avgSalary()) + " €/hr")

## Test data for the map

In [None]:
# Test data for the map: salary data and GeoJson of one year, used by the test choropleths below
test_year = 2017        # year to use for tests
df_test = getProvinceSalaryvalue(test_year)
geoJsonData_test = geoJsonData[test_year]

# Print the province names found in the salary data, in the shape data and in the GeoJson,
# to check visually that they match (names are the key joining data and map)
print(df_province_years[test_year]["Territorio"].drop_duplicates().sort_values().tolist())
print()
print(map_df[test_year]["DEN_PCM"].drop_duplicates().sort_values().tolist())
print()
print('\n\t\tproperties.DEN_PCM:\n' + str(sorted([prov['properties']['DEN_PCM'] for prov in geoJsonData_test['features']])) + '\n\n')

# Inspect the structure of the GeoJson: top-level keys, keys of a feature and of its properties
print(df_test.head())
print(geoJsonData_test.keys())
print('\tfeatures:\t' + str(geoJsonData_test['features'][0].keys()))
print('\t\tproperties:\t' + str(geoJsonData_test['features'][0]['properties'].keys()))

print(geoJsonData_test['features'][0]['properties'].values())
# print(geoJsonData_test['features'][0]['geometry']['coordinates'])
print(geoJsonData_test['features'][1]['properties'].values())


# One row per province is expected in the salary data, one feature per province in the GeoJson
print('Same lenght of dataframes? ' + str(len(df_test)==len(geoJsonData_test['features'])) + ' (' + str(len(df_test)) + ')')

print(geoJsonData_test['features'][0]['properties'].values())

# 02 - Some test choropleth

### By categories

In [None]:
# Choropleth by categories: provinces of the test year coloured by salary category

fig = px.choropleth(
    data_frame=df_test, 
    geojson=geoJsonData_test, 
    locations='Territorio',              # name of dataframe column
    featureidkey='properties.DEN_PCM',   # path to field in GeoJSON feature object with which to match the values passed in to locations
    color='SalaryCategory',
    # color_continuous_scale="rdylgn",                                                                          # for continuous scale of colors
    color_discrete_sequence=['#4dac26', '#b8e186', '#fefee9', '#f1b6da', '#d01c8b'],      # for discrete scale of colors
    center={"lat": 42, "lon": 13},
    projection='mercator',
    labels={'SalaryCategory': 'Average hourly gross salary'},
    hover_name='Territorio',
    hover_data={'Value':True, 'SalaryCategory':False, 'Territorio': False}          # TODO: improve this (see "hovertemplate")
)
# Opaque areas with a thin black border
fig.update_traces(marker=dict(opacity=1, line=dict(color='black', width=0.1)))      # TODO: look for "hovertemplate, https://plotly.com/python/reference/choropleth/#choropleth-hovertemplate"
fig.update_layout(
    plot_bgcolor='white',
    font=dict(color='dimgray'),
    title='Salaries in private companies',
    margin={"r":0,"t":0,"l":0,"b":0},
    legend_itemsizing='trace'               # Determines if the legend items symbols scale with their corresponding "trace" attributes or remain "constant" independent of the symbol size on the graph. # TODO: NOT working
)
# Hide the base map and zoom on the plotted provinces
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
fig.show("notebook")



# Export images (the output folder is emptied first)
if exportFigure:
    figureOutputFolder_this = figureOutputFolder + '/choroplethByCategories'
    if os.path.exists(figureOutputFolder_this): # remove old data
        sh.rmtree(figureOutputFolder_this)
    os.makedirs(figureOutputFolder_this)
    fig.write_image(f"{figureOutputFolder_this}/testGeoMap{test_year}.svg")
    fig.write_image(f"{figureOutputFolder_this}/testGeoMap{test_year}.png")#, width=3000, height=2000)
    del figureOutputFolder_this


# TODO : solve export issues (only on some platforms)
# TODO : check data values
# TODO : legend symbol sizes proportional to number of provinces in the category
# TODO : show (infografic or similar) best and worst province salary and/or values of biggest cities (Milan, Rome, Turin, Naples, Palermo, Venice, Trieste)
# TODO : show data divided by principal;employeers / workers / apprentinces

### Bar charts: number of provinces per salary category

In [None]:
# Horizontal bar charts (one per year) with the number of provinces in each salary category

salaryCategories = range(11,20,2)
df_year_salaryCategory_nProvs = categorization(salaryCategories)
df_year_salaryCategory_nProvs.iloc[:,1] = "    " + df_year_salaryCategory_nProvs.iloc[:,1] + "   " # update the text
# create a dictionary {year: [salaryCategory, numberOfProvinces]}, rows sorted by category label
df_year_salaryCategory_nProvs = {year: df_year_salaryCategory_nProvs.query(f"{df_year_salaryCategory_nProvs.columns[0]}=={year}").iloc[:,1:3].sort_values(by=[df_year_salaryCategory_nProvs.columns[1]], ascending=True) for year in df_year_salaryCategory_nProvs.iloc[:,0].drop_duplicates()}
print(df_year_salaryCategory_nProvs[2014])

years = range(2014,2018)
# Largest number of provinces in a category over all years: common x-axis range for all charts
max_x_val = max(df_year_salaryCategory_nProvs[year].iloc[:,1].max() for year in years)

# Prepare (and empty) the output folder only when figures are exported
if exportFigure:
    figureOutputFolder_this = figureOutputFolder + '/legend_barChartSectors'
    if os.path.exists(figureOutputFolder_this): # remove old data
        sh.rmtree(figureOutputFolder_this)
    os.makedirs(figureOutputFolder_this)
    print(figureOutputFolder_this)

for year in years:

    # After the selection above, column 0 is the category label and column 1 the number of provinces
    fig = px.bar(
        data_frame = df_year_salaryCategory_nProvs[year],
        x = df_year_salaryCategory_nProvs[year].columns[1],
        y = df_year_salaryCategory_nProvs[year].columns[0],
        orientation = 'h',  # horizontal bar chart
        text=df_year_salaryCategory_nProvs[year].columns[1],
        height=300,
        width=450,
        # log_x=True  # logarithmic scale
    )

    # One colour per bar, taken from the palette in reversed order
    fig.update_traces(
        marker_color=[color for color in reversed(colors_palette)],
        marker_line_color='black', marker_line_width=1, opacity=1,
        # texttemplate='%{text:d} ', textposition='inside'
    )

    fig.update_layout(
        title_text=f'{year}',
        yaxis_title=df_year_salaryCategory_nProvs[year].columns[0],
        xaxis_title="Number of provinces",
        xaxis=dict(showline=True, showticklabels=True, ticks='outside',
            linecolor='rgb(204, 204, 204)', linewidth=2, dtick = 10,
            range = [max_x_val, 0]),  # reversed xaxis
            # range = [2, 0]),  # reversed xaxis if log xaxis
        yaxis=dict( showgrid=False, showline=False, side='right'),              # yaxis on the right side
        paper_bgcolor='white',
        plot_bgcolor='white',
        showlegend=False,
        hovermode=False
    )

    fig.show("notebook")

    # Export images
    if exportFigure:
        fig.write_image(f"{figureOutputFolder_this}/{year}.svg")
        fig.write_image(f"{figureOutputFolder_this}/{year}.png")

# The folder name only exists when figures are exported
if exportFigure:
    del figureOutputFolder_this

### mapbox

In [None]:
# Choropleth mapbox: salary values of the test year on a tiled base map (continuous colour scale)
fig = px.choropleth_mapbox(
    df_test,
    geojson=geoJsonData_test,       
    color='Value',                      # name of a dataframe column
    locations='Territorio',             # name of a dataframe column
    featureidkey="properties.DEN_PCM",
    center={"lat": 42, "lon": 13},
    mapbox_style="carto-positron",
    zoom=4.4
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# NOTE(review): update_geos presumably has no effect on a mapbox figure - confirm and remove if so
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
fig.show()

## Slider 1

In [None]:
# Choropleth with slider

# Load both geodata and data about salaries (all years, all provinces)
geoJsonData, df_province = loadDataMultipleYears()
print(df_province.head())

# Size of the data handed to plotly.
# sys.getsizeof is shallow: on the nested GeoJson dictionary it would only measure
# the outer container, so the length of its JSON serialization is reported instead.
print("Size of dataframe: " + str(sys.getsizeof(df_province)))
print("Size of geojson:   " + str(len(json.dumps(geoJsonData))))

# Choropleth with slider animation over years
fig = px.choropleth(
    data_frame=df_province, 
    geojson=geoJsonData, 
    locations='TerritorioAnno',                 # name of dataframe column
    hover_name='Territorio',
    hover_data={'Value':True, 'TerritorioAnno':False},
    featureidkey='properties.TerritorioAnno',   # path to field in GeoJSON feature object with which to match the values passed in to locations # TODO : discrete color map
    color='Value',
    color_continuous_scale="Magma",
    center={"lat": 42, "lon": 13},
    labels={'Value':'Salary in € ', 'TIME':'Year'},
    projection='mercator',
    animation_frame="TIME"
)
# Timing of the "play" button (first button of the first menu): duration of frames and transitions
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1000
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 100
# Hide the base map and zoom on the plotted provinces
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show("notebook")

# Export images (the output folder is emptied first)
if exportFigure:
    figureOutputFolder_this = figureOutputFolder + '/slider'
    if os.path.exists(figureOutputFolder_this): # remove old data
        sh.rmtree(figureOutputFolder_this)
    os.makedirs(figureOutputFolder_this)
    fig.write_image(f"{figureOutputFolder_this}/testSliderGeoMap.svg")
    fig.write_image(f"{figureOutputFolder_this}/testSliderGeoMap.png")
    fig.write_html(f"{figureOutputFolder_this}/testSliderGeoMap.html")
    del figureOutputFolder_this

## Slider 2

In [None]:
# Show map for only specified provinces, with the province names written on the map
provinceNames = ["Milano", "Torino", "Cuneo", "Novara", "Vercelli"]
years = [2017]  # TODO : animation not working with add_scattergeo

# Load both geodata and data about salaries (only for the desired province)
geoJsonData, df_province = loadDataMultipleYears(provinceNames, years)


# Choropleth with slider animation over years
fig = px.choropleth(
    data_frame=df_province, 
    geojson=geoJsonData, 
    locations='TerritorioAnno',                 # name of dataframe column
    featureidkey='properties.TerritorioAnno',   # path to field in GeoJSON feature object with which to match the values passed in to locations
    color='SalaryCategory',
    color_discrete_sequence=['#4dac26', '#b8e186', '#fefee9', '#f1b6da', '#d01c8b'],      # for discrete scale of colors
    center={"lat": 42, "lon": 13},
    projection='mercator',
    labels={'SalaryCategory': 'Average gross salary in €/hr', 'Value': 'Salary [€/hr]'},
    hover_name='Territorio',
    hover_data={'Value':True, 'SalaryCategory':False, 'Territorio': False, 'TerritorioAnno': False},          # TODO: improve this (see "hovertemplate")
    animation_frame="TIME"
)
# Text-only layer: the name of each province is written over its area
fig.add_scattergeo(
    geojson=geoJsonData,
    locations = df_province['TerritorioAnno'],
    text = df_province['Territorio'],
    featureidkey="properties.TerritorioAnno",
    hoverinfo='skip',
    mode = 'text',
    showlegend=False
)
# The animation timing is set only when more than one year is shown
if(len(years)>1):
    fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1000      # for animations
    fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 100  # for animations
# Hide the base map and zoom on the plotted provinces
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
fig.update_layout(
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.update_traces(marker=dict(opacity=1, line=dict(color='black', width=0.8)))
fig.show("notebook")

## Slider 3

In [None]:
# Choropleth by categories of a single year, with the province names written on the map
yearToShow = 2017
# Load both geodata and data about salaries
geoJsonData, df_province = loadDataMultipleYears(years=[yearToShow])

fig = px.choropleth(
    data_frame=df_province, 
    geojson=geoJsonData, 
    locations='TerritorioAnno',                 # name of dataframe column
    featureidkey='properties.TerritorioAnno',   # path to field in GeoJSON feature object with which to match the values passed in to locations
    color='SalaryCategory',
    color_discrete_sequence=['#4dac26', '#b8e186', '#fefee9', '#f1b6da', '#d01c8b'],      # for discrete scale of colors
    center={"lat": 42, "lon": 13},
    projection='mercator',
    labels={'SalaryCategory': 'Average gross salary in €/hr'},
    hover_name='Territorio',
    hover_data={'Value':True, 'SalaryCategory':False, 'Territorio': False}          # TODO: improve this (see "hovertemplate")
)
# Opaque areas with a black border
fig.update_traces(marker=dict(opacity=1, line=dict(color='black', width=1)))      # TODO: look for "hovertemplate, https://plotly.com/python/reference/choropleth/#choropleth-hovertemplate"
fig.update_layout(
    plot_bgcolor='white',
    font=dict(color='dimgray'),
    title='Salaries in private companies',
    margin={"r":0,"t":0,"l":0,"b":0},
    legend_itemsizing='trace'               # Determines if the legend items symbols scale with their corresponding "trace" attributes or remain "constant" independent of the symbol size on the graph. # TODO: NOT working
)
# Hide the base map and zoom on the plotted provinces
# (margins and geo options are set once: the repeated identical calls were removed)
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
# Text-only layer: the name of each province is written over its area
fig.add_scattergeo(
    geojson=geoJsonData,
    locations = df_province['TerritorioAnno'],
    text = df_province['Territorio'],
    featureidkey="properties.TerritorioAnno",
    hoverinfo='skip',
    mode = 'text',
    showlegend=False
)
fig.show("notebook")

# Final Choropleth

## First slider

In [None]:
# Choropleth with slider: data preparation

# Load both geodata and data about salaries
geoJsonData, df_province = loadDataMultipleYears()

# Assign every row to a salary category.
# Categories are half-open intervals [lower, upper); the last one has no upper
# bound, so it also collects every value above the last border.
salaryCategoryBorders = range(9,20,2)   # category subdivision
orderedLabels = []                      # labels of all the categories, by increasing salary
categoryLabels = pd.Series(index=df_province.index, dtype=object)
oldCategory = 0
for category in salaryCategoryBorders:
    isLastCategory = category >= salaryCategoryBorders[-1]
    label = (f"{oldCategory} ≤ " if oldCategory >= salaryCategoryBorders[0] else "        ")  \
        + ".."                                                                               \
        + ("        " if isLastCategory else f" < {category}")
    inCategory = df_province['Value'] >= oldCategory
    if not isLastCategory:
        inCategory &= df_province['Value'] < category
    categoryLabels.loc[inCategory.to_numpy()] = label
    orderedLabels.append(label)
    oldCategory = category

# Rows with a missing or negative value fit no category: fail with a clear message
# instead of a length mismatch later on
if categoryLabels.isna().any():
    raise ValueError("Some rows have a missing or negative 'Value' and belong to no salary category")

# Save index in salary categories to save space.
# Only the categories found in the data are numbered, following the order of the
# borders (sorting the labels as strings would put "11 ≤ .." before "9 ≤ ..").
presentLabels = set(categoryLabels)
salaryCategories = {i: label for i, label in enumerate(l for l in orderedLabels if l in presentLabels)}
# NOTE (TODO): the author observed that string keys make the final map very big (~500MB)
#              while int keys make the color bar of the legend not work properly
#              (continuous color map is shown even if a discrete color map is set).
#              See the parameter 'color' of the choropleth
print("Salary categories: " + str(salaryCategories) )
indexOfLabel = {label: i for i, label in salaryCategories.items()}
df_province['SalaryCategoryIndex'] = categoryLabels.map(indexOfLabel).astype(int).astype(str) # string needed for discrete colormap
del oldCategory, category, isLastCategory, label, inCategory, orderedLabels, categoryLabels, presentLabels, indexOfLabel

# sort (needed to respect the range-scale in plots if categorization is used)
df_province.sort_values(by=['TIME','Value'], ascending=True, inplace=True)


# Animated choropleth: the 'TIME' column drives the frames of the slider
fig = px.choropleth(
    data_frame=df_province,
    geojson=geoJsonData,
    featureidkey='properties.TerritorioAnno',   # field of each GeoJSON feature matched against `locations` # TODO : discrete color map
    locations='TerritorioAnno',                 # dataframe column holding the location ids
    animation_frame="TIME",
    color='SalaryCategoryIndex',
    color_discrete_sequence=['#4dac26', '#b8e186', '#fefee9', '#f1b6da', '#d01c8b'],      # one color per category
    labels={'SalaryCategoryIndex':'Average hourly gross salary', 'TIME':'Year'},
    hover_name='Territorio',
    hover_data={'SalaryCategory':False, 'Value':True, 'TerritorioAnno':False},
    projection='mercator',
    center={"lat": 42, "lon": 13}
)
# Frame duration used by the first button of the first menu
# (presumably the "play" button, value presumably in milliseconds - TODO confirm)
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1200
# fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 100
# No margins around the map; hide the base map and fit the view to the plotted locations
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
# fig.show("notebook")

# Export the animated map as an HTML page (only when exporting is enabled)
if exportFigure:
    figureOutputFolder_this = f"{figureOutputFolder}/slider2"
    # Recreate the folder from scratch so that no file of a previous export is left
    if os.path.exists(figureOutputFolder_this):
        sh.rmtree(figureOutputFolder_this)
    os.makedirs(figureOutputFolder_this)
    fig.write_html(figureOutputFolder_this + "/testSliderGeoMap.html")
    del figureOutputFolder_this

In [None]:
# Debug output: the category indexes found in each year ...
print({
    year: df_province.loc[df_province['TIME']==year, 'SalaryCategoryIndex'].unique().tolist()
    for year in df_province['TIME'].drop_duplicates()
})
# ... and the distinct category indexes of the whole dataframe
print(df_province['SalaryCategoryIndex'].drop_duplicates())

## Another choropleth

In [None]:
# One static choropleth per year (no slider)

# Load both geodata and data about salaries
geoJsonData, df_province = loadDataMultipleYears()

# sort by year and, within each year, by decreasing salary
# (needed to respect the range-scale in plots if categorization is used)
df_province.sort_values(by=['TIME','Value'], ascending=[True, False], inplace=True)


# Recreate the output folder from scratch so that no file of a previous export is left
if exportFigure:
    figureOutputFolder_this = figureOutputFolder + '/slider2/custom'
    if os.path.exists(figureOutputFolder_this): # remove old data
        sh.rmtree(figureOutputFolder_this)
    os.makedirs(figureOutputFolder_this)

# One choropleth for each year
years = range(2014,2018)
color_palette = ['#4dac26', '#b8e186', '#fefee9', '#f1b6da', '#d01c8b']
for year in years:
    fig = px.choropleth(
        # only the rows of the current year, restricted to the columns used by the figure
        data_frame=df_province.query(f'TIME=={year}')[['Territorio', 'TerritorioAnno','SalaryCategory','TIME','Value']], 
        geojson=geoJsonData, 
        locations='TerritorioAnno',                 # name of dataframe column
        hover_name='Territorio',
        hover_data={'SalaryCategory':False, 'Value':True, 'TerritorioAnno':False},
        featureidkey='properties.TerritorioAnno',   # path to field in GeoJSON feature object with which to match the values passed in to locations # TODO : discrete color map
        color='SalaryCategory',
        color_discrete_sequence=color_palette,      # for discrete scale of colors
        center={"lat": 42, "lon": 13},
        labels={'SalaryCategory':'Average hourly gross salary', 'TIME':'Year'},
        projection='mercator'
    )
    # Hide the base map, fit the view to the plotted locations and drop the margins
    fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    # fig.show("notebook")

    # Export images (one HTML file per year)
    if exportFigure:
        fig.write_html(f"{figureOutputFolder_this}/testGeoMap{year}.html")

if exportFigure:
    del figureOutputFolder_this

## Another choropleth (memory space saving)

In [None]:
# Choropleth with slider (simplified geodata): data preparation

# Load both geodata and data about salaries
simplifyTolerance=0.01       # TODO : should be a parameter?
geoJsonData, df_province = loadDataMultipleYears(simplify=simplifyTolerance) # simplify geodata (to save memory space)
# decimalDigitsCompression = 2 # TODO : should be a parameter?
# geoJsonData, df_province = loadDataMultipleYears(compress=decimalDigitsCompression) # simplify geodata (to save memory space)

# Keep only the salary category (drop out the number of provinces belonging to it).
# The pattern is applied to each label on its own: matching on the concatenation of
# all the labels could produce a match spanning two neighbouring rows.
import re
categoryOnly = df_province['SalaryCategory'].str.extract(
    r"(\s*[0-9]*\s*(?:[≤][ ])?[.]{2}(?:[ ][<])?\s*[0-9]*)", expand=False)
if categoryOnly.isna().any():
    raise ValueError("Some 'SalaryCategory' labels do not contain a salary range")
df_province['SalaryCategory'] = categoryOnly

# Order the categories by their numeric lower bound; a label without lower bound
# comes first (sorting the labels as strings would put "11 .." before "9 ..")
leadingNumber = re.compile(r"\s*([0-9]+)")
boundedLabels = []
for label in df_province['SalaryCategory'].drop_duplicates():
    found = leadingNumber.match(label)
    boundedLabels.append((int(found.group(1)) if found else -1, label))
salaryCategories = tuple(label for _, label in sorted(boundedLabels))
del categoryOnly, leadingNumber, boundedLabels
print(salaryCategories)
years = range(2014,2018)
color_palette = ('#d01c8b', '#f1b6da', '#fefee9', '#b8e186', '#4dac26')

# Every category needs its own color
if len(salaryCategories)!=len(color_palette):
    raise ValueError('Number of colors is different than the number of categories')

print("Going to create the map...") # TODO: delete this line (only for test purposes)
# Animated choropleth: the 'TIME' column drives the frames of the slider
fig = px.choropleth(
    data_frame=df_province[['Territorio', 'TerritorioAnno','SalaryCategory','TIME','Value']],
    geojson=geoJsonData, 
    locations='TerritorioAnno',                 # name of dataframe column
    featureidkey='properties.TerritorioAnno',   # path to field in GeoJSON feature object with which to match the values passed in to locations # TODO : discrete color map
    color='SalaryCategory',
    # i-th category gets the i-th color of the palette
    color_discrete_map={salaryCategories[i]: color_palette[i] for i in range(0,len(color_palette))},
    # the key must be the name of the column to order, otherwise the order is ignored
    category_orders={'SalaryCategory': list(salaryCategories)},
    center={"lat": 42, "lon": 13},
    labels={'Value':'Gross salary', 'SalaryCategory':'Average hourly gross salary', 'TIME':'Year'},
    projection='mercator',
    locationmode='geojson-id',
    animation_frame="TIME",
    title='Salary distribution in Italian private companies',
    hover_name='Territorio',
    hover_data={'SalaryCategory':False, 'Value':True, 'TerritorioAnno':False},
)
print("Going to update the map...") # TODO: delete this line (only for test purposes)
# Hide the base map and fit the view to the plotted locations
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds="locations", subunitcolor='white')
# No margins, transparent map background and a vertical legend with its own title
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, showlegend=True, legend=dict(orientation='v'),geo=dict(bgcolor='rgba(0,0,0,0)', lakecolor='#e0fffe'), legend_title_text='Salary categories',)
print("Going to show the map...") # TODO: delete this line (only for test purposes)
fig.show("notebook")

# Export the animated map as an HTML page (only when exporting is enabled)
if exportFigure:
    figureOutputFolder_this = f"{figureOutputFolder}/slider2/custom2"
    # Recreate the folder from scratch so that no file of a previous export is left
    if os.path.exists(figureOutputFolder_this):
        sh.rmtree(figureOutputFolder_this)
    os.makedirs(figureOutputFolder_this)
    print("Going to export the map...") # TODO: delete this line (only for test purposes)
    fig.write_html(figureOutputFolder_this + "/testGeoMap.html")
    del figureOutputFolder_this