In [None]:
import geopandas as gpd
import json
import os
import pandas as pd
import plotly.express as px


# PARAMETERS
# Root folder of the input data, and the sub-folder (with trailing slash) used for GeoJSON files.
dataFolderName = 'data'
geoJsonFolder = f'{dataFolderName}/geoJson/'

In [None]:
# Create geo-data, year by year, correcting input data to make them compatible with our data

map_df = {}  # dictionary, year as key

# 2014 has its own shapefile; its province names live in 'DEN_PROV'.
map_df[2014] = gpd.read_file(f'{dataFolderName}/province_shapes/Prov01012014_g/Prov01012014_g_WGS84.shp')
map_df[2014]['DEN_PCM'] = map_df[2014]['DEN_PROV']  # duplicate this column to make the dataframe compliant with those of subsequent years
map_df[2014].loc[map_df[2014].DEN_PCM == "Forlì-Cesena", "DEN_PCM"] = "Forli'-Cesena"

# 2015-2017 all use the same shapefile (data updated to 1st Jan 2017 work for our purposes).
# It is read and corrected once, instead of once per year, and then copied for each year.
fp = f'{dataFolderName}/province_shapes/ProvCM01012017_g/ProvCM01012017_g_WGS84.shp'
shapes_2017 = gpd.read_file(fp)  # reading the file stored in variable fp
province_renames = {
    "Aosta": "Valle d'Aosta/Vallée d'Aoste",
    "Massa Carrara": "Massa-Carrara",
    "Bolzano": "Bolzano/Bozen",
}
for old_name, new_name in province_renames.items():
    shapes_2017.loc[shapes_2017.DEN_PCM == old_name, "DEN_PCM"] = new_name

for year in range(2015, 2018):
    # Independent copy per year, so that each entry of map_df remains a separate object.
    map_df[year] = shapes_2017.copy()



# Change the coordinate system

# Print the Coordinate Reference System (CRS) of the shapes as loaded.
# EPSG:32632 is WGS 84 / UTM zone 32N (WGS=World Geodetic System, UTM=Universal Transverse Mercator).
print(f'Input Coordinate Reference System: \t {map_df[2014].crs}')

def convertCrsToLatLong(inputGeopandasDf):
    """Convert (project) the coordinates of a geo-dataframe to latitude/longitude.

    The geometry column is first set to "geometry" (replacing the original
    one, if it was different), then the result is re-projected to "EPSG:4326".

    Parameters
    ----------
    inputGeopandasDf
        Object exposing ``set_geometry`` and ``to_crs``.

    Returns
    -------
    The object returned by ``to_crs("EPSG:4326")``.
    """
    return inputGeopandasDf.set_geometry("geometry").to_crs("EPSG:4326")

# Create a GeoJson file from the *.shp and read it
geoJsonData = {}  # dictionary, year as key: the parsed GeoJSON written for that year

# exist_ok=True replaces the separate "exists?" check: it is a no-op when the folder is
# already there and avoids the check-then-create race of the two-step version.
os.makedirs(geoJsonFolder, exist_ok=True)

for year in range(2014, 2018):
    map_df[year] = convertCrsToLatLong(map_df[year])
    geoJsonPathThisYear = f'{geoJsonFolder}{year}.json'
    map_df[year].to_file(geoJsonPathThisYear, driver="GeoJSON")
    # Read the exported file back as a plain dict.
    with open(geoJsonPathThisYear, encoding="utf-8") as geofile:
        geoJsonData[year] = json.load(geofile)

print('Converted Coordinate Reference System: \t ' + str(map_df[2014].crs) + '\n')  # print the Coordinate Reference System (CRS)

# Plot the map (unnecessary, just to show that it works)
map_df[2014].plot()

print(map_df[2014].head())  # NOTE: 'DEN_PCM' COLUMNS contains distinct values

# For every year print the number of rows followed by the number of distinct 'DEN_PCM'
# values; the two numbers are equal when the column has no duplicates.
for year, shapes in map_df.items():
    row_count = len(shapes)
    distinct_name_count = len(shapes['DEN_PCM'].drop_duplicates())
    print(f'Year {year}:\t{row_count} {distinct_name_count}')
    # # Print duplicates:
    # print(shapes[shapes.duplicated('DEN_PCM')])

In [None]:
# Load data from CSV to the program
dataFileName = f'{dataFolderName}/DCSC_RACLI_01092021113430630.csv'

# Read, de-duplicate, normalise the territory names and remove useless columns in one chain
df = (
    pd.read_csv(dataFileName)
    .drop_duplicates()
    # remove the spaces around the slash in the territory names
    .assign(Territorio=lambda frame: frame['Territorio'].str.replace(' / ', '/'))
    # 'TIPO_DATO7' is always the same (HOUWAG_ENTEMP_AV_MI)
    # 'Tipo dato' is always the same (Retribuzione lorda oraria per ora retribuita delle posizioni lavorative dipendenti in euro (media).)
    .drop(columns=['TIPO_DATO7', 'Tipo dato'])
    # redundant information
    .drop(columns=['SEXISTAT1', 'ETA1_A', 'PROFILO_PROF', 'CLLVT', 'Seleziona periodo'])
    # delete incomplete data ('c' is the flag for hidden data) ...
    .loc[lambda frame: frame['Flag Codes'] != 'c']
    # ... and drop the columns with the corresponding flag
    .drop(columns=['Flags', 'Flag Codes'])
)

# Get data about territory (discard the sector)
df_territory = df.query('`Ateco 2007`=="TOTALE"').drop(['Ateco 2007', 'ATECO_2007'], axis=1)

# Consider only data about provinces: for provinces, the 'ITTER107' code is exactly 5 chars long.
# The length is compared directly because str.contains('.{5}') is an unanchored regex search,
# so it would also keep any code longer than 5 chars (and yields NaN, not False, on missing codes).
is_province = df_territory['ITTER107'].str.len() == 5
df_province = df_territory[is_province].drop('ITTER107', axis=1)
df_province.loc[df_province['Territorio'] == "Forlì-Cesena", "Territorio"] = "Forli'-Cesena"   # transform data for consistency with datasets of geocoords

print(df_province["Territorio"].drop_duplicates().sort_values().tolist())
print(df_province.head())

# Create a dictionary having years as keys; each value holds that year's rows without the 'TIME' column
df_province_years = {
    year: df_province.query(f'TIME=={year}').drop('TIME', axis=1).drop_duplicates()
    for year in range(2014, 2018)
}
print(df_province_years[2014].head())

In [None]:
# Test data for the map
test_year = 2017        # year to use as test

# Keep only the rows where every breakdown is 'totale', then drop the breakdown columns.
df_test = (
    df_province_years[test_year]
    .query("Sesso=='totale' & `Classe di età`=='totale' & `Qualifica contrattuale`=='totale' & `Classe di dipendenti`=='totale'")
    .drop(['Sesso', 'Classe di età', 'Qualifica contrattuale', 'Classe di dipendenti'], axis=1)
)
geoJsonData_test = geoJsonData[test_year]

# Province names as seen by each source (salary data, shapes, exported GeoJSON)
print(df_province_years[test_year]["Territorio"].drop_duplicates().sort_values().tolist())
print()
print(map_df[test_year]["DEN_PCM"].drop_duplicates().sort_values().tolist())
print()
print('\n\t\tproperties.DEN_PCM:\n' + str(sorted([prov['properties']['DEN_PCM'] for prov in geoJsonData_test['features']])) + '\n\n')

# Structure of the test inputs
print(df_test.head())
print(geoJsonData_test.keys())
print('\tfeatures:\t' + str(geoJsonData_test['features'][0].keys()))
print('\t\tproperties:\t' + str(geoJsonData_test['features'][0]['properties'].keys()))

print(geoJsonData_test['features'][0]['properties'].values())
# print(geoJsonData_test['features'][0]['geometry']['coordinates'])
print(geoJsonData_test['features'][1]['properties'].values())


print('Same lenght of dataframes? ' + str(len(df_test)==len(geoJsonData_test['features'])) + ' (' + str(len(df_test)) + ')')

# Equal counts do not guarantee that the join keys agree, so also list the names
# present on one side only ('Territorio' is matched against 'properties.DEN_PCM' by the maps below).
names_in_data = set(df_test['Territorio'].dropna())
names_in_geojson = {prov['properties']['DEN_PCM'] for prov in geoJsonData_test['features']}
print('In data but not in GeoJSON: ' + str(sorted(names_in_data - names_in_geojson)))
print('In GeoJSON but not in data: ' + str(sorted(names_in_geojson - names_in_data)))

In [None]:
# Choropleth of the test data on a geo subplot.
# NOTE: This map has an incorrect aspect (width vs height) ratio
fig = (
    px.choropleth(
        df_test,
        geojson=geoJsonData_test,
        featureidkey='properties.DEN_PCM',   # path to field in GeoJSON feature object with which to match the values passed in to locations
        locations='Territorio',              # name of dataframe column
        color='Value',
        labels={'Value':'Average hourly gross salary'},
        color_continuous_scale="Magma",
        projection='mercator',
        center={"lat": 42, "lon": 13},
    )
    # hide the base map and fit the view to the drawn locations
    .update_geos(fitbounds="locations", showland=False, showcoastlines=False, showcountries=False)
    # no margins around the map
    .update_layout(margin={"r":0,"t":0,"l":0,"b":0})
)
fig.show("notebook")

In [None]:
# choropleth mapbox
# NOTE(review): recent Plotly releases deprecate `choropleth_mapbox` in favour of
# `choropleth_map`; consider migrating once the installed version is confirmed.
fig = px.choropleth_mapbox(
    df_test,
    geojson=geoJsonData_test,
    color='Value',                      # name of a dataframe column
    locations='Territorio',             # name of a dataframe column
    featureidkey="properties.DEN_PCM",  # path to the GeoJSON feature field matched against `locations`
    center={"lat": 42, "lon": 13},
    mapbox_style="carto-positron",
    zoom=4.4
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
# `update_geos` is intentionally not called here: it configures `layout.geo` subplots,
# whereas this figure is drawn on a Mapbox subplot, so the view is set by `center`/`zoom` above.
fig.show()