# Data Visualization Exam

In [None]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np

# Folder used as the prefix of every path that exported figures are written to.
figureOutputFolder = 'exported_figures'

## Data acquisition
CSV file creation from raw data and loading the CSV file into the program.

In [None]:
# Folder holding the input data; the CSV path is built relative to it.
dataFolderName = "data"
fileName = f"{dataFolderName}/DCSC_RACLI_01092021113430630.csv"

# Read the whole CSV into a DataFrame and preview its first rows.
df = pd.read_csv(fileName)
df.head()

## Data parsing

In [None]:
# TODO: Check for errors in data
#       - is the total equal to the arithmetic mean of the parts? (it does not seem so)
#       - find missing data
#       - check that value apprendista < operaio < dirigente (per territory)

# Remove exact duplicate rows. drop_duplicates() returns a new frame unless
# inplace=True is given, so a bare call would leave `df` untouched and the
# two counts printed below would always be identical.
print('\nBefore remove duplicates: ' + str(len(df)) + ' rows')
df.drop_duplicates(inplace=True)
print('After remove duplicates:  ' + str(len(df)) + ' rows')

# Sanity check: are the extreme values reasonable?
print('\nMin value is ' + str(df['Value'].min()))
print('Max value is ' + str(df['Value'].max()))

# TODO: Change type

# TODO: Choose the level for hierarchical data


In [None]:
# TODO: Transform the data
# Normalise the territory labels by removing the spaces around the slash
# ("A / B" becomes "A/B").
df['Territorio'] = df['Territorio'].str.replace(' / ','/')

# TODO: rename and translate df fields

# Italian sector labels to translate: every distinct 'Ateco 2007' value that
# is not the "TOTALE" aggregate and whose ATECO_2007 code sorts between
# "A" and "Z".
sector_rows = df.query('`Ateco 2007`!="TOTALE" & `ATECO_2007`>="A" & `ATECO_2007`<="Z"')
it_sec_names = sector_rows['Ateco 2007'].drop_duplicates().reset_index(drop=True)

import googletrans #--->pip install googletrans==4.0.0-rc1
from googletrans import Translator,constants
translator = Translator()

# Translate each label from Italian to English, keeping the same order.
en_sec_names = [
    translator.translate(italian_name, src="it", dest="en").text
    for italian_name in it_sec_names
]

# Overwrite the Italian labels in the data frame with their translations.
for italian_name, english_name in zip(it_sec_names, en_sec_names):
    df.loc[df['Ateco 2007']==italian_name,"Ateco 2007"] = english_name

# df_sectors_tot


## Data filtering

In [None]:
# Filter a copy so the loaded frame `df` stays untouched.
df2 = df.copy()

# Columns holding a single constant value:
#   TIPO_DATO7 - always HOUWAG_ENTEMP_AV_MI
#   Tipo dato  - always "Retribuzione lorda oraria per ora retribuita delle
#                posizioni lavorative dipendenti in euro (media)."
constant_columns = ['TIPO_DATO7', 'Tipo dato']

# Columns carrying redundant information.
redundant_columns = ['SEXISTAT1', 'ETA1_A','PROFILO_PROF','CLLVT','Seleziona periodo']
# ('ATECO_2007' is kept on purpose: later cells still filter on it.)

df2 = df2.drop(columns=constant_columns + redundant_columns)

# Rows flagged 'c' are incomplete: discard them, then drop both flag columns.
is_complete = df2['Flag Codes'] != 'c'
df2 = df2[is_complete].drop(columns=['Flags', 'Flag Codes'])

df2.head()

## Data mining

In [None]:
# Sector-level detail exists only for Italy as a whole (there is no
# territorial breakdown), so the territory column is dropped.
# Steps: discard the "TOTALE" aggregate, keep the sector granularity whose
# ATECO_2007 code sorts between "A" and "Z", then remove the columns that
# are no longer needed.
df_sectors = (
    df2.query('`Ateco 2007`!="TOTALE"')
    .query('`ATECO_2007`>="A" & `ATECO_2007`<="Z"')
    .drop(columns=['Territorio', 'ATECO_2007'])
)

In [None]:
# Territorial view: keep only the rows aggregated over all sectors
# ("TOTALE") and drop the two sector columns.
df_territory = (
    df2.query('`Ateco 2007`=="TOTALE"')
    .drop(columns=['Ateco 2007', 'ATECO_2007'])
)

In [None]:
# TODO: start with univariate analysis (one variable at a time), continue with multivariate analysis

## Question 1
In private companies, are salaries higher in northern Italy than in the south? (Where do people earn more? Possibly break the comparison down by manager/worker/apprentice.)

## Question 2
Do women earn less than men in private companies in Italy? Where is the difference largest?

In [None]:
# Gender view of the territorial data: drop the rows where 'Sesso' is the
# "totale" aggregate and remove the dimensions not analysed for this question.
unused_dimensions = ['Classe di età','Qualifica contrattuale','Classe di dipendenti']
df_sex = df_territory.query('Sesso!="totale"').drop(columns=unused_dimensions)
len(df_sex)

### Plot line chart

In [None]:
# National ("Italia") rows only, ordered by year. sort_values() returns a new
# frame, so the sorted result must be kept: the y values extracted below are
# plotted against the ascending `x_year`.
df_sex_tot = df_sex.query('Territorio=="Italia"').sort_values(by='TIME')

title = 'Gender difference salary in Italy'
source = 'Source: Istat'
labels = ['Male','Female']
colors = ['rgb(0,0,115)', 'rgb(115,0,0)']

# One x row per gender, both covering every year from the first to the last.
x_year = np.arange(df_sex_tot.TIME.min(), df_sex_tot.TIME.max()+1)
x_data = np.vstack((x_year,)*2)

# y rows in the same order as `labels`: male first, female second.
y_tot_m = df_sex_tot.query('Sesso=="maschi"')
y_tot_w = df_sex_tot.query('Sesso=="femmine"')
y_data = np.array([y_tot_m['Value'],y_tot_w['Value']])

In [None]:
fig = go.Figure()

# Two traces per gender: the line itself, then markers labelled with the
# values of each point.
for xs, ys, label, color in zip(x_data, y_data, labels, colors):
    fig.add_trace(go.Scatter(
        x=xs, y=ys, mode='lines', name=label,
        line=dict(color=color), connectgaps=True,
    ))
    fig.add_trace(go.Scatter(
        x=xs, y=ys, mode='markers+text',
        marker=dict(color=color),
        text=ys, textposition="bottom center",
    ))

# Styling shared by both axes.
axis_style = dict(
    showline=True, showticklabels=True, ticks='outside',
    linecolor='rgb(204, 204, 204)', linewidth=2,
)

# Gender names written next to the lines, positioned 1.5 above the third
# value of the first series and 1.5 below the third value of the second.
# (Title and source annotations are currently not added to the figure.)
label_offsets = (1.5, -1.5)
annotations = [
    dict(
        text=label, showarrow=False,
        xref='paper', x=0.3, y=series[2] + offset,
        xanchor='right', yanchor='middle',
        font=dict(family="Bahnschrift",size=16,color=color),
    )
    for label, color, series, offset in zip(labels, colors, y_data, label_offsets)
]

fig.update_layout(
    xaxis_title="year",
    yaxis_title="€/h",
    xaxis=dict(dtick=1, **axis_style),
    # y axis starts at zero and leaves 50% headroom above the largest value
    yaxis=dict(dtick=5, range=[0, max(df_sex_tot['Value']*1.5)], **axis_style),
    showlegend=False,
    plot_bgcolor='white',
    font=dict(family="Bahnschrift",size=10,color="grey"),
    annotations=annotations,
)

fig.show()

fig.write_image(f"{figureOutputFolder}/line_MF_gap.svg")

### Plot donut charts

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Slice labels, one per value band below.
labels = ["<11", "11..13", "13..15", "15..17", ">17"]

gender_i = ['maschi','femmine','totale']   # values of the 'Sesso' column
gender_e = ['male','female','tot']         # text shown in the donut centre
band = [1,11,13,15,17,100]                 # band edges: [lower, upper)

# Keep the rows whose ITTER107 code contains at least five characters
# (presumably the province-level codes - TODO confirm).
df_sex_province = df_territory[df_territory['ITTER107'].str.contains('.{5}')]

# Same palette for every chart, reversed once up front.
colors_donut=['#4dac26', '#b8e186', '#fefee9', '#f1b6da', '#d01c8b'][::-1]

for year in range (2014,2018):
    for g, (gender_it, gender_en) in enumerate(zip(gender_i, gender_e)):
        tmp_gen = df_sex_province.query(f'Sesso=="{gender_it}" & TIME=={year}')

        # Number of rows falling in each band.
        tmp_len = [
            len(tmp_gen.query(f'Value>={lower} & Value<{upper}'))
            for lower, upper in zip(band[:-1], band[1:])
        ]

        # Gender name as a large annotation on the figure.
        layout = go.Layout(
            annotations=[
                dict(font=dict(size=50), showarrow=False, text=f"{gender_en}")
            ]
        )

        donut = go.Pie(labels=labels, values=tmp_len, hole=.5,
                       sort=False, direction='clockwise')
        fig = go.Figure(data=[donut], layout=layout)

        fig.update_traces(
            textfont_size=30,
            marker=dict(colors=colors_donut,
                        line=dict(color='#000000', width=0.1)),
        )

        fig.update_layout(showlegend=False, width=500, height=500,
                          margin=dict(l=20, r=20, t=20, b=20))

        # Only the first chart (2014, first gender) is displayed inline.
        if year==2014 and g==0:
            fig.show()

        # fig.write_image(f"{figureOutputFolder}/donut_chart/{gender_e[g]}_{year}.svg")
        # fig.write_image(f"{figureOutputFolder}/donut_chart/{gender_e[g]}_{year}.png")

        #         # Export the figure
        # if exportFigure:
        #     fig.write_image(f"{figureOutputFolder}/question 2/{gender_e[g]}_{year}.svg")
        #     fig.write_image(f"{figureOutputFolder}/question 2/{gender_e[g]}_{year}.png", 
        #         width=outputWidthImage, height=outputHeightImage)

### Plot gender map

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import geopandas as gpd
from geopandas import GeoDataFrame

In [None]:
# Province boundary shapes, keyed "map_<year>".
map_df = {}

# 2014 has its own shapefile.
# NOTE(review): its names are not normalised here; confirm they already match
# the 'Territorio' labels they are later joined on.
map_df["map_2014"] = gpd.read_file(f'{dataFolderName}/province_shapes/Prov01012014_g/Prov01012014_g_WGS84.shp')

# Names in the 2017 shapefile that are rewritten to a different spelling.
province_renames = {
    "Aosta": "Valle d'Aosta/Vallée d'Aoste",
    "Forli'-Cesena": "Forlì-Cesena",
    "Massa Carrara": "Massa-Carrara",
    "Bolzano": "Bolzano/Bozen",
}

# 2015-2017 all use the same 2017 shapefile, so it is read from disk and
# renamed only once; every year then gets its own independent copy.
fp = f'{dataFolderName}/province_shapes/ProvCM01012017_g/ProvCM01012017_g_WGS84.shp'
shapes_2017 = gpd.read_file(fp)
shapes_2017["DEN_PCM"] = shapes_2017["DEN_PCM"].replace(province_renames)

for year in range(2015,2018):
    map_df[f"map_{year}"] = shapes_2017.copy()

In [None]:
# consider only province data
df_sex_province = df_sex[df_sex['ITTER107'].str.contains('.{5}')]

df_sex_province_year = {}
for year in range(2014,2018):
    temp = df_sex_province.query(f'TIME=={year}')
    temp_f = temp.query('Sesso=="maschi"').set_index('ITTER107').drop('Sesso',axis=1)
    temp_d = temp.query('Sesso=="femmine"').set_index('ITTER107')
    temp_f['Value'] = [temp_f['Value'][prov]-temp_d['Value'][prov] for prov in temp['ITTER107'].drop_duplicates()]
    df_sex_province_year[f'group_{year}'] = temp_f

del temp, temp_d, temp_f
# df_sex_province_year["group_2014"]

In [None]:
# merge data with map shapes
merged = {}
for year in range(2014,2018):
    merged[f'merged_{year}'] = df_sex_province_year[f'group_{year}'].set_index('Territorio').join(
        map_df[f'map_{year}'].set_index('DEN_PROV' if year==2014 else 'DEN_PCM'))
    # if year==2015: print(merged[f'merged_{year}'][merged[f'merged_{year}'].isnull().any(axis=1)].drop_duplicates())

In [None]:
# Turn the joined tables into GeoDataFrames and find the largest absolute
# value over all years, so that one symmetric colour scale fits every year.
vmax = 0
for year in range(2014,2018):
    key = f'merged_{year}'
    merged[key] = GeoDataFrame(merged[key])
    vmax = max(vmax, round(merged[key]['Value'].abs().max()))

vmin, vcenter = -vmax, 0           # set the range for the choropleth

variable = 'Value'
color_map = 'RdBu'

fig, ax = plt.subplots(1, figsize=(10, 6))  # create figure and axes for Matplotlib
ax.axis('off')  # remove the axis

# Diverging normalisation centred on zero.
norm = TwoSlopeNorm(vmin=vmin, vcenter=vcenter, vmax=vmax)

# Only the 2015 map is drawn here.
merged['merged_2015'].plot(column=variable, cmap=color_map, linewidth=0.01, ax=ax, edgecolor='0.8', norm=norm)

# The mappable is not attached to any Axes, so the Axes to take space from
# must be named explicitly; recent Matplotlib releases raise an error
# otherwise.
sm = plt.cm.ScalarMappable(norm=norm, cmap=color_map)
cbar = fig.colorbar(sm, ax=ax)                              # add the colorbar to the figure

# fig.savefig('map_export.png', dpi=300)        #saving our map as .png file.

## Question 3
What are the sectors for which the salaries in private companies are highest in Italy?

In [None]:
import plotly.graph_objects as go
from ipywidgets import interact

In [None]:
# Sector rows aggregated over every other dimension (all four set to
# "totale"), reduced to sector name, year and value.
# .copy() makes the result an independent frame: a later cell rewrites its
# 'Ateco 2007' labels in place, which should not be done on a slice of
# df_sectors.
df_sectors_tot = df_sectors.query('Sesso=="totale" & `Classe di età`=="totale" & \
                              `Classe di dipendenti`=="totale" & `Qualifica contrattuale`=="totale"'
                              )[['Ateco 2007','TIME','Value']].copy()

In [None]:
# Sector names that are replaced by a version containing an HTML line break.
# The two lists are parallel: br_names[k] replaces long_names[k].
long_names = [
    'Supply of electricity, gas, steam and air conditioning',
    'Activities of accommodation and catering services',
    'Other services activities',
    'Financial and insurance activities',
    'Rental, travel agencies, business support services'
    ]
br_names = [
    'Supply of electricity, gas,'+'<br>'+'steam and air conditioning',
    'Activities of accommodation'+'<br>'+'and catering services',
    'Other services activities',
    'Financial and insurance'+'<br>'+'activities',
    'Rental, travel agencies,'+'<br>'+'business support services'
    ]

# Swap every long name for its counterpart.
for long_name, wrapped_name in zip(long_names, br_names):
    matches = df_sectors_tot['Ateco 2007']==long_name
    df_sectors_tot.loc[matches,"Ateco 2007"] = wrapped_name

In [None]:
colors_palette = ['#003a2b','#249e89','#f5f5f5','#d86e58','#6a0000']
# One colour per bar: grey for the first three, highlight for the last two.
colors_barchart = ['#c0c0c0','#c0c0c0','#c0c0c0',colors_palette[1],colors_palette[1]]

howManyEls=2    # how many lowest / highest sectors are shown individually
val_x_axis = max(df_sectors_tot['Value'])   # common x range for all years

# DataFrame.append was removed in pandas 2.0, so every year's table is built
# with pd.concat and the yearly tables are combined once at the end.
yearly_frames = []

for year in range(2014,2018,1):
    tmp = df_sectors_tot.query(f'TIME=={year}').sort_values(by='Value')

    # Every sector that is neither in the lowest nor in the highest group
    # is collapsed into a single "Others" row holding their average value.
    middle = tmp.head(-howManyEls).tail(-howManyEls)
    others = {'Ateco 2007':['Others'],'TIME':[year],'Value':[round(np.average(middle["Value"]),2)]}
    tmp_others = pd.DataFrame(others,columns=['Ateco 2007','TIME','Value'])

    # Row order: lowest sectors, "Others", highest sectors.
    year_df = pd.concat([tmp.head(howManyEls), tmp_others, tmp.tail(howManyEls)])
    yearly_frames.append(year_df)

    fig = px.bar(year_df, x="Value", y="Ateco 2007", text="Value")

    fig.update_traces(texttemplate='%{text:.2f} ', textposition='inside')

    fig.update_traces(marker_color= colors_barchart)

    fig.update_layout(
        title_text=f'{year}',
        yaxis_title=None,
        xaxis_title="€/h",
        xaxis=dict(showline=True, showticklabels=True, ticks='outside',
            linecolor='rgb(204, 204, 204)', linewidth=2, dtick = 5,
            range = [0, val_x_axis]),
        yaxis=dict( showgrid=False, showline=False, ),
        paper_bgcolor='white',
        plot_bgcolor='white',
        showlegend=False,
        width=800, height=350
        )

    fig.show()

# All years together, as the original cell left behind in `df_new`.
df_new = pd.concat(yearly_frames)

In [None]:
howManyEls=2    # how many lowest / highest sectors are shown individually

# DataFrame.append was removed in pandas 2.0: collect the pieces in a list
# and combine them with a single pd.concat.
pieces = []

for year in range(2014,2018,1):
    tmp = df_sectors_tot.query(f'TIME=={year}').sort_values(by='Value')

    # Every sector that is neither in the lowest nor in the highest group
    # is collapsed into a single row holding their average value.
    # NOTE(review): the label 'Others<br>Acca<tab>' looks like a leftover
    # placeholder - confirm the intended text.
    middle = tmp.head(-howManyEls).tail(-howManyEls)
    others = {'Ateco 2007':['Others<br>Acca<tab>'],'TIME':[year],'Value':[round(np.average(middle["Value"]),2)]}
    tmp_others = pd.DataFrame(others,columns=['Ateco 2007','TIME','Value'])

    pieces.extend([tmp.head(howManyEls), tmp_others, tmp.tail(howManyEls)])

df_new = pd.concat(pieces)
df_new = df_new.sort_values(by='Value').reset_index()

# One animation frame per year; the x range leaves 10% headroom.
fig = px.bar(df_new, x="Value", y="Ateco 2007", text="Value",
    animation_frame="TIME", range_x=[0,df_new['Value'].max()*1.1])

fig.update_traces(texttemplate='%{text:.2f} ', textposition='inside')

fig.update_layout(
    xaxis=dict( showgrid=False, showline=False ),
    yaxis=dict( showgrid=False, showline=False, ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    )

# Frame duration used by the first button of the animation menu.
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1000
# fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 1

# write_html produces an HTML document, so the file gets an .html extension
# (it was previously saved under a misleading .svg name).
fig.write_html(f"{figureOutputFolder}/barchart.html")

fig.show("notebook")