	
# Population and employment

## 4. Datos relativos a población y empleo

Indicadores auxiliares clave: se utilizan para derivar agregados principales como el PIB per cápita, la productividad y el coste laboral unitario.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import eurostat
import plotly.express as px # Graphics
from MyFunctions import EUcountries
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType 
from pyspark.sql import functions as func

In [2]:
# Start (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("PopulationAndEmpl")
    .master("local")
    .getOrCreate()
)

### Exploración DF

In [3]:
# Load the Eurostat dataset 'namq_10_pe' (population and employment) as a
# pandas DataFrame and print its structural summary.
# `eurostat` is already imported in the notebook's import cell, so it is not
# re-imported here.
PopEmpl = eurostat.get_data_df('namq_10_pe')
PopEmpl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264 entries, 0 to 1263
Columns: 191 entries, unit to 1975Q1
dtypes: float64(187), object(4)
memory usage: 1.8+ MB


In [4]:
# Overview of the freshly loaded dataset: size, dtypes and missing values.
n_rows, n_cols = PopEmpl.shape
missing_mask = PopEmpl.isnull()

print("Dataset actualizado a: 29 de Octubre 2021")
print("Nº de filas / entradas: ", n_rows)
print("Nº de columnas / variables: ", n_cols)
print("Tipo de datos para cada variable: ")
display(PopEmpl.dtypes)
print("Total valores nulos: ", missing_mask.sum().sum())
print("Variables con valores nulos: ")
# Missing-value counts for two individual quarter columns.
print("  2021Q3: ", missing_mask['2021Q3'].sum(), "valores nulos")
print("  2018Q3: ", missing_mask['2018Q3'].sum(),  "valores nulos")

Dataset actualizado a: 29 de Octubre 2021
Nº de filas / entradas:  1264
Nº de columnas / variables:  191
Tipo de datos para cada variable: 


unit         object
s_adj        object
na_item      object
geo\time     object
2021Q3      float64
             ...   
1976Q1      float64
1975Q4      float64
1975Q3      float64
1975Q2      float64
1975Q1      float64
Length: 191, dtype: object

Total valores nulos:  109075
Variables con valores nulos: 
  2021Q3:  1204 valores nulos
  2018Q3:  66 valores nulos


In [5]:
# Peek at 5 randomly chosen rows (no random_state is passed, so the rows differ between runs).
PopEmpl.sample(5)

Unnamed: 0,unit,s_adj,na_item,geo\time,2021Q3,2021Q2,2021Q1,2020Q4,2020Q3,2020Q2,...,1977Q2,1977Q1,1976Q4,1976Q3,1976Q2,1976Q1,1975Q4,1975Q3,1975Q2,1975Q1
371,PCH_SM_PER,NSA,SAL_DC,BG,,2.2,-1.8,-2.5,-3.8,-4.6,...,,,,,,,,,,
1083,THS_PER,SCA,EMP_DC,BE,,4963.9,4926.7,4912.3,4882.5,4873.7,...,,,,,,,,,,
996,THS_PER,NSA,SELF_NC,CH,,441.33,438.01,441.03,442.72,433.18,...,,,,,,,,,,
107,PCH_PRE_PER,SCA,EMP_NC,SI,,0.3,0.6,0.5,0.5,-2.5,...,,,,,,,,,,
73,PCH_PRE_PER,SCA,EMP_DC,IE,,3.7,-1.8,0.4,3.3,-6.0,...,,,,,,,,,,


In [6]:
# 2021Q3 has too many null values, so the column is removed.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps the cell re-runnable: a second execution is a no-op rather than a KeyError.
PopEmpl = PopEmpl.drop(columns=['2021Q3'], errors='ignore')

In [7]:
# Print, for each coded column, its description, the distinct codes it holds
# and its dtype label, followed by a separator line.
separator = '--------------------------------------------------------------------'
described_columns = (
    ('Unit: Unidad de medida', 'unit'),
    ('s_adj: Ajuste estacional', 's_adj'),
    ('na_item: Indicador de cuentas nacionales', 'na_item'),
)

print('Descripción Variables: ')
for header, column_name in described_columns:
    print(header)
    print(PopEmpl[column_name].unique())
    print('Tipo: Object')
    print(separator)


Descripción Variables: 
Unit: Unidad de medida
['PCH_PRE_PER' 'PCH_SM_PER' 'THS_PER']
Tipo: Object
--------------------------------------------------------------------
s_adj: Ajuste estacional
['SA' 'SCA' 'CA' 'NSA']
Tipo: Object
--------------------------------------------------------------------
na_item: Indicador de cuentas nacionales
['EMP_DC' 'EMP_NC' 'POP_NC' 'SAL_DC' 'SAL_NC' 'SELF_DC' 'SELF_NC']
Tipo: Object
--------------------------------------------------------------------


### Limpieza y preparación

**FILTROS**:

* Primer filtro: trimestres. En este caso dejaré los datos del año 2018 en adelante.

In [8]:
# First filter: keep only the first 18 columns, i.e. the 4 identifier columns
# plus the 14 most recent quarters (2021Q2 back to 2018Q1).
# This relies on the quarter columns being ordered from newest to oldest.
# A positional slice states the intent directly; .copy() detaches the result
# from the original frame so later assignments do not act on a view.
PopEmpl = PopEmpl.iloc[:, :18].copy()

In [9]:
# Rename only the geo column (4th position) to 'country'.
# The quarter labels are left untouched and then checked against the expected
# layout, so a change in the upstream column order fails loudly instead of
# silently mislabelling the quarters.
expected_columns = ['unit', 's_adj', 'na_item', 'country', '2021Q2', '2021Q1', '2020Q4', '2020Q3', '2020Q2', '2020Q1', '2019Q4',
                    '2019Q3', '2019Q2', '2019Q1', '2018Q4', '2018Q3', '2018Q2', '2018Q1']

PopEmpl = PopEmpl.rename(columns={PopEmpl.columns[3]: 'country'})

if list(PopEmpl.columns) != expected_columns:
    raise ValueError(
        "Unexpected column layout after renaming: "
        f"{list(PopEmpl.columns)} (expected {expected_columns})"
    )

Segundo filtro: filtrar por los países miembros de la UE.

In [10]:
# Second filter: keep only the rows whose country code belongs to an EU member state.
# The list of codes comes from EUcountries(), imported from MyFunctions.
eu_codes = EUcountries().Code
PopEmpl = PopEmpl[PopEmpl['country'].isin(eu_codes)]

# Check that all 27 countries are present
PopEmpl['country'].unique()

array(['CZ', 'EL', 'FR', 'MT', 'PL', 'PT', 'IT', 'AT', 'BE', 'BG', 'CY',
       'DE', 'DK', 'EE', 'ES', 'FI', 'HR', 'HU', 'IE', 'LT', 'LU', 'LV',
       'NL', 'RO', 'SE', 'SI', 'SK'], dtype=object)

In [11]:
# Peek at 4 randomly chosen rows of the filtered frame (no random_state, so rows differ between runs).
PopEmpl.sample(4)

Unnamed: 0,unit,s_adj,na_item,country,2021Q2,2021Q1,2020Q4,2020Q3,2020Q2,2020Q1,2019Q4,2019Q3,2019Q2,2019Q1,2018Q4,2018Q3,2018Q2,2018Q1
1261,THS_PER,SCA,SELF_NC,SI,200.02,200.2,199.8,199.95,196.78,199.21,198.86,198.05,197.4,196.49,195.62,194.8,194.0,193.33
118,PCH_PRE_PER,SCA,POP_NC,FI,0.1,0.0,0.1,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
299,PCH_SM_PER,NSA,EMP_NC,BE,1.8,0.4,-0.2,-0.5,-0.3,0.9,1.6,1.6,1.5,1.6,1.4,1.4,1.4,1.7
614,PCH_SM_PER,SCA,EMP_NC,LV,1.0,-6.5,-3.0,-2.7,-1.5,-0.2,0.4,-0.2,-0.4,0.7,0.8,1.8,1.9,1.6


In [12]:
# Total number of missing cells in the filtered frame
missing_per_column = PopEmpl.isna().sum()
missing_per_column.sum()

84

In [13]:
# Sample of rows that contain at least one null value.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally,
# so `.any(1)` raises a TypeError there.
PopEmpl[PopEmpl.isnull().any(axis=1)].sample(5)

# After several samples, the null rows coincide with na_item code POP_NC
# (Total population national concept) and s_adj code SA
# (Seasonally adjusted data, not calendar adjusted data).

Unnamed: 0,unit,s_adj,na_item,country,2021Q2,2021Q1,2020Q4,2020Q3,2020Q2,2020Q1,2019Q4,2019Q3,2019Q2,2019Q1,2018Q4,2018Q3,2018Q2,2018Q1
20,PCH_PRE_PER,SA,POP_NC,IT,,,,,,,,,,,,,,
1046,THS_PER,SA,POP_NC,IT,,,,,,,,,,,,,,
1048,THS_PER,SA,POP_NC,PL,,,,,,,,,,,,,,
22,PCH_PRE_PER,SA,POP_NC,PL,,,,,,,,,,,,,,
533,PCH_SM_PER,SA,POP_NC,IT,,,,,,,,,,,,,,


In [14]:
# Load the code dictionaries for PopEmpl from the Eurostat bulk download,
# one per coded column.
unit, s_adj, na_item, geo = (
    eurostat.get_dic(dic_code)
    for dic_code in ('unit', 's_adj', 'na_item', 'geo')
)

In [15]:
# Broadcast variables are used because only the codes present in the
# dataframe are of interest (author's rationale).
spark_context = spark.sparkContext
broadcast_unit = spark_context.broadcast(unit)
broadcast_s_adj = spark_context.broadcast(s_adj)
broadcast_na_item = spark_context.broadcast(na_item)
broadcast_geo = spark_context.broadcast(geo)

In [16]:
# Convert the cleaned pandas DataFrame into a Spark DataFrame.
PopAndEmp = spark.createDataFrame(PopEmpl)

In [17]:
# Show the column names and types of the Spark DataFrame.
PopAndEmp.printSchema()

root
 |-- unit: string (nullable = true)
 |-- s_adj: string (nullable = true)
 |-- na_item: string (nullable = true)
 |-- country: string (nullable = true)
 |-- 2021Q2: double (nullable = true)
 |-- 2021Q1: double (nullable = true)
 |-- 2020Q4: double (nullable = true)
 |-- 2020Q3: double (nullable = true)
 |-- 2020Q2: double (nullable = true)
 |-- 2020Q1: double (nullable = true)
 |-- 2019Q4: double (nullable = true)
 |-- 2019Q3: double (nullable = true)
 |-- 2019Q2: double (nullable = true)
 |-- 2019Q1: double (nullable = true)
 |-- 2018Q4: double (nullable = true)
 |-- 2018Q3: double (nullable = true)
 |-- 2018Q2: double (nullable = true)
 |-- 2018Q1: double (nullable = true)



In [18]:
# For each coded column: count how many rows carry each code, then attach the
# human-readable description of the code taken from the broadcast dictionaries.
# (`func` is already imported in the notebook's import cell.)
UnitText = PopAndEmp.groupBy("unit").count()
s_adjText = PopAndEmp.groupBy("s_adj").count()
na_itemText = PopAndEmp.groupBy("na_item").count()
geoText = PopAndEmp.groupBy("country").count()


def _make_lookup(broadcast_dic):
    """Build a code -> description lookup backed by a broadcast dictionary.

    A single factory replaces the four copy-pasted lookup functions: each
    returned function closes over its own broadcast variable.

    Args:
        broadcast_dic: Spark broadcast variable whose ``.value`` is indexed by code.

    Returns:
        A one-argument function returning ``broadcast_dic.value[code]``. As with
        plain indexing, an unknown code raises (``KeyError`` for a dict).
    """
    def lookup(code):
        return broadcast_dic.value[code]
    return lookup


# Plain-Python lookups, kept under their original names.
lookupUnit = _make_lookup(broadcast_unit)
lookupS_adj = _make_lookup(broadcast_s_adj)
lookupNa_item = _make_lookup(broadcast_na_item)
lookupGeo = _make_lookup(broadcast_geo)

# Spark UDF wrappers around the lookups.
lookupUnitUDF = func.udf(lookupUnit)
lookupS_adjUDF = func.udf(lookupS_adj)
lookupNa_itemUDF = func.udf(lookupNa_item)
lookupGeoUDF = func.udf(lookupGeo)

# Add the description column to each count table.
UnitDesc = UnitText.withColumn("UnitDesc", lookupUnitUDF(func.col("unit")))
s_adjDesc = s_adjText.withColumn("s_adjDesc", lookupS_adjUDF(func.col("s_adj")))
na_itemDesc = na_itemText.withColumn("na_itemDesc", lookupNa_itemUDF(func.col("na_item")))
# Column referenced as "country", matching the schema's spelling.
geoDesc = geoText.withColumn("CountryName", lookupGeoUDF(func.col("country")))

print("----------------------------------------------")
print("Codes in Population and Employmnet DataFrame")
print("----------------------------------------------")

UnitDesc.show(truncate=False)
s_adjDesc.show(truncate=False)
na_itemDesc.show(truncate=False)
geoDesc.show(27, truncate=False)

----------------------------------------------
Codes in Population and Employmnet DataFrame
----------------------------------------------
+-----------+-----+-----------------------------------------------------------------------------+
|unit       |count|UnitDesc                                                                     |
+-----------+-----+-----------------------------------------------------------------------------+
|PCH_SM_PER |398  |Percentage change compared to same period in previous year (based on persons)|
|THS_PER    |398  |Thousand persons                                                             |
|PCH_PRE_PER|188  |Percentage change on previous period (based on persons)                      |
+-----------+-----+-----------------------------------------------------------------------------+

+-----+-----+-----------------------------------------------------------------------------+
|s_adj|count|s_adjDesc                                                            

In [19]:
# Keep only unadjusted data (NSA: neither seasonally adjusted nor calendar
# adjusted) expressed in thousand persons (THS_PER), then remove the s_adj and
# unit columns.
s_adjconcept = ['NSA']
unitconcept = ['THS_PER']

is_unadjusted = PopAndEmp.s_adj.isin(s_adjconcept)
is_thousand_persons = PopAndEmp.unit.isin(unitconcept)

PopAndEmp = PopAndEmp.filter(is_unadjusted & is_thousand_persons).drop('s_adj', 'unit')

In [20]:
# Check the number of distinct countries left after filtering
distinct_countries = PopAndEmp.select('Country').distinct()
print(distinct_countries.count())

27


In [21]:
# Show the schema again after the filter step above.
PopAndEmp.printSchema()

root
 |-- na_item: string (nullable = true)
 |-- country: string (nullable = true)
 |-- 2021Q2: double (nullable = true)
 |-- 2021Q1: double (nullable = true)
 |-- 2020Q4: double (nullable = true)
 |-- 2020Q3: double (nullable = true)
 |-- 2020Q2: double (nullable = true)
 |-- 2020Q1: double (nullable = true)
 |-- 2019Q4: double (nullable = true)
 |-- 2019Q3: double (nullable = true)
 |-- 2019Q2: double (nullable = true)
 |-- 2019Q1: double (nullable = true)
 |-- 2018Q4: double (nullable = true)
 |-- 2018Q3: double (nullable = true)
 |-- 2018Q2: double (nullable = true)
 |-- 2018Q1: double (nullable = true)



In [22]:
# Unpivot the wide quarter columns into long format: one row per
# (na_item, country, quarter), with the quarter label in `Date` and the value
# in `cant`.
# The stack() expression is generated from the DataFrame's own columns, so the
# column count and the quarter list cannot drift apart. Column references are
# backtick-quoted because the names start with a digit.
id_columns = ['na_item', 'country']
quarter_columns = [c for c in PopAndEmp.columns if c not in id_columns]

stack_arguments = ', '.join(f"'{quarter}', `{quarter}`" for quarter in quarter_columns)
stack_expression = f"stack({len(quarter_columns)}, {stack_arguments}) as (Date, cant)"

PopAndEmp = PopAndEmp.selectExpr(*id_columns, stack_expression)

In [23]:
# Show the schema of the reshaped (long-format) DataFrame.
PopAndEmp.printSchema()

root
 |-- na_item: string (nullable = true)
 |-- country: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- cant: double (nullable = true)



In [24]:
# Display summary statistics for every column of the long-format DataFrame.
PopAndEmp.summary().show()

+-------+-------+-------+------+------------------+
|summary|na_item|country|  Date|              cant|
+-------+-------+-------+------+------------------+
|  count|   2646|   2646|  2646|              2646|
|   mean|   null|   null|  null| 6750.356636432363|
| stddev|   null|   null|  null|12111.722871965178|
|    min| EMP_DC|     AT|2018Q1|             21.39|
|    25%|   null|   null|  null|            655.07|
|    50%|   null|   null|  null|            2633.2|
|    75%|   null|   null|  null|           5458.74|
|    max|SELF_NC|     SK|2021Q2|           83194.0|
+-------+-------+-------+------+------------------+



In [25]:
# Count, per column, the rows whose value is NaN or null.
null_or_nan_counts = [
    func.count(
        func.when(func.isnan(column_name) | func.col(column_name).isNull(), column_name)
    ).alias(column_name)
    for column_name in PopAndEmp.columns
]
PopAndEmp.select(null_or_nan_counts).show()

+-------+-------+----+----+
|na_item|country|Date|cant|
+-------+-------+----+----+
|      0|      0|   0|   0|
+-------+-------+----+----+



In [26]:
# Attach the indicator description and the country name to every row, then
# sort by country and date.
PopAndEmp_F = (
    PopAndEmp
    .withColumn("Indicador", lookupNa_itemUDF(func.col("na_item")))
    .withColumn("CountryName", lookupGeoUDF(func.col("Country")))
    .orderBy('Country', "date")
)

In [27]:
# Display the first rows of the enriched DataFrame.
PopAndEmp_F.show()

+-------+-------+------+-------+--------------------+-----------+
|na_item|country|  Date|   cant|           Indicador|CountryName|
+-------+-------+------+-------+--------------------+-----------+
|SELF_DC|     AT|2018Q1| 540.75|Self-employed dom...|    Austria|
| EMP_DC|     AT|2018Q1|4404.95|Total employment ...|    Austria|
|SELF_NC|     AT|2018Q1| 540.75|Self-employed nat...|    Austria|
| POP_NC|     AT|2018Q1|8824.65|Total population ...|    Austria|
| SAL_DC|     AT|2018Q1|3864.19|Employees domesti...|    Austria|
| EMP_NC|     AT|2018Q1|4296.55|Total employment ...|    Austria|
| SAL_NC|     AT|2018Q1|3755.79|Employees nationa...|    Austria|
| SAL_DC|     AT|2018Q2|3933.31|Employees domesti...|    Austria|
|SELF_DC|     AT|2018Q2| 545.38|Self-employed dom...|    Austria|
| SAL_NC|     AT|2018Q2|3835.72|Employees nationa...|    Austria|
| EMP_NC|     AT|2018Q2|4381.09|Total employment ...|    Austria|
| POP_NC|     AT|2018Q2|8830.12|Total population ...|    Austria|
|SELF_NC| 

In [28]:
# Shorten Eurostat's long label for Germany to plain "Germany".
# The result of replace() is assigned back to the column: calling
# replace(..., inplace=True) on the Series returned by
# Germanie_Issue['CountryName'] is chained assignment, which does not update
# the DataFrame under pandas Copy-on-Write.
Germanie_Issue = PopAndEmp_F.toPandas()
Germanie_Issue['CountryName'] = Germanie_Issue['CountryName'].replace(
    to_replace=['Germany (until 1990 former territory of the FRG)'],
    value='Germany',
)
PopAndEmp_F = spark.createDataFrame(Germanie_Issue)

In [29]:
# Save the data to HDFS as parquet, overwriting any previous output.
output_path = "hdfs://localhost:9000//TFM_CEE/output/PopAndEmp.parquet"
parquet_writer = PopAndEmp_F.write.mode('overwrite')
parquet_writer.parquet(output_path)