# Employment A*10 industry breakdowns

## 6. Datos relativos a empleo por actividad económica




In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import eurostat
import plotly.express as px # Graphics
from MyFunctions import EUcountries
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType 
from pyspark.sql import functions as func

In [2]:
# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("EmploymentById")
    .master("local")
    .getOrCreate()
)

### Exploración DF

In [3]:
# Employment A*10 industry breakdowns:
# download the Eurostat table 'namq_10_a10_e' into a pandas DataFrame.
emplInd = eurostat.get_data_df('namq_10_a10_e')
# Summarise index range, number of columns, dtypes and memory usage.
emplInd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26992 entries, 0 to 26991
Columns: 192 entries, unit to 1975Q1
dtypes: float64(187), object(5)
memory usage: 39.5+ MB


In [4]:
# Quick profile of the raw table: size, dtypes and missing values.
n_rows, n_cols = emplInd.shape
null_mask = emplInd.isna()  # computed once and reused for every null count below

print("Dataset actualizado a: 29 de Octubre 2021")
print("Nº de filas / entradas: ", n_rows)
print("Nº de columnas / variables: ", n_cols)
print("Tipo de datos para cada variable: ")
display(emplInd.dtypes)
print("Total valores nulos: ", null_mask.sum().sum())
print("Variables con valores nulos: ")
print("  2021Q3: ", null_mask['2021Q3'].sum(), "valores nulos")
print("  2018Q3: ", null_mask['2018Q3'].sum(),  "valores nulos")

Dataset actualizado a: 29 de Octubre 2021
Nº de filas / entradas:  26992
Nº de columnas / variables:  192
Tipo de datos para cada variable: 


unit         object
nace_r2      object
s_adj        object
na_item      object
geo\time     object
             ...   
1976Q1      float64
1975Q4      float64
1975Q3      float64
1975Q2      float64
1975Q1      float64
Length: 192, dtype: object

Total valores nulos:  2339218
Variables con valores nulos: 
  2021Q3:  25627 valores nulos
  2018Q3:  1551 valores nulos


In [5]:
# Show three random rows; the fixed seed makes the preview identical on every run.
emplInd.sample(3, random_state=42)

Unnamed: 0,unit,nace_r2,s_adj,na_item,geo\time,2021Q3,2021Q2,2021Q1,2020Q4,2020Q3,...,1977Q2,1977Q1,1976Q4,1976Q3,1976Q2,1976Q1,1975Q4,1975Q3,1975Q2,1975Q1
12814,PCH_SM_PER,F,NSA,SAL_DC,CZ,,4.3,1.1,1.3,0.1,...,,,,,,,,,,
14215,PCH_SM_PER,O-Q,NSA,EMP_DC,NO,,2.5,1.3,1.2,0.2,...,,,,,,,,,,
20230,PC_TOT_PER,O-Q,CA,EMP_DC,LV,,20.9,23.0,21.5,21.0,...,,,,,,,,,,


In [6]:
# 2021Q3 has too many null values, so the column is discarded.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
emplInd = emplInd.drop(columns=['2021Q3'], errors='ignore')

In [7]:
# Print, for every categorical variable, a short description and its distinct codes.
# Each entry is (column name, label shown to the reader).
variable_labels = [
    ('unit', 'Unit: Unidad de medida'),
    # The original label here was copy-pasted from s_adj ("Ajuste estacional");
    # nace_r2 holds the economic-activity codes (A, B-E, C, ...).
    ('nace_r2', 'nace_r2: Actividad económica (NACE Rev. 2)'),
    ('s_adj', 's_adj: Ajuste estacional'),
    ('na_item', 'na_item: Indicador de cuentas nacionales'),
]

print('Descripción Variables: ')
for column_name, label in variable_labels:
    print(label)
    print(emplInd[column_name].unique())
    print('Tipo: Object')
    print('--------------------------------------------------------------------')



Descripción Variables: 
Unit: Unidad de medida
['I15_HW' 'I15_JOB' 'I15_PER' 'PCH_PRE_HW' 'PCH_PRE_JOB' 'PCH_PRE_PER'
 'PCH_SM_HW' 'PCH_SM_JOB' 'PCH_SM_PER' 'PC_TOT_HW' 'PC_TOT_JOB'
 'PC_TOT_PER' 'THS_HW' 'THS_JOB' 'THS_PER']
Tipo: Object
--------------------------------------------------------------------
nace_r2: Ajuste estacional
['A' 'B-E' 'C' 'F' 'G-I' 'J' 'K' 'L' 'M_N' 'O-Q' 'R-U' 'TOTAL']
Tipo: Object
--------------------------------------------------------------------
s_adj: Ajuste estacional
['CA' 'NSA' 'SA' 'SCA']
Tipo: Object
--------------------------------------------------------------------
na_item: Indicador de cuentas nacionales
['EMP_DC' 'SAL_DC' 'SELF_DC']
Tipo: Object
--------------------------------------------------------------------


### Limpieza y preparación

**FILTROS**:

* Primer filtro: trimestres. En este caso dejaré solo los datos del año 2018 en adelante.



In [8]:
# First filter (quarters): keep only the first 19 columns, i.e. drop every column
# from position 19 onwards; the rename cell below names the kept columns
# (5 identifier columns + the quarters 2021Q2 ... 2018Q1).
# NOTE(review): the original comment justified the 2018 cut-off as the last year in
# which a country formally joined the EU — confirm that rationale.
emplInd.drop(columns=emplInd.columns[19:], inplace=True)


* Segundo filtro: filtrar por los países miembros de la UE.


In [9]:
# Rename only the geography column (shown as 'geo\time' in the dtypes listing) to 'country'.
# The quarter columns already carry their own labels, so they are left untouched:
# overwriting all 19 labels by position would silently mislabel the quarters if the
# downloaded table ever had a different column layout.
# The column is matched by its 'geo' prefix, which also makes the cell safe to re-run.
emplInd = emplInd.rename(
    columns=lambda name: 'country' if name.startswith('geo') else name
)

In [10]:
# Second filter: keep only the rows that belong to EU member states.
# EUcountries() is imported from the local MyFunctions module; its `Code`
# attribute is used as the collection of valid country codes.
eu_codes = EUcountries().Code
emplInd = emplInd.loc[emplInd['country'].isin(eu_codes)]

# Check that all 27 member states are present
emplInd.country.unique()

array(['FI', 'IT', 'LU', 'LV', 'AT', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE',
       'EL', 'ES', 'FR', 'HR', 'HU', 'IE', 'LT', 'MT', 'NL', 'PL', 'PT',
       'RO', 'SE', 'SI', 'SK', 'BE'], dtype=object)

In [11]:
# Total number of missing cells left after both filters
emplInd.isna().sum().sum()

6581

In [12]:
# Show a few of the rows that contain at least one null value.
# `axis` is passed by keyword: the positional form `any(1)` is rejected by newer pandas.
# The fixed seed keeps the displayed rows stable between runs.
emplInd[emplInd.isnull().any(axis=1)].sample(5, random_state=42)



Unnamed: 0,unit,nace_r2,s_adj,na_item,country,2021Q2,2021Q1,2020Q4,2020Q3,2020Q2,2020Q1,2019Q4,2019Q3,2019Q2,2019Q1,2018Q4,2018Q3,2018Q2,2018Q1
22472,THS_HW,K,SA,EMP_DC,CZ,,,,,,,,,,,,,,
16420,PC_TOT_HW,K,SA,EMP_DC,CZ,,,,,,,,,,,,,,
15444,PC_TOT_HW,C,NSA,SELF_DC,BE,,,,,,,,,,,,,,
22622,THS_HW,L,NSA,EMP_DC,MT,,,,,,,,,,,,,,
22708,THS_HW,L,SA,EMP_DC,MT,,,,,,,,,,,,,,


In [13]:
# Load the Eurostat code dictionaries (code -> description) for each
# categorical variable of emplInd, in one pass over the dictionary names.
unit, nace_r2, s_adj, na_item, geo = (
    eurostat.get_dic(dic_name)
    for dic_name in ('unit', 'nace_r2', 's_adj', 'na_item', 'geo')
)

In [14]:
# Broadcast the code dictionaries; the lookup UDFs defined later read them through
# `.value`, and only the codes present in the DataFrame are ever looked up.
(
    broadcast_unit,
    broadcast_nace_r2,
    broadcast_s_adj,
    broadcast_na_item,
    broadcast_geo,
) = (
    spark.sparkContext.broadcast(code_dict)
    for code_dict in (unit, nace_r2, s_adj, na_item, geo)
)

In [15]:
# From pandas to Spark: convert the filtered pandas frame into a Spark DataFrame
empByInd = spark.createDataFrame(emplInd)
# Print the inferred Spark schema (column names and types)
empByInd.printSchema()

root
 |-- unit: string (nullable = true)
 |-- nace_r2: string (nullable = true)
 |-- s_adj: string (nullable = true)
 |-- na_item: string (nullable = true)
 |-- country: string (nullable = true)
 |-- 2021Q2: double (nullable = true)
 |-- 2021Q1: double (nullable = true)
 |-- 2020Q4: double (nullable = true)
 |-- 2020Q3: double (nullable = true)
 |-- 2020Q2: double (nullable = true)
 |-- 2020Q1: double (nullable = true)
 |-- 2019Q4: double (nullable = true)
 |-- 2019Q3: double (nullable = true)
 |-- 2019Q2: double (nullable = true)
 |-- 2019Q1: double (nullable = true)
 |-- 2018Q4: double (nullable = true)
 |-- 2018Q3: double (nullable = true)
 |-- 2018Q2: double (nullable = true)
 |-- 2018Q1: double (nullable = true)



In [16]:
# `func` is already imported in the first cell; repeated so this cell can run on its own.
from pyspark.sql import functions as func


def _make_lookup(broadcast_dict):
    """Build a code -> description lookup bound to one broadcast dictionary.

    Parameters
    ----------
    broadcast_dict
        Spark broadcast variable whose ``.value`` is indexed by code.

    Returns
    -------
    callable
        ``lookup(code)`` returning ``broadcast_dict.value[code]``. An unknown code
        raises, exactly as direct indexing does.
    """
    def lookup(code):
        return broadcast_dict.value[code]

    return lookup


# Count how many times each code appears in the DataFrame, per categorical variable.
UnitText = empByInd.groupBy("unit").count()
nace_r2Text = empByInd.groupBy("nace_r2").count()
s_adjText = empByInd.groupBy("s_adj").count()
na_itemText = empByInd.groupBy("na_item").count()
geoText = empByInd.groupBy("country").count()

# One lookup per broadcast dictionary, all produced by the same factory
# (replaces five copy-pasted functions; the public names are unchanged).
lookupUnit = _make_lookup(broadcast_unit)
lookupNace_r2 = _make_lookup(broadcast_nace_r2)
lookupS_adj = _make_lookup(broadcast_s_adj)
lookupNa_item = _make_lookup(broadcast_na_item)
lookupGeo = _make_lookup(broadcast_geo)

lookupUnitUDF = func.udf(lookupUnit)
lookupNace_r2UDF = func.udf(lookupNace_r2)
lookupS_adjUDF = func.udf(lookupS_adj)
lookupNa_itemUDF = func.udf(lookupNa_item)
lookupGeoUDF = func.udf(lookupGeo)

# Add the description column next to each code/count pair.
UnitDesc = UnitText.withColumn("UnitDesc", lookupUnitUDF(func.col("unit")))
# The description column was previously misnamed "UnitDesc" for nace_r2.
nace_r2Desc = nace_r2Text.withColumn("nace_r2Desc", lookupNace_r2UDF(func.col("nace_r2")))
s_adjDesc = s_adjText.withColumn("s_adjDesc", lookupS_adjUDF(func.col("s_adj")))
na_itemDesc = na_itemText.withColumn("na_itemDesc", lookupNa_itemUDF(func.col("na_item")))
# Use the column's actual lower-case name so the lookup does not depend on
# Spark's case-insensitive column resolution.
geoDesc = geoText.withColumn("CountryName", lookupGeoUDF(func.col("country")))

print("----------------------------------------------")
print("Codes in Employment A*10 industry breakdowns")
print("----------------------------------------------")

UnitDesc.show(truncate=False)
nace_r2Desc.show(truncate=False)
s_adjDesc.show(truncate=False)
na_itemDesc.show(truncate=False)
geoDesc.show(27, truncate=False)

----------------------------------------------
Codes in Employment A*10 industry breakdowns
----------------------------------------------
+-----------+-----+----------------------------------------------------------------------------------+
|unit       |count|UnitDesc                                                                          |
+-----------+-----+----------------------------------------------------------------------------------+
|PCH_SM_PER |2050 |Percentage change compared to same period in previous year (based on persons)     |
|PC_TOT_PER |2052 |Percentage of total (based on persons)                                            |
|I15_HW     |2066 |Index, 2015=100 (based on hours worked)                                           |
|I15_PER    |2042 |Index, 2015=100 (based on persons)                                                |
|PCH_SM_HW  |2098 |Percentage change compared to same period in previous year (based on hours worked)|
|THS_HW     |2100 |Thousand hours wor

In [17]:
# Keep a single concept per categorical variable, then drop those three columns
# because they become constant after the filter:
#   unit    -> thousands
#   s_adj   -> not adjusted data only
#   na_item -> total employment, domestic concept

unitconcept = ['THS_PER']  # Thousand persons
s_adjconcept = ['NSA']  # not adjusted data
na_itemconcept = ['EMP_DC']  # Total employment domestic concept

keep_rows = (
    func.col('unit').isin(unitconcept)
    & func.col('s_adj').isin(s_adjconcept)
    & func.col('na_item').isin(na_itemconcept)
)
empByInd = empByInd.where(keep_rows).drop('unit', 's_adj', 'na_item')

In [18]:
# Preview the filtered table (still one column per quarter)
empByInd.show()

+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|nace_r2|country|2021Q2|2021Q1|2020Q4|2020Q3|2020Q2|2020Q1|2019Q4|2019Q3|2019Q2|2019Q1|2018Q4|2018Q3|2018Q2|2018Q1|
+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|      A|     AT|157.02|145.45|151.85|161.05|155.49|142.05|147.19|155.76|150.76|145.17|153.71|161.37|162.23|151.61|
|      A|     BE|  59.0|  54.5|  59.9|  65.8|  59.8|  53.9|  59.1|  64.7|  59.5|  54.1|  58.4|  64.3|  59.2|  53.5|
|      A|     BG|581.22|476.49|531.15|680.64|645.14| 525.9|536.31|664.33|640.91|545.45|518.81| 724.4|680.29|574.14|
|      A|     CY| 15.28| 14.75| 14.89| 15.09| 15.16| 14.82| 14.98| 15.15| 15.22| 14.69| 14.77|  14.9| 14.88| 14.89|
|      A|     CZ|157.82|145.66|156.05|176.81|160.92| 144.9|154.35|173.85|163.15|150.01|161.65|181.49|167.95|150.99|
|      A|     DE| 601.0| 533.0| 549.0| 601.0| 616.0| 555.0| 572.0| 619.0

In [19]:
# Number of distinct countries left after filtering
n_countries = empByInd.select(['Country']).distinct().count()
print(n_countries)

27


In [20]:
# Unpivot the quarter columns into long format: one row per (nace_r2, country, Date)
# with the value in `cant`.
# The stack() expression is built from the DataFrame's own columns instead of a
# hand-typed list, so the column count and names cannot drift out of sync, and each
# column name is back-quoted because the names start with a digit.
id_cols = ['nace_r2', 'country']
quarter_cols = [c for c in empByInd.columns if c not in id_cols]
stack_pairs = ', '.join(f"'{quarter}', `{quarter}`" for quarter in quarter_cols)
empByInd = empByInd.selectExpr(
    *id_cols,
    f"stack({len(quarter_cols)}, {stack_pairs}) as (Date, cant)"
)

In [21]:
# Check the schema of the long-format table
empByInd.printSchema()

root
 |-- nace_r2: string (nullable = true)
 |-- country: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- cant: double (nullable = true)



In [22]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for every column
empByInd.summary().show()

+-------+-------+-------+------+------------------+
|summary|nace_r2|country|  Date|              cant|
+-------+-------+-------+------+------------------+
|  count|   4536|   4536|  4536|              4536|
|   mean|   null|   null|  null|1372.0202491181642|
| stddev|   null|   null|  null|3843.6437858169106|
|    min|      A|     AT|2018Q1|              2.09|
|    25%|   null|   null|  null|              83.2|
|    50%|   null|   null|  null|            286.23|
|    75%|   null|   null|  null|             998.3|
|    max|  TOTAL|     SK|2021Q2|           45559.0|
+-------+-------+-------+------+------------------+



In [23]:
# Per-column count of values that are NaN or null
missing_counts = [
    func.count(func.when(func.isnan(col_name) | func.col(col_name).isNull(), col_name)).alias(col_name)
    for col_name in empByInd.columns
]
empByInd.select(missing_counts).show()

+-------+-------+----+----+
|nace_r2|country|Date|cant|
+-------+-------+----+----+
|      0|      0|   0|   0|
+-------+-------+----+----+



In [24]:
# Attach the human-readable activity and country labels, then sort by country and date.
empByInd_F = (
    empByInd
    .withColumn("EconomicAct", lookupNace_r2UDF(func.col("nace_r2")))
    .withColumn("CountryName", lookupGeoUDF(func.col("Country")))
    .orderBy("country", "date")
)


In [25]:
# Preview the labelled, sorted table
empByInd_F.show()

+-------+-------+------+-------+--------------------+-----------+
|nace_r2|country|  Date|   cant|         EconomicAct|CountryName|
+-------+-------+------+-------+--------------------+-----------+
|      A|     AT|2018Q1| 151.61|Agriculture, fore...|    Austria|
|    B-E|     AT|2018Q1| 699.27|Industry (except ...|    Austria|
|    R-U|     AT|2018Q1| 196.16|Arts, entertainme...|    Austria|
|  TOTAL|     AT|2018Q1|4404.95|Total - all NACE ...|    Austria|
|    G-I|     AT|2018Q1|1204.07|Wholesale and ret...|    Austria|
|      J|     AT|2018Q1| 121.38|Information and c...|    Austria|
|      L|     AT|2018Q1|  62.24|Real estate activ...|    Austria|
|      K|     AT|2018Q1| 122.97|Financial and ins...|    Austria|
|      C|     AT|2018Q1| 643.33|       Manufacturing|    Austria|
|    M_N|     AT|2018Q1| 527.51|Professional, sci...|    Austria|
|    O-Q|     AT|2018Q1|1056.68|Public administra...|    Austria|
|      F|     AT|2018Q1| 263.07|        Construction|    Austria|
|      L| 

In [26]:
# Shorten the long Eurostat label for Germany to plain 'Germany'.
# The Spark DataFrame is collected to pandas, patched there and converted back.
Germanie_Issue = empByInd_F.toPandas()
# Assign the result back to the column: calling replace(..., inplace=True) on a
# column selection is a chained in-place operation that may act on a temporary
# copy and leave the DataFrame unchanged.
Germanie_Issue['CountryName'] = Germanie_Issue['CountryName'].replace(
    'Germany (until 1990 former territory of the FRG)',
    'Germany',
)
empByInd_F = spark.createDataFrame(Germanie_Issue)

In [27]:
# Save the result as a Parquet file in HDFS, overwriting any previous output.
# Contents: unit = thousand persons (THS_PER), not seasonally adjusted data (NSA),
# total employment domestic concept (EMP_DC) — as selected by the filter cell above.
# NOTE(review): the path segment 'row' may be a typo for 'raw' — confirm before changing.


empByInd_F.write.mode('overwrite').parquet("hdfs://localhost:9000/TFM_CEE/row/EmpByIndust.parquet")