# Population by sex, age, citizenship and labour status

## 5. Datos relativos a población por sexo, edad, ciudadanía y empleo

In [1]:
import pandas as pd
import eurostat
import plotly.express as px # Graphics
from MyFunctions import EUcountries
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType 
from pyspark.sql import functions as func

In [2]:
spark = SparkSession.builder.appName("PopulationByAge").master("local") .getOrCreate() 

### Exploración DF

In [3]:
# Load data on Population by sex, age, citizenship and labour status
PopByAge = eurostat.get_data_df('lfsq_pganws')
PopByAge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188206 entries, 0 to 188205
Data columns (total 100 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   unit      188206 non-null  object 
 1   sex       188206 non-null  object 
 2   citizen   188206 non-null  object 
 3   age       188206 non-null  object 
 4   wstatus   188206 non-null  object 
 5   geo\time  188206 non-null  object 
 6   2021Q2    58429 non-null   float64
 7   2021Q1    58592 non-null   float64
 8   2020Q4    59905 non-null   float64
 9   2020Q3    62079 non-null   float64
 10  2020Q2    62058 non-null   float64
 11  2020Q1    61965 non-null   float64
 12  2019Q4    108409 non-null  float64
 13  2019Q3    108877 non-null  float64
 14  2019Q2    108823 non-null  float64
 15  2019Q1    108885 non-null  float64
 16  2018Q4    109044 non-null  float64
 17  2018Q3    108785 non-null  float64
 18  2018Q2    108711 non-null  float64
 19  2018Q1    109373 non-null  float64
 20  201

In [4]:
print("Dataset actualizado a: 29 de Octubre 2021")
print("Nº de filas / entradas: ", PopByAge.shape[0])
print("Nº de columnas / variables: ", PopByAge.shape[1])
print("Tipo de datos para cada variable: ")
display(PopByAge.dtypes)
print("Total valores nulos: ", PopByAge.isnull().sum().sum())
print("Variables con valores nulos: ")
print("  2021Q2: ", PopByAge['2021Q2'].isnull().sum().sum(), "valores nulos")
print("  2018Q3: ", PopByAge['2018Q3'].isnull().sum().sum(),  "valores nulos")

Dataset actualizado a: 29 de Octubre 2021
Nº de filas / entradas:  188206
Nº de columnas / variables:  100
Tipo de datos para cada variable: 


unit        object
sex         object
citizen     object
age         object
wstatus     object
            ...   
1999Q1     float64
1998Q4     float64
1998Q3     float64
1998Q2     float64
1998Q1     float64
Length: 100, dtype: object

Total valores nulos:  10289711
Variables con valores nulos: 
  2021Q2:  129777 valores nulos
  2018Q3:  79421 valores nulos


In [5]:
PopByAge.sample(3)

Unnamed: 0,unit,sex,citizen,age,wstatus,geo\time,2021Q2,2021Q1,2020Q4,2020Q3,...,2000Q2,2000Q1,1999Q4,1999Q3,1999Q2,1999Q1,1998Q4,1998Q3,1998Q2,1998Q1
156894,THS,T,NEU15_FOR,Y25-64,POP,CY,,,,,...,17.9,,,,15.2,,,,,
64924,THS,M,EU15_FOR,Y35-39,UNE,EA19,,,,,...,,,,,,,,,,
68920,THS,M,EU27_2020_FOR,Y15-74,INAC,EU28,,,,,...,,,,,,,,,,


### Limpieza y preparación

**FILTROS**:

Primer filtro: trimestres en este caso dejaré datos del año 2018 en adelante  

Segundo filtro: filtrar por los paises miembros de la UE  



In [6]:
#Primer filtro: trimestres en este caso dejaré datos del año 2007 en adelante, por ser el ultimo año en el que algún país ingresó formalment a la UE
PopByAge.drop(PopByAge.iloc[:,20:], inplace = True, axis = 1) 

In [7]:
#Rename columns

PopByAge.columns = ['unit','sex', 'citizen', 'age', 'wstatus', 'country', '2021Q2', '2021Q1', '2020Q4', '2020Q3', '2020Q2', '2020Q1', '2019Q4',
                '2019Q3', '2019Q2', '2019Q1', '2018Q4',  '2018Q3', '2018Q2', '2018Q1']

In [8]:
# Segundo filtro: filtrar por los paises miembros de la UE
# from MyFunctions import DF EU countries.

PopByAge = PopByAge[PopByAge.country.isin(EUcountries().Code)]

#Verico que estén los 27
PopByAge['country'].unique()

array(['AT', 'BE', 'CY', 'CZ', 'DE', 'DK', 'EL', 'ES', 'FI', 'FR', 'IE',
       'IT', 'LU', 'MT', 'NL', 'PT', 'SE', 'SK', 'BG', 'EE', 'HU', 'LT',
       'LV', 'PL', 'RO', 'SI', 'HR'], dtype=object)

In [9]:
#Busco null values
PopByAge.isnull().sum().sum()

1001857

In [10]:
#Muestra de algunos valores nulos
PopByAge[PopByAge.isnull().any(1)].sample(5)

#Un montón pffff

Unnamed: 0,unit,sex,citizen,age,wstatus,country,2021Q2,2021Q1,2020Q4,2020Q3,2020Q2,2020Q1,2019Q4,2019Q3,2019Q2,2019Q1,2018Q4,2018Q3,2018Q2,2018Q1
133475,THS,T,EU27_2020_FOR,Y40-44,EMP,LV,,,,,,,,,,,,,,
128317,THS,T,EU15_FOR,Y45-49,UNE,EE,,,,,,,,,,,,,,
32228,THS,F,NEU15_FOR,Y40-44,POP,LT,,,,,,,,,,,,,,
181699,THS,T,STLS,Y_GE50,EMP,EE,,,,,,,,,,,,,,
39917,THS,F,NEU27_2020_FOR,Y65-69,ACT,CY,,,,,,,,,,,,,,


In [11]:
# Cargo los diccionarios para PopByAge desde el bulkdown de Eurostat
unit = eurostat.get_dic('unit')
sex = eurostat.get_dic('sex')
citizen =eurostat.get_dic('citizen')
age = eurostat.get_dic('age')
wstatus = eurostat.get_dic('wstatus')
geo =eurostat.get_dic('geo')

In [12]:
#Uso la variable broadcast porque solo me interesan los datos presentes en el dataframe

broadcast_unit = spark.sparkContext.broadcast(unit)
broadcast_sex = spark.sparkContext.broadcast(sex)
broadcast_citizen = spark.sparkContext.broadcast(citizen)
broadcast_age = spark.sparkContext.broadcast(age)
broadcast_wstatus = spark.sparkContext.broadcast(wstatus)
broadcast_geo = spark.sparkContext.broadcast(geo)

In [13]:
#From pandas to spark
Pop_Age = spark.createDataFrame(PopByAge)
Pop_Age.printSchema()

root
 |-- unit: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- citizen: string (nullable = true)
 |-- age: string (nullable = true)
 |-- wstatus: string (nullable = true)
 |-- country: string (nullable = true)
 |-- 2021Q2: double (nullable = true)
 |-- 2021Q1: double (nullable = true)
 |-- 2020Q4: double (nullable = true)
 |-- 2020Q3: double (nullable = true)
 |-- 2020Q2: double (nullable = true)
 |-- 2020Q1: double (nullable = true)
 |-- 2019Q4: double (nullable = true)
 |-- 2019Q3: double (nullable = true)
 |-- 2019Q2: double (nullable = true)
 |-- 2019Q1: double (nullable = true)
 |-- 2018Q4: double (nullable = true)
 |-- 2018Q3: double (nullable = true)
 |-- 2018Q2: double (nullable = true)
 |-- 2018Q1: double (nullable = true)



In [14]:
#Codes in Pop_Age DF: unit, sex, citizen, age, Wstatus   


# Look for the codes for each variable, the meaning of the codes and count how many times apears in the DF
unitText = Pop_Age.groupBy("unit").count()
sexText = Pop_Age.groupBy("sex").count()
citizenText = Pop_Age.groupBy("citizen").count()
ageText = Pop_Age.groupBy("age").count()
wstatusText = Pop_Age.groupBy("wstatus").count()
geoText = Pop_Age.groupBy("country").count()

# Function to look up code names from broadcasted variables
###...Problem: I've not been able to use just one Function and replace the variables. def lookup(x)
def lookupUnit(unit):
    return broadcast_unit.value[unit]

def lookupSex(sex):
    return broadcast_sex.value[sex]

def lookupCitizen(citizen):
    return broadcast_citizen.value[citizen]

def lookupAge(age):
    return broadcast_age.value[age]

def lookupWstatus(wstatus):
    return broadcast_wstatus.value[wstatus]

def lookupGeo(geo):
    return broadcast_geo.value[geo]

lookupUnitUDF = func.udf(lookupUnit)
lookupSexUDF = func.udf(lookupSex)
lookupCitizenUDF = func.udf(lookupCitizen)
lookupAgeUDF = func.udf(lookupAge)
lookupWstatusUDF = func.udf(lookupWstatus)
lookupGeoUDF = func.udf(lookupGeo)

# Add a new column

UnitDesc = unitText.withColumn("UnitDesc", lookupUnitUDF(func.col("unit")))
SexDesc = sexText.withColumn("SexDesc", lookupSexUDF(func.col("sex")))
CitizenDesc = citizenText.withColumn("CitizenDesc", lookupCitizenUDF(func.col("citizen")))
AgeDesc = ageText.withColumn("AgeDesc", lookupAgeUDF(func.col("age")))
WstatusDesc = wstatusText.withColumn("WstatusDesc", lookupWstatusUDF(func.col("wstatus")))
geoDesc = geoText.withColumn("CountryName", lookupGeoUDF(func.col("Country")))

print("----------------------------------------------")
print("Codes in Population by sex, age, citizenship and labour status DataFrame")
print("----------------------------------------------")

UnitDesc.show(truncate=False)
SexDesc.show(truncate=False)
CitizenDesc.show(truncate=False)
AgeDesc.show(80, truncate=False)
WstatusDesc.show(truncate=False)
geoDesc.show(27, truncate=False)

----------------------------------------------
Codes in Population by sex, age, citizenship and labour status DataFrame
----------------------------------------------
+----+------+--------+
|unit|count |UnitDesc|
+----+------+--------+
|THS |133612|Thousand|
+----+------+--------+

+---+-----+-------+
|sex|count|SexDesc|
+---+-----+-------+
|F  |44085|Females|
|T  |45045|Total  |
|M  |44482|Males  |
+---+-----+-------+

+--------------+-----+----------------------------------------------------+
|citizen       |count|CitizenDesc                                         |
+--------------+-----+----------------------------------------------------+
|NEU28_FOR     |12578|Non-EU28 countries (2013-2020) nor reporting country|
|NRP           |10781|No response                                         |
|EU28_FOR      |12395|EU28 countries (2013-2020) except reporting country |
|STLS          |9060 |Stateless                                           |
|EU27_2020_FOR |12446|EU27 countries (from 2

In [15]:
#De este DataFrame elimino la columna unit porque solo tiene un valor: thousands 
#La variable age la dejo tal cual
#De la variables citizen solo me interesan TOTAL, filtro por esta y elimino la columna
#De la variable age me interesa la població en edad de trabajar, entre 15 y 64, y por rangos de edades 
#De la variable wstatus todos menos unknown 

citizenconcept= ['TOTAL']
ageconcept = ['Y15-64', 'Y20-64', 'Y20-24', 'Y25-29', 'Y30-34', 'Y35-39', 'Y40-59']
wstatutsconcept = ['POP', 'INAC', 'ACT', 'EMP', 'UNE']

Pop_Age = Pop_Age.filter((Pop_Age.citizen.isin(citizenconcept)) & (Pop_Age.age.isin(ageconcept)) & (Pop_Age.wstatus.isin(wstatutsconcept)))
Pop_Age = Pop_Age.select([c for c in Pop_Age.columns if c not in {'unit', 'citizen'}])

In [16]:
Pop_Age.select([func.count(func.when(func.isnan(c) | func.col(c).isNull(), c)).alias(c) for c in Pop_Age.columns]).show()

+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|sex|age|wstatus|country|2021Q2|2021Q1|2020Q4|2020Q3|2020Q2|2020Q1|2019Q4|2019Q3|2019Q2|2019Q1|2018Q4|2018Q3|2018Q2|2018Q1|
+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|  0|  0|      0|      0|    49|    39|   144|   143|   147|   155|    55|    53|    58|    48|    48|    48|    52|    40|
+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+



In [17]:
# Problema, Alemania no ha reportado datos para el año 2020

Pop_Age.filter((Pop_Age.country == 'DE') & (Pop_Age.sex == 'T')).show()

+---+------+-------+-------+-------+-------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+
|sex|   age|wstatus|country| 2021Q2| 2021Q1|2020Q4|2020Q3|2020Q2|2020Q1| 2019Q4| 2019Q3| 2019Q2| 2019Q1| 2018Q4| 2018Q3| 2018Q2| 2018Q1|
+---+------+-------+-------+-------+-------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+
|  T|Y15-64|    ACT|     DE|41581.0|41789.7|   NaN|   NaN|   NaN|   NaN|42938.5|42498.6|42002.6|42269.5|42267.1|42329.2|41760.4|42019.5|
|  T|Y15-64|    EMP|     DE|40059.4|40036.8|   NaN|   NaN|   NaN|   NaN|41599.4|41153.3|40681.2|40826.4|40901.9|40893.8|40280.9|40466.3|
|  T|Y15-64|   INAC|     DE|11429.3|11680.0|   NaN|   NaN|   NaN|   NaN|11053.3|10952.0|11200.8|11264.7|11180.6|11279.4|11630.6|11627.7|
|  T|Y15-64|    POP|     DE|53010.4|53469.6|   NaN|   NaN|   NaN|   NaN|53991.9|53450.6|53203.4|53534.3|53447.6|53608.6|53391.0|53647.2|
|  T|Y15-64|    UNE|     DE| 1521.6| 1752

In [18]:
countryconcept= ['DE']

Pop_Age_withoutDE = Pop_Age.filter(~Pop_Age.country.isin(countryconcept)) 


In [19]:
Pop_Age_withoutDE.select([func.count(func.when(func.isnan(c) | func.col(c).isNull(), c)).alias(c) for c in Pop_Age_withoutDE.columns]).show()

+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|sex|age|wstatus|country|2021Q2|2021Q1|2020Q4|2020Q3|2020Q2|2020Q1|2019Q4|2019Q3|2019Q2|2019Q1|2018Q4|2018Q3|2018Q2|2018Q1|
+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|  0|  0|      0|      0|    49|    39|    39|    38|    42|    50|    55|    53|    58|    48|    48|    48|    52|    40|
+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+



In [20]:
Pop_Age_withoutDE.filter(Pop_Age_withoutDE['2021Q2'].isNull()).show()

+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|sex|age|wstatus|country|2021Q2|2021Q1|2020Q4|2020Q3|2020Q2|2020Q1|2019Q4|2019Q3|2019Q2|2019Q1|2018Q4|2018Q3|2018Q2|2018Q1|
+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
+---+---+-------+-------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+



In [21]:
Pop_Age = Pop_Age.selectExpr('sex', 'age', 'wstatus', 'country', """stack(14,'2021Q2', 2021Q2, '2021Q1', 2021Q1, '2020Q4', 2020Q4, '2020Q3', 2020Q3, '2020Q2', 2020Q2, '2020Q1', 2020Q1, '2019Q4', 2019Q4,
                '2019Q3', 2019Q3, '2019Q2', 2019Q2, '2019Q1', 2019Q1, '2018Q4', 2018Q4, '2018Q3', 2018Q3, '2018Q2', 2018Q2, '2018Q1', 2018Q1) as (Date, cant)""")

In [22]:
Pop_Age.printSchema()

root
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- wstatus: string (nullable = true)
 |-- country: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- cant: double (nullable = true)



In [23]:
Pop_Age.summary().show()

+-------+-----+------+-------+-------+------+------+
|summary|  sex|   age|wstatus|country|  Date|  cant|
+-------+-----+------+-------+-------+------+------+
|  count|39690| 39690|  39690|  39690| 39690| 39690|
|   mean| null|  null|   null|   null|  null|   NaN|
| stddev| null|  null|   null|   null|  null|   NaN|
|    min|    F|Y15-64|    ACT|     AT|2018Q1|   0.9|
|    25%| null|  null|   null|   null|  null|  63.4|
|    50%| null|  null|   null|   null|  null| 276.4|
|    75%| null|  null|   null|   null|  null|1287.6|
|    max|    T|Y40-59|    UNE|     SK|2021Q2|   NaN|
+-------+-----+------+-------+-------+------+------+



In [24]:
Pop_Age_F = Pop_Age.withColumn("Estado_empleo", lookupWstatusUDF(func.col("wstatus"))).withColumn("CountryName", lookupGeoUDF(func.col("Country"))).orderBy('Country', "date")

In [25]:
Pop_Age_F.show()

+---+------+-------+-------+------+------+--------------------+-----------+
|sex|   age|wstatus|country|  Date|  cant|       Estado_empleo|CountryName|
+---+------+-------+-------+------+------+--------------------+-----------+
|  F|Y25-29|    POP|     AT|2018Q1| 290.9|          Population|    Austria|
|  M|Y15-64|    POP|     AT|2018Q1|2897.8|          Population|    Austria|
|  F|Y25-29|    UNE|     AT|2018Q1|  10.6|  Unemployed persons|    Austria|
|  F|Y15-64|    EMP|     AT|2018Q1|1980.0|    Employed persons|    Austria|
|  F|Y30-34|    ACT|     AT|2018Q1| 248.0|Persons in the la...|    Austria|
|  F|Y15-64|   INAC|     AT|2018Q1| 824.3|Persons outside t...|    Austria|
|  F|Y30-34|    EMP|     AT|2018Q1| 235.0|    Employed persons|    Austria|
|  F|Y15-64|    UNE|     AT|2018Q1| 100.6|  Unemployed persons|    Austria|
|  F|Y30-34|   INAC|     AT|2018Q1|  44.6|Persons outside t...|    Austria|
|  F|Y20-24|    EMP|     AT|2018Q1| 163.3|    Employed persons|    Austria|
|  F|Y30-34|

In [26]:
Germanie_Issue = Pop_Age_F.toPandas()
Germanie_Issue['CountryName'].replace(
    to_replace=['Germany (until 1990 former territory of the FRG)'],
    value='Germany',
    inplace=True
)
Pop_Age_F = spark.createDataFrame(Germanie_Issue)

In [27]:
#Guardar en HDFS row data

Pop_Age_F.write.mode('overwrite').parquet("hdfs://localhost:9000//TFM_CEE/row/PopWStatusByAge.parquet")