In [1]:
import pandas as pd
import numpy as np



Create country list for translation of country codes

In [2]:
# Labels in the form "<country name> [<code>]"; the bracketed code is what the
# lookup table built from this list is keyed on.
countries = [
    # EU aggregates
    "European Union [EU27_2020]",
    "European Union [EU28]",
    # EU member states
    "Austria [AT]",
    "Belgium [BE]",
    "Bulgaria [BG]",
    "Czechia [CZ]",
    "Denmark [DK]",
    "Germany [DE]",
    "Estonia [EE]",
    "Ireland [IE]",
    "Greece [EL]",
    "Spain [ES]",
    "France [FR]",
    "Croatia [HR]",
    "Italy [IT]",
    "Cyprus [CY]",
    "Latvia [LV]",
    "Lithuania [LT]",
    "Luxembourg [LU]",
    "Hungary [HU]",
    "Malta [MT]",
    "Netherlands [NL]",
    "Poland [PL]",
    "Portugal [PT]",
    "Romania [RO]",
    "Slovenia [SI]",
    "Slovakia [SK]",
    "Finland [FI]",
    "Sweden [SE]",
    # Other European countries
    "Iceland [IS]",
    "Liechtenstein [LI]",
    "Norway [NO]",
    "Switzerland [CH]",
    "United Kingdom [UK]",
]

Create dataset of countries and their codes

In [3]:
def create_country_df(countries):
    """Build a lookup table of country codes and names.

    Parameters
    ----------
    countries : iterable of str
        Labels in the form ``'<name> [<code>]'``, e.g. ``'Austria [AT]'``.

    Returns
    -------
    pandas.DataFrame
        One row per label, in input order, with the string columns
        ``country_code`` and ``country_name``.

    Raises
    ------
    ValueError
        If a label contains no ``'['`` and therefore carries no code.
    """
    codes, names = [], []
    for country in countries:
        # Split on the LAST '[' so a name that itself contains a bracket
        # still yields the trailing code.
        name, bracket, tail = country.rpartition('[')
        if not bracket:
            raise ValueError(
                f"expected '<name> [<code>]', got {country!r}")
        code = tail.strip()
        if code.endswith(']'):
            code = code[:-1]
        codes.append(code.strip())
        names.append(name.strip())

    # Build the frame in one go instead of creating an empty frame and
    # overwriting its columns afterwards.
    return pd.DataFrame({'country_code': codes, 'country_name': names}, dtype=str)

# Build the code -> name lookup table and preview its first four rows.
df = create_country_df(countries=countries)
df.head(n=4)

Unnamed: 0,country_code,country_name
0,EU27_2020,European Union
1,EU28,European Union
2,AT,Austria
3,BE,Belgium


Add general information for each country

In [10]:
# Load the raw tab-separated tables (paths are relative to the notebook's
# working directory).
area = pd.read_csv('../data/raw/general/area_2022.tsv', sep='\t')
population = pd.read_csv('../data/raw/general/population_2022.tsv', sep='\t')

# Show row count, original column headers and dtypes before renaming.
print('AREA __ ', area.shape[0], ' lines\ncolumns:', *area.columns, '\n        ', *area.dtypes, '\n')
print('POPULATION __ ', population.shape[0], ' lines\ncolumns:', *population.columns, '\n        ', *population.dtypes, '\n')

# Replace the raw headers with short names. `set_axis` returns a new frame;
# the `inplace` keyword is deliberately not passed because it was removed in
# pandas 2.0 and raises TypeError there.
area = area.set_axis(['country_code', 'area_km2'], axis=1)
population = population.set_axis(['country_code', 'population'], axis=1)

area.head(3)

AREA __  28  lines
columns: freq,landuse,unit,geo\TIME_PERIOD 2022  
         object int64 

POPULATION __  28  lines
columns: freq,indic_de,geo\TIME_PERIOD 2022  
         object object 



Unnamed: 0,country_code,area_km2
0,"A,L0008,KM2,AT",82519
1,"A,L0008,KM2,BE",30452
2,"A,L0008,KM2,BG",110001


In [20]:
# Characters deleted from a cell before the second parse attempt
# (presumably Eurostat flag letters plus ':' and spaces -- confirm against the data source).
_FLAG_CHARS = str.maketrans('', '', ' edp:cs')


def convert_to_float(x):
    """Convert a raw table cell to ``float``, returning NaN when it cannot be parsed.

    The value is first passed to ``float`` as-is. If that fails and the value
    is a string, the characters ``' edp:cs'`` are removed and the conversion is
    retried.

    Parameters
    ----------
    x : object
        Raw cell value (number or string).

    Returns
    -------
    float
        The parsed number, or ``numpy.nan`` if the value is not a string or
        still cannot be parsed after the characters are removed.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled here; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        pass

    if not isinstance(x, str):
        # Nothing to strip from a non-string (e.g. None).
        return np.nan

    try:
        return float(x.translate(_FLAG_CHARS))
    except ValueError:
        return np.nan
    
def get_country(x):
    """Extract the geo code from a composite row key.

    The key is a comma-separated string such as ``'A,L0008,KM2,AT'``; the geo
    code is its last field. Taking the whole last field (rather than the last
    two characters) keeps longer codes such as ``'EU27_2020'`` intact.

    Parameters
    ----------
    x : str
        Composite key; surrounding whitespace is ignored.

    Returns
    -------
    str
        The text after the last comma, stripped of whitespace. If the key has
        no comma, the whole stripped key is returned.
    """
    return x.strip().rsplit(',', 1)[-1].strip()

def clean_table(df):
    """Return a cleaned copy of a two-column raw table.

    The first column is passed through ``get_country`` and the second through
    ``convert_to_float``. The input frame is left untouched, so re-running the
    calling cell always starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the composite row key and whose second
        column holds the raw value. Column labels are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels and cleaned values.
    """
    cleaned = df.copy()
    code_col, value_col = cleaned.columns[0], cleaned.columns[1]
    # Replace the columns by label rather than via `iloc[:, i] = ...`:
    # on recent pandas versions, whole-column iloc assignment writes into the
    # existing array, so the column would keep its original dtype.
    cleaned[code_col] = df.iloc[:, 0].apply(get_country)
    cleaned[value_col] = df.iloc[:, 1].apply(convert_to_float)
    return cleaned

# Clean both raw tables (area first, then population).
area_n = clean_table(area)
population_n = clean_table(population)

# Join area with population, then attach the country names.
# Both merges use the default join type, so only codes present on both sides are kept.
both = pd.merge(left=area_n, right=population_n, on='country_code')
full = pd.merge(left=df, right=both, on='country_code')
full.head()

Unnamed: 0,country_code,country_name,area_km2,population
0,AT,Austria,82519.0,8978929.0
1,BE,Belgium,30452.0,11631136.0
2,BG,Bulgaria,110001.0,6838937.0
3,CZ,Czechia,77212.0,10516707.0
4,DK,Denmark,41987.0,5873420.0


Save the country info

In [19]:
# Write the merged table to CSV without the index column.
full.to_csv(path_or_buf='../data/dataframes/country_info.csv', index=False)