In [1]:
import pandas as pd 
employment = pd.read_csv('eurostat_employment.csv')
# keep only rows where 'wstatus' column is 'EMP' and 'nace_r2' column is 'TOTAL'
employment = employment[(employment['wstatus'] == 'EMP') & (employment['nace_r2'] == 'TOTAL')]
employment.head()

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,unit,wstatus,nace_r2,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG
424842,ESTAT:NAMA_10R_3EMPERS(1.0),13/03/24 23:00:00,A,THS,EMP,TOTAL,AT,2000,3755.0,
424843,ESTAT:NAMA_10R_3EMPERS(1.0),13/03/24 23:00:00,A,THS,EMP,TOTAL,AT,2001,3782.0,
424844,ESTAT:NAMA_10R_3EMPERS(1.0),13/03/24 23:00:00,A,THS,EMP,TOTAL,AT,2002,3778.4,
424845,ESTAT:NAMA_10R_3EMPERS(1.0),13/03/24 23:00:00,A,THS,EMP,TOTAL,AT,2003,3803.0,
424846,ESTAT:NAMA_10R_3EMPERS(1.0),13/03/24 23:00:00,A,THS,EMP,TOTAL,AT,2004,3826.8,


In [2]:
employment = employment[['geo', 'TIME_PERIOD', 'OBS_VALUE']]

# keep rows where 'geo' is comprised of 5 characters
employment = employment[employment['geo'].str.len() == 4]

# change column names 
employment.columns = ['eu_nuts_id', 'year', 'eurostat_employment']

# drop duplicates 
employment = employment.drop_duplicates()
employment

Unnamed: 0,eu_nuts_id,year,eurostat_employment
424888,AT11,2000,101.10
424889,AT11,2001,101.40
424890,AT11,2002,102.50
424891,AT11,2003,103.80
424892,AT11,2004,103.90
...,...,...,...
461216,SK04,2018,573.53
461217,SK04,2019,583.97
461218,SK04,2020,573.73
461219,SK04,2021,567.78


In [3]:
gdp = pd.read_csv('eurostat_gdp.csv')

# keep only rows where 'unit' column is 'EUR_HAB' 
gdp = gdp[gdp['unit'] == 'EUR_HAB']
gdp

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,unit,geo,TIME_PERIOD,OBS_VALUE,OBS_FLAG
0,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2008,3000.0,
1,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2009,3000.0,
2,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2010,3100.0,
3,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2011,3200.0,
4,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,AL,2012,3300.0,
...,...,...,...,...,...,...,...,...
37753,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,TRC34,2018,3500.0,
37754,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,TRC34,2019,4200.0,
37755,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,TRC34,2020,4100.0,
37756,ESTAT:NAMA_10R_3GDP(1.0),19/02/24 23:00:00,A,EUR_HAB,TRC34,2021,3800.0,


In [4]:
gdp = gdp[['geo', 'TIME_PERIOD', 'OBS_VALUE']]
gdp = gdp[gdp['geo'].str.len() == 4]

# change column names
gdp.columns = ['eu_nuts_id', 'year', 'eurostat_GDP']

# drop duplicates
gdp = gdp.drop_duplicates()
gdp

Unnamed: 0,eu_nuts_id,year,eurostat_GDP
28,AL01,2008,2300.0
29,AL01,2009,2400.0
30,AL01,2010,2600.0
31,AL01,2011,2600.0
32,AL01,2012,2700.0
...,...,...,...
37677,TRC3,2018,4000.0
37678,TRC3,2019,4200.0
37679,TRC3,2020,4000.0
37680,TRC3,2021,4100.0


In [5]:
check = gdp[gdp['eu_nuts_id'] == 'AT11']
check.head(50)

Unnamed: 0,eu_nuts_id,year,eurostat_GDP
284,AT11,2000,17400.0
285,AT11,2001,17900.0
286,AT11,2002,18800.0
287,AT11,2003,19300.0
288,AT11,2004,20300.0
289,AT11,2005,20500.0
290,AT11,2006,21100.0
291,AT11,2007,22400.0
292,AT11,2008,22700.0
293,AT11,2009,22700.0


In [6]:
# merge the two dataframes
eurostat = pd.merge(employment, gdp, on=['eu_nuts_id', 'year'], how = 'outer')
eurostat

Unnamed: 0,eu_nuts_id,year,eurostat_employment,eurostat_GDP
0,AL01,2008,,2300.0
1,AL01,2009,,2400.0
2,AL01,2010,,2600.0
3,AL01,2011,,2600.0
4,AL01,2012,,2700.0
...,...,...,...,...
6822,TRC3,2018,,4000.0
6823,TRC3,2019,,4200.0
6824,TRC3,2020,,4000.0
6825,TRC3,2021,,4100.0


In [7]:
# slice the data to only include data from 2010 to 2021
eurostat = eurostat[(eurostat['year'] >= 1980) & (eurostat['year'] <= 2021)]
eurostat['year'].unique()

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020, 2021, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
       1995, 1996, 1997, 1998, 1999], dtype=int64)

In [8]:
# create 'period' column 
def assign_period(pubyear):
    if 1981 <= pubyear <= 1985:
        return 1
    elif 1986 <= pubyear <= 1990:
        return 2 
    elif 1991 <= pubyear <= 1995:
        return 3 
    elif 1996 <= pubyear <= 2000:
        return 4
    elif 2001 <= pubyear <= 2005:
        return 5
    elif 2006 <= pubyear <= 2010:
        return 6
    elif 2011 <= pubyear <= 2015:
        return 7
    elif 2016 <= pubyear <= 2020:
        return 8
    else:
        return 0 
    
eurostat['period'] = eurostat['year'].apply(assign_period)
eurostat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eurostat['period'] = eurostat['year'].apply(assign_period)


Unnamed: 0,eu_nuts_id,year,eurostat_employment,eurostat_GDP,period
0,AL01,2008,,2300.0,6
1,AL01,2009,,2400.0,6
2,AL01,2010,,2600.0,6
3,AL01,2011,,2600.0,7
4,AL01,2012,,2700.0,7


In [9]:
# drop 'year' column
eurostat.drop('year', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eurostat.drop('year', axis=1, inplace=True)


In [10]:
# check NaN values
eurostat.isnull().sum()

eu_nuts_id               0
eurostat_employment    581
eurostat_GDP           489
period                   0
dtype: int64

In [11]:
# group by 'eu_nuts_id' and 'period' columns, calculate the mean 
eurostat = eurostat.groupby(['eu_nuts_id', 'period']).mean().reset_index()
eurostat

Unnamed: 0,eu_nuts_id,period,eurostat_employment,eurostat_GDP
0,AL01,0,,4300.000000
1,AL01,6,,2433.333333
2,AL01,7,,2720.000000
3,AL01,8,,3520.000000
4,AL02,0,,6500.000000
...,...,...,...,...
1769,TRC3,0,,4100.000000
1770,TRC3,5,,2250.000000
1771,TRC3,6,,3120.000000
1772,TRC3,7,,4420.000000


In [12]:
eurostat.isnull().sum()

eu_nuts_id               0
period                   0
eurostat_employment    163
eurostat_GDP           119
dtype: int64

In [13]:
eurostat.to_csv('eurostat_merged_data.csv', index=False)