## 1. Import Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Import data
harmonized15_24 = pd.read_csv('../data/other_tsv/teilm021.tsv', sep='\t')
harmonized15_24.head()

Unnamed: 0,"s_adj,age,sex,unit,geo\time",2019M09,2019M10,2019M11,2019M12,2020M01,2020M02,2020M03,2020M04,2020M05,2020M06,2020M07,2020M08
0,"SA,Y_LT25,F,PC_ACT,AT",7.8 u,7.6 u,7.7 u,8.0 u,8.9 u,9.5 u,9.6 u,10.3 u,10.2 u,9.6 u,9.2 u,:
1,"SA,Y_LT25,F,PC_ACT,BE",11.8,13.2,13.2,13.2,13.0,13.0,13.0,13.8,13.8,13.8,:,:
2,"SA,Y_LT25,F,PC_ACT,BG",6.9,7.1,7.2,7.3,10.1,11.2,11.6,14.8,13.1,10.7,9.6,:
3,"SA,Y_LT25,F,PC_ACT,CH",8.4,8.2,8.1,7.9,7.2,6.8,7.3,8.2,8.7,8.4,:,:
4,"SA,Y_LT25,F,PC_ACT,CY",13.8,14.2,14.2,14.2,11.1,11.1,11.1,18.4,18.4,18.4,:,:


## 2. Data Wrangling

For better accessibility of columns, **column names** are harmonized. Furthermore multiple information in categorical columns is assigned to **individual columns** for deeper analysis.

In [3]:
# Strip column names of white spaces
harmonized15_24.columns = (harmonized15_24.columns.str.strip()
                                                  .str.replace('/', '_'))
harmonized15_24.columns 

Index(['s_adj,age,sex,unit,geo\time', '2019M09', '2019M10', '2019M11',
       '2019M12', '2020M01', '2020M02', '2020M03', '2020M04', '2020M05',
       '2020M06', '2020M07', '2020M08'],
      dtype='object')

In [4]:
# Split first column
# 's_adj' means 'seasonally adjusted form'
harmonized15_24[['s_adj', 'age','sex','unit', 'geo_time']] = harmonized15_24.iloc[:,0].str.split(',', expand=True)
harmonized15_24.head()

Unnamed: 0,"s_adj,age,sex,unit,geo\time",2019M09,2019M10,2019M11,2019M12,2020M01,2020M02,2020M03,2020M04,2020M05,2020M06,2020M07,2020M08,s_adj,age,sex,unit,geo_time
0,"SA,Y_LT25,F,PC_ACT,AT",7.8 u,7.6 u,7.7 u,8.0 u,8.9 u,9.5 u,9.6 u,10.3 u,10.2 u,9.6 u,9.2 u,:,SA,Y_LT25,F,PC_ACT,AT
1,"SA,Y_LT25,F,PC_ACT,BE",11.8,13.2,13.2,13.2,13.0,13.0,13.0,13.8,13.8,13.8,:,:,SA,Y_LT25,F,PC_ACT,BE
2,"SA,Y_LT25,F,PC_ACT,BG",6.9,7.1,7.2,7.3,10.1,11.2,11.6,14.8,13.1,10.7,9.6,:,SA,Y_LT25,F,PC_ACT,BG
3,"SA,Y_LT25,F,PC_ACT,CH",8.4,8.2,8.1,7.9,7.2,6.8,7.3,8.2,8.7,8.4,:,:,SA,Y_LT25,F,PC_ACT,CH
4,"SA,Y_LT25,F,PC_ACT,CY",13.8,14.2,14.2,14.2,11.1,11.1,11.1,18.4,18.4,18.4,:,:,SA,Y_LT25,F,PC_ACT,CY


In [5]:
# Reorder columns
harmonized15_24_copy = harmonized15_24[['s_adj', 'age', 'sex', 'unit', 'geo_time', '2019M09', '2019M10', '2019M11', '2019M12', '2020M01', '2020M02', '2020M03', '2020M04', '2020M05','2020M06', '2020M07', '2020M08']]
harmonized15_24_copy.head()

Unnamed: 0,s_adj,age,sex,unit,geo_time,2019M09,2019M10,2019M11,2019M12,2020M01,2020M02,2020M03,2020M04,2020M05,2020M06,2020M07,2020M08
0,SA,Y_LT25,F,PC_ACT,AT,7.8 u,7.6 u,7.7 u,8.0 u,8.9 u,9.5 u,9.6 u,10.3 u,10.2 u,9.6 u,9.2 u,:
1,SA,Y_LT25,F,PC_ACT,BE,11.8,13.2,13.2,13.2,13.0,13.0,13.0,13.8,13.8,13.8,:,:
2,SA,Y_LT25,F,PC_ACT,BG,6.9,7.1,7.2,7.3,10.1,11.2,11.6,14.8,13.1,10.7,9.6,:
3,SA,Y_LT25,F,PC_ACT,CH,8.4,8.2,8.1,7.9,7.2,6.8,7.3,8.2,8.7,8.4,:,:
4,SA,Y_LT25,F,PC_ACT,CY,13.8,14.2,14.2,14.2,11.1,11.1,11.1,18.4,18.4,18.4,:,:


The dataset does not contain any **null values**.

In [6]:
# Check data types
harmonized15_24_copy.dtypes

s_adj       object
age         object
sex         object
unit        object
geo_time    object
2019M09     object
2019M10     object
2019M11     object
2019M12     object
2020M01     object
2020M02     object
2020M03     object
2020M04     object
2020M05     object
2020M06     object
2020M07     object
2020M08     object
dtype: object

In [7]:
# Investigate object columns that should be numeric
harmonized15_24_copy['2020M01'].value_counts()

15.5     6
:        4
11.9     3
19.4     3
15.6     3
        ..
22.8     1
7.2      1
24.9     1
15.4     1
10.1     1
Name: 2020M01, Length: 84, dtype: int64

Since only few values have a letter, the whole dataframe is set to **numeric** in the month columns.

In [8]:
cols = ['2019M09','2019M10', '2019M11', '2019M12', '2020M01', '2020M02', '2020M03', '2020M04', '2020M05', '2020M06', '2020M07', '2020M08']
harmonized15_24_copy[cols] = harmonized15_24_copy[cols].apply(pd.to_numeric, errors='coerce')
harmonized15_24_copy.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,s_adj,age,sex,unit,geo_time,2019M09,2019M10,2019M11,2019M12,2020M01,2020M02,2020M03,2020M04,2020M05,2020M06,2020M07,2020M08
0,SA,Y_LT25,F,PC_ACT,AT,,,,,,,,,,,,
1,SA,Y_LT25,F,PC_ACT,BE,11.8,13.2,13.2,13.2,13.0,13.0,13.0,13.8,13.8,13.8,,
2,SA,Y_LT25,F,PC_ACT,BG,6.9,7.1,7.2,7.3,10.1,11.2,11.6,14.8,13.1,10.7,9.6,
3,SA,Y_LT25,F,PC_ACT,CH,8.4,8.2,8.1,7.9,7.2,6.8,7.3,8.2,8.7,8.4,,
4,SA,Y_LT25,F,PC_ACT,CY,13.8,14.2,14.2,14.2,11.1,11.1,11.1,18.4,18.4,18.4,,


## 3. Export Cleaned Data File

In [9]:
harmonized15_24_copy.to_csv('teilm021_cleaned.csv', index = False)