## Data preparation
 Original dataset is in TSV format and values are coded in a single string.
 Raw Dataset has "wide" format.
 This notebook parses the "key" string into additional columns and changes table format into "long". Null values, originally coded as ":" will be changed to `null`

In [67]:
!pip -q install pandas

7466.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [68]:
imigration_data_path = "Immigration_raw.csv"
integration_data_path = ""

In [None]:
import pandas as pd

immigration_ds = pd.read_csv(imigration_data_path, sep=",")

columns = immigration_ds.columns

# address empty cells and "p", "pe" ect. tags
immigration_ds = immigration_ds.map(lambda x: x.strip() if isinstance(x, str) else x)


def extract_leading_int(val):
    if isinstance(val, str):
        parts = val.strip().split()
        if parts and parts[0].isdigit():
            return int(parts[0])
    return val


for col in columns[1:]:
    immigration_ds[col] = immigration_ds[col].apply(extract_leading_int)
immigration_ds = immigration_ds.replace(":", pd.NA)

# parse dimention_key
columns_clean = [
    "freq",
    "age",
    "agedef",
    "c_birth",
    "unit",
    "sex",
    "geo",
    "2013",
    "2014",
    "2015",
    "2016",
    "2017",
    "2018",
    "2019",
    "2020",
    "2021",
    "2022",
    "2023",
]

KeyError: "['unit'] not found in axis"

In [None]:
immigration_ds.head(5)

Unnamed: 0,"freq,age,agedef,c_birth,unit,sex,geo\TIME_PERIOD",2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,"A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT",4085,4093,4723,4827,,4130,4286,,,,
1,"A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,BE",1407,953,1017,1004,,1364,1514,,,,
2,"A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,BG",743,577,559,586,,1413,1659,,,,
3,"A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,CH",2758,2766,2876,3109,,3209,3198,,,,
4,"A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,CZ",155,106,120,414,,736,1013,,,,


In [None]:
rows = []
for idx, row in immigration_ds.iterrows():
    complex_key = row.iloc[0]
    parsed = complex_key.split(",")
    rest = row.iloc[1:].tolist()
    all_values = parsed + rest
    rows.append(all_values)

new_ds = pd.DataFrame(rows, columns=columns_clean)

In [None]:
new_ds.head(5)

Unnamed: 0,freq,age,agedef,c_birth,unit,sex,geo,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT,4085,4093,4723,4827,,4130,4286,,,,
1,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,BE,1407,953,1017,1004,,1364,1514,,,,
2,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,BG,743,577,559,586,,1413,1659,,,,
3,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,CH,2758,2766,2876,3109,,3209,3198,,,,
4,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,CZ,155,106,120,414,,736,1013,,,,


In [None]:
# Drop "unit" as it has only one value
immigration_ds = immigration_ds.drop(columns="unit")

### Now invert format from wide to long

In [None]:
long_columns = ["freq", "age", "agedef", "c_birth", "unit", "sex", "geo", "year", "nr"]
rows = []

years = ["2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
expected_length = len(years) * new_ds.shape[0]
i, one_perc = 0, int(expected_length / 100)
for idx, row in new_ds.iterrows():
    if i > one_perc:
        print(f"Processing row {idx}/{len(new_ds)} {(idx * 100 / len(new_ds)):.0f} %")
        i = 0
    for year in years:
        values = row.iloc[:7].tolist()
        nr = row[year]
        values.extend([year, nr])
        rows.append(values)
    i += 1

long_df = pd.DataFrame(rows, columns=long_columns)

Processing row 7415/67404 11 %
Processing row 14830/67404 22 %
Processing row 22245/67404 33 %
Processing row 29660/67404 44 %
Processing row 37075/67404 55 %
Processing row 44490/67404 66 %
Processing row 51905/67404 77 %
Processing row 59320/67404 88 %
Processing row 66735/67404 99 %


In [None]:
long_df.head(5)

Unnamed: 0,freq,age,agedef,c_birth,unit,sex,geo,year,nr
0,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT,2013,4085.0
1,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT,2014,4093.0
2,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT,2015,4723.0
3,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT,2016,4827.0
4,A,TOTAL,COMPLET,CC5_13_FOR_X_IS,NR,F,AT,2017,


In [None]:
long_df.to_csv("Immigration.csv", index=False)

In [70]:
long_df.c_birth.unique()

array(['CC5_13_FOR_X_IS', 'CC5_15_FOR', 'CC8_22_FOR', 'CC9_23_FOR',
       'EFTA_FOR', 'EU27_2020_FOR', 'EU28_FOR', 'EXT_FOR_HDI',
       'EXT_FOR_HDI_H', 'EXT_FOR_HDI_L', 'EXT_FOR_HDI_M',
       'EXT_FOR_HDI_VH', 'NAT', 'NEU27_2020_FOR', 'NEU28_FOR', 'TOTAL',
       'UNK'], dtype=object)