# Data implementation, reshaping and merging

## Table of contents

### [1. Libraries and Data importations](#1)
### [2. Data Implementation](#2)
### [3. Data reshaping - melt/pivot](#3)
### [4. Data Merging - creating final dataframe](#4)
### [5. Exporting the final merged dataframe](#5)

## 1. Libraries and Data importations
<div id='1'></div>

In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import os

##to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Root of the project's data folder. The original local folder stays the default, so
# existing runs behave exactly as before; set the EUROPE_HEALTH_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
path = os.environ.get(
    'EUROPE_HEALTH_DATA_DIR',
    r'C:\Users\dacol\Documents\Data Project - careerfoundry\Europe health-deaths analysis\02 Data',
)

# Importing the cleaned pickle files: causes of death, hospital beds, physicians,
# healthcare expenditures and the unmet-medical-needs survey.
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
df_cod = pd.read_pickle(os.path.join(path,'Prepared Data','cod_cleaned.pkl'))
df_bed = pd.read_pickle(os.path.join(path,'Prepared Data','bed_cleaned.pkl'))
df_phy = pd.read_pickle(os.path.join(path,'Prepared Data','phy_cleaned.pkl'))
df_exp = pd.read_pickle(os.path.join(path,'Prepared Data','exp_cleaned.pkl'))
df_surv = pd.read_pickle(os.path.join(path,'Prepared Data','surv_cleaned.pkl'))

## 2. Data Implementation
<div id='2'></div>

### Adding the English labels for the causes of death

In [7]:
# Importing the code list that maps each ICD-10 code to its English label
list_cod = pd.read_csv(os.path.join(path,'Original Data','Code lists','ESTAT_ICD10_5.1.tsv'), sep='\t', usecols=['CODE','Label - English'])

# Left merge: every row of df_cod is kept, codes missing from the list get a NaN label
df_cod_labelled = pd.merge(left=df_cod, right=list_cod, left_on='icd10', right_on='CODE', how='left')

# Combining the code and its English name in a single cell ("<code>: <label>").
# A row-wise ': '.join raises TypeError as soon as one label is NaN, so the text is
# built vectorised and rows without a label fall back to the bare code.
icd_codes = df_cod_labelled['icd10'].astype(str)
icd_labels = df_cod_labelled['Label - English']
cause_of_death = (icd_codes + ': ' + icd_labels).where(icd_labels.notna(), icd_codes)

# Adding the combined column and dropping the helper columns without mutating in place
df_cod_complete = (
    df_cod_labelled
    .assign(cause_of_death=cause_of_death)
    .drop(columns=['unit', 'icd10', 'CODE', 'Label - English'])
)

df_cod_complete.head()

Unnamed: 0,sex,age,geo\TIME_PERIOD,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,cause_of_death
0,F,TOTAL,AT,920.47,964.83,950.52,927.37,971.67,927.98,960.74,949.4,934.95,1010.36,996.31,A-R_V-Y: All causes of death (A00-Y89) excludi...
1,F,TOTAL,AT1,979.66,1033.16,994.49,968.39,995.66,961.75,983.32,976.84,942.91,1009.08,1023.13,A-R_V-Y: All causes of death (A00-Y89) excludi...
2,F,TOTAL,AT11,1081.55,1179.17,1155.7,1167.13,1164.92,1138.07,1145.81,1141.03,1136.24,1233.96,1185.65,A-R_V-Y: All causes of death (A00-Y89) excludi...
3,F,TOTAL,AT12,995.7,1070.88,1018.99,999.36,1062.94,1023.26,1055.39,1044.82,1031.08,1078.42,1131.08,A-R_V-Y: All causes of death (A00-Y89) excludi...
4,F,TOTAL,AT13,948.17,974.83,946.26,908.85,909.08,879.93,894.9,892.04,836.12,914.04,903.91,A-R_V-Y: All causes of death (A00-Y89) excludi...


### Adding the English labels for the reasons of unmet medical needs

In [9]:
# Code list translating the survey's "reason" codes into English
list_umn = pd.read_csv(
    os.path.join(path, 'Original Data', 'Code lists', 'ESTAT_REASON_21.0.tsv'),
    sep='\t',
    usecols=['CODE', 'Label - English'],
)

# One chain: attach the English label to each survey row, give that label its final
# column name, then discard the columns that are no longer needed
df_surv_complete = (
    pd.merge(left=df_surv, right=list_umn, left_on='reason', right_on='CODE', how='left')
    .rename(columns={'Label - English': 'reason_unmet_medical_need'})
    .drop(['unit', 'reason', 'CODE'], axis=1)
)

df_surv_complete.head()

Unnamed: 0,geo\TIME_PERIOD,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,reason_unmet_medical_need
0,AL,,,,,,,0.3,0.5,0.4,0.3,0.2,"Fear of doctor, hospital, examination or treat..."
1,AL01,,,,,,,0.1,0.1,0.3,0.4,0.3,"Fear of doctor, hospital, examination or treat..."
2,AL02,,,,,,,0.3,0.4,0.3,0.2,0.1,"Fear of doctor, hospital, examination or treat..."
3,AL03,,,,,,,0.4,0.9,0.5,0.4,0.3,"Fear of doctor, hospital, examination or treat..."
4,AT,0.2,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,"Fear of doctor, hospital, examination or treat..."


## 3. Data reshaping - melt/pivot
<div id='3'></div>

### Reshaping the df causes of deaths, which will be the base for the final merge

In [12]:
# Reshaping the causes of deaths dataframe from wide to long - years to rows.
# The region column name contains a literal backslash, so it is written as a raw
# string: the plain literal relied on the invalid escape sequence "\T".
year_cols = [str(year) for year in range(2011, 2022)]  # '2011' ... '2021'
df_cod_melt = df_cod_complete.melt(
    id_vars=[r'geo\TIME_PERIOD', 'cause_of_death', 'sex', 'age'],
    value_vars=year_cols,
    var_name='year',
)

df_cod_melt.head()

Unnamed: 0,geo\TIME_PERIOD,cause_of_death,sex,age,year,value
0,AT,A-R_V-Y: All causes of death (A00-Y89) excludi...,F,TOTAL,2011,920.47
1,AT1,A-R_V-Y: All causes of death (A00-Y89) excludi...,F,TOTAL,2011,979.66
2,AT11,A-R_V-Y: All causes of death (A00-Y89) excludi...,F,TOTAL,2011,1081.55
3,AT12,A-R_V-Y: All causes of death (A00-Y89) excludi...,F,TOTAL,2011,995.7
4,AT13,A-R_V-Y: All causes of death (A00-Y89) excludi...,F,TOTAL,2011,948.17


In [13]:
# Pivot on the causes of deaths - causes of deaths to columns.
# Only three age groups are kept: the total, 65 and over (Y_GE65) and under 65 (Y_LT65).
# No `values` argument is given, so the result has two-level columns ('value', <cause>).
# The region column is written as a raw string because "\T" is not a valid escape sequence.
age_groups = ['TOTAL', 'Y_GE65', 'Y_LT65']
df_cod_pivot = (
    df_cod_melt.loc[df_cod_melt['age'].isin(age_groups)]
    .pivot(index=[r'geo\TIME_PERIOD', 'year', 'sex', 'age'], columns=['cause_of_death'])
    .reset_index()
)

In [14]:
# Preview of the pivoted frame; its columns have two levels ('value' on top, the cause of death below).
# Optional: display it inside `with pd.option_context('display.multi_sparse', False):` to change how the two-level header is rendered.
df_cod_pivot.head()

Unnamed: 0_level_0,geo\TIME_PERIOD,year,sex,age,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
cause_of_death,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,A-R_V-Y: All causes of death (A00-Y89) excluding S00-T98,A15-A19_B90: Tuberculosis,"ACC: Accidents (V01-X59, Y85, Y86)","ACC_OTH: Other accidents (W20-W64, W75-X39, X50-X59, Y86)",A_B: Certain infectious and parasitic diseases (A00-B99),A_B_OTH: Other infectious and parasitic diseases (remainder of A00-B99),...,"U_COV19_OTH: COVID-19, other",V01-Y89: External causes of morbidity and mortality (V01-Y89),V01-Y89_OTH: Other external causes of morbidity and mortality (remainder of V01-Y89),"V_Y85: Transport accidents (V01-V99, Y85)",W00-W19: Falls,W65-W74: Accidental drowning and submersion,X40-X49: Accidental poisoning by and exposure to noxious substances,X60-X84_Y870: Intentional self-harm,X85-Y09_Y871: Assault,Y10-Y34_Y872: Event of undetermined intent
0,AT,2011,F,TOTAL,920.47,0.46,24.45,11.02,8.27,4.16,...,,36.21,2.35,3.37,9.32,0.53,0.21,7.3,0.46,1.65
1,AT,2011,F,Y_GE65,4051.37,1.96,101.39,51.15,34.18,17.9,...,,127.6,9.24,6.24,41.8,1.5,0.69,13.16,0.69,3.12
2,AT,2011,F,Y_LT65,131.62,0.09,5.06,0.9,1.75,0.7,...,,13.18,0.61,2.65,1.13,0.29,0.09,5.82,0.41,1.28
3,AT,2011,M,TOTAL,880.37,0.83,35.93,12.84,7.88,3.06,...,,66.43,2.74,10.22,11.54,1.03,0.29,24.02,0.64,3.11
4,AT,2011,M,Y_GE65,4402.11,4.03,125.77,53.46,32.37,16.43,...,,192.28,10.95,17.55,51.37,3.06,0.32,51.69,0.64,3.22


### Reshaping the other df to be merged to the causes of deaths

In [16]:
# Reshaping the df with the number of physicians - years to rows.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
year_cols = [str(year) for year in range(2011, 2022)]  # '2011' ... '2021'
df_phy_melt = df_phy.melt(id_vars=[r'geo\TIME_PERIOD', 'unit'], value_vars=year_cols, var_name='year')

# Pivot on the physicians - units to columns, then give each unit code a readable name
df_phy_pivot = (
    df_phy_melt
    .pivot(index=[r'geo\TIME_PERIOD', 'year'], columns=['unit'])
    .reset_index()
    .rename(columns={
        'HAB_P': 'inhabitants_per_physician',
        'NR': 'total_physicians',
        'P_HTHAB': 'physicians_per_100K_inhabitants',
    })
)

df_phy_pivot.head(10)

Unnamed: 0_level_0,geo\TIME_PERIOD,year,value,value,value
unit,Unnamed: 1_level_1,Unnamed: 2_level_1,inhabitants_per_physician,total_physicians,physicians_per_100K_inhabitants
0,AT,2011,207.45,40452.0,482.05
1,AT,2012,205.23,41076.0,487.26
2,AT,2013,201.42,42100.0,496.47
3,AT,2014,199.27,42889.0,501.84
4,AT,2015,197.43,43775.0,506.5
5,AT,2016,195.79,44623.0,510.76
6,AT,2017,193.92,45366.0,515.67
7,AT,2018,191.71,46115.0,521.63
8,AT,2019,188.99,46987.0,529.14
9,AT,2020,188.03,47422.0,531.82


In [17]:
# Reshaping the df with the number of beds - years to rows.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
year_cols = [str(year) for year in range(2011, 2022)]  # '2011' ... '2021'
df_bed_melt = df_bed.melt(id_vars=[r'geo\TIME_PERIOD', 'unit'], value_vars=year_cols, var_name='year')

# Pivot on the beds - units to columns, then give each unit code a readable name
df_bed_pivot = (
    df_bed_melt
    .pivot(index=[r'geo\TIME_PERIOD', 'year'], columns=['unit'])
    .reset_index()
    .rename(columns={
        'HAB_P': 'inhabitants_per_bed',
        'NR': 'total_beds',
        'P_HTHAB': 'beds_per_100K_inhabitants',
    })
)

df_bed_pivot.head(10)

Unnamed: 0_level_0,geo\TIME_PERIOD,year,value,value,value
unit,Unnamed: 1_level_1,Unnamed: 2_level_1,inhabitants_per_bed,total_beds,beds_per_100K_inhabitants
0,AT,2011,130.27,64417.0,767.63
1,AT,2012,130.31,64691.0,767.39
2,AT,2013,130.81,64825.0,764.46
3,AT,2014,131.86,64815.0,758.39
4,AT,2015,132.68,65138.0,753.68
5,AT,2016,134.75,64838.0,742.14
6,AT,2017,135.75,64805.0,736.62
7,AT,2018,137.52,64285.0,727.16
8,AT,2019,139.1,63838.0,718.9
9,AT,2020,141.82,62873.0,705.1


In [18]:
# Reshaping the df with the healthcare expenditures by providers - years to rows.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
year_cols = [str(year) for year in range(2011, 2022)]  # '2011' ... '2021'
df_exp_melt = df_exp.melt(id_vars=[r'geo\TIME_PERIOD', 'unit', 'icha11_hp'], value_vars=year_cols, var_name='year')

# Pivot on the expenditures, keeping only euros per inhabitant for all providers - units to columns.
# The pivot also spreads the leftover 'icha11_hp' column, which is dropped again at the end.
is_eur_per_inhabitant = df_exp_melt['unit'] == 'EUR_HAB'
is_all_providers = df_exp_melt['icha11_hp'] == 'TOTAL'
df_exp_pivot = (
    df_exp_melt.loc[is_eur_per_inhabitant & is_all_providers]
    .pivot(index=[r'geo\TIME_PERIOD', 'year'], columns=['unit'])
    .reset_index()
    .rename(columns={'EUR_HAB': 'euros_per_inhabitant'})
    .drop(columns='icha11_hp')
)

df_exp_pivot.head(10)

Unnamed: 0_level_0,geo\TIME_PERIOD,year,value
unit,Unnamed: 1_level_1,Unnamed: 2_level_1,euros_per_inhabitant
0,AT,2011,3705.34
1,AT,2012,3855.27
2,AT,2013,3928.93
3,AT,2014,4041.59
4,AT,2015,4129.77
5,AT,2016,4237.37
6,AT,2017,4359.69
7,AT,2018,4510.08
8,AT,2019,4690.47
9,AT,2020,4833.86


In [19]:
# Reshaping the df with the reasons for unmet medical needs (UMN) - years to rows.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
year_cols = [str(year) for year in range(2011, 2022)]  # '2011' ... '2021'
df_surv_melt = df_surv_complete.melt(
    id_vars=[r'geo\TIME_PERIOD', 'reason_unmet_medical_need'],
    value_vars=year_cols,
    var_name='year',
)

# Pivot on the UMN - reasons to columns
df_surv_pivot = (
    df_surv_melt
    .pivot(index=[r'geo\TIME_PERIOD', 'year'], columns=['reason_unmet_medical_need'])
    .reset_index()
)

df_surv_pivot.head()

Unnamed: 0_level_0,geo\TIME_PERIOD,year,value,value,value,value,value,value,value,value,value,value
reason_unmet_medical_need,Unnamed: 1_level_1,Unnamed: 2_level_1,Didn't know any good doctor or specialist,"Fear of doctor, hospital, examination or treatment",No time,No unmet needs to declare,Other reason,Too expensive,Too expensive or too far to travel or waiting list,Too far to travel,Waiting list,Wanted to wait and see if problem got better on its own
0,AL,2011,,,,,,,,,,
1,AL,2012,,,,,,,,,,
2,AL,2013,,,,,,,,,,
3,AL,2014,,,,,,,,,,
4,AL,2015,,,,,,,,,,


## 4. Data Merging - creating final dataframe
<div id='4'></div>

### Creating the merging keys

In [22]:
# Key on causes of death: "<geo>.<year>.<sex>.<age>", e.g. "AT.2011.F.TOTAL".
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_cod_pivot['key'] = df_cod_pivot[[r'geo\TIME_PERIOD', 'year', 'sex', 'age']].agg('.'.join, axis=1)
df_cod_pivot['key'].head()

0     AT.2011.F.TOTAL
1    AT.2011.F.Y_GE65
2    AT.2011.F.Y_LT65
3     AT.2011.M.TOTAL
4    AT.2011.M.Y_GE65
Name: key, dtype: object

In [23]:
# Key on the physicians: "<geo>.<year>.T.TOTAL", e.g. "AT.2011.T.TOTAL".
# The fixed ".T.TOTAL" suffix is appended as a plain string rather than a one-element
# list, so the concatenation does not depend on length-1 broadcasting.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_phy_pivot['key'] = df_phy_pivot[[r'geo\TIME_PERIOD', 'year']].agg('.'.join, axis=1) + '.T.TOTAL'
df_phy_pivot['key'].head()

0    AT.2011.T.TOTAL
1    AT.2012.T.TOTAL
2    AT.2013.T.TOTAL
3    AT.2014.T.TOTAL
4    AT.2015.T.TOTAL
Name: key, dtype: object

In [24]:
# Key on the beds: "<geo>.<year>.T.TOTAL", e.g. "AT.2011.T.TOTAL".
# The fixed ".T.TOTAL" suffix is appended as a plain string rather than a one-element
# list, so the concatenation does not depend on length-1 broadcasting.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_bed_pivot['key'] = df_bed_pivot[[r'geo\TIME_PERIOD', 'year']].agg('.'.join, axis=1) + '.T.TOTAL'
df_bed_pivot['key'].head()

0    AT.2011.T.TOTAL
1    AT.2012.T.TOTAL
2    AT.2013.T.TOTAL
3    AT.2014.T.TOTAL
4    AT.2015.T.TOTAL
Name: key, dtype: object

In [25]:
# Key on the expenditures: "<geo>.<year>.T.TOTAL", e.g. "AT.2011.T.TOTAL".
# The fixed ".T.TOTAL" suffix is appended as a plain string rather than a one-element
# list, so the concatenation does not depend on length-1 broadcasting.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_exp_pivot['key'] = df_exp_pivot[[r'geo\TIME_PERIOD', 'year']].agg('.'.join, axis=1) + '.T.TOTAL'
df_exp_pivot['key'].head()

0    AT.2011.T.TOTAL
1    AT.2012.T.TOTAL
2    AT.2013.T.TOTAL
3    AT.2014.T.TOTAL
4    AT.2015.T.TOTAL
Name: key, dtype: object

In [26]:
# Key on the survey of unmet medical needs: "<geo>.<year>.T.TOTAL", e.g. "AL.2011.T.TOTAL".
# The fixed ".T.TOTAL" suffix is appended as a plain string rather than a one-element
# list, so the concatenation does not depend on length-1 broadcasting.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_surv_pivot['key'] = df_surv_pivot[[r'geo\TIME_PERIOD', 'year']].agg('.'.join, axis=1) + '.T.TOTAL'
df_surv_pivot['key'].head()

0    AL.2011.T.TOTAL
1    AL.2012.T.TOTAL
2    AL.2013.T.TOTAL
3    AL.2014.T.TOTAL
4    AL.2015.T.TOTAL
Name: key, dtype: object

### Merging all dataframes
Only the rows holding the totals for all sexes and ages (keys ending in `.T.TOTAL`) will match. The remaining NaN values will be filled in using a ratio derived from the total numbers for each region and year.

In [28]:
# Joining the dataframes on the shared key

# Causes of death (left side, every row kept) + physician indicators;
# rows whose key has no counterpart get NaN in the added columns
df_cod_phy_merged = pd.merge(df_cod_pivot, df_phy_pivot[['key', 'value']], how='left', on='key')
df_cod_phy_merged.shape

(48609, 100)

In [29]:
# Next left join: healthcare expenditure per inhabitant, matched on the same key
df_cod_phy_exp_merged = pd.merge(df_cod_phy_merged, df_exp_pivot[['key', 'value']], how='left', on='key')
df_cod_phy_exp_merged.shape

(48609, 101)

In [30]:
# Next left join: available hospital beds, matched on the same key
df_cod_phy_exp_bed_merged = pd.merge(df_cod_phy_exp_merged, df_bed_pivot[['key', 'value']], how='left', on='key')
df_cod_phy_exp_bed_merged.shape

(48609, 104)

In [31]:
# Last left join: survey results on unmet medical needs, matched on the same key
df_merged_all = pd.merge(df_cod_phy_exp_bed_merged, df_surv_pivot[['key', 'value']], how='left', on='key')
df_merged_all.shape

(48609, 114)

In [32]:
# Preview of the fully merged frame
df_merged_all.head()

Unnamed: 0_level_0,geo\TIME_PERIOD,year,sex,age,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,A-R_V-Y: All causes of death (A00-Y89) excluding S00-T98,A15-A19_B90: Tuberculosis,"ACC: Accidents (V01-X59, Y85, Y86)","ACC_OTH: Other accidents (W20-W64, W75-X39, X50-X59, Y86)",A_B: Certain infectious and parasitic diseases (A00-B99),A_B_OTH: Other infectious and parasitic diseases (remainder of A00-B99),...,Didn't know any good doctor or specialist,"Fear of doctor, hospital, examination or treatment",No time,No unmet needs to declare,Other reason,Too expensive,Too expensive or too far to travel or waiting list,Too far to travel,Waiting list,Wanted to wait and see if problem got better on its own
0,AT,2011,F,TOTAL,920.47,0.46,24.45,11.02,8.27,4.16,...,,,,,,,,,,
1,AT,2011,F,Y_GE65,4051.37,1.96,101.39,51.15,34.18,17.9,...,,,,,,,,,,
2,AT,2011,F,Y_LT65,131.62,0.09,5.06,0.9,1.75,0.7,...,,,,,,,,,,
3,AT,2011,M,TOTAL,880.37,0.83,35.93,12.84,7.88,3.06,...,,,,,,,,,,
4,AT,2011,M,Y_GE65,4402.11,4.03,125.77,53.46,32.37,16.43,...,,,,,,,,,,


In [33]:
# Resetting the multi-indexed columns in order to get a single level.
# Each (top, bottom) pair is joined with '_', the 'value_' prefix coming from the
# pivots is removed and leftover underscores are trimmed from both ends.
# The nlevels guard makes the cell safe to re-run: once the columns are flat,
# '_'.join would otherwise be applied to plain strings and insert an underscore
# between every character of each column name.
if df_merged_all.columns.nlevels > 1:
    df_merged_all.columns = (
        df_merged_all.columns
        .map('_'.join)
        .str.replace('value_', '', regex=False)  # literal text, not a pattern
        .str.strip('_')
    )

In [34]:
# Preview of the merged frame again, to check its column header
df_merged_all.head()

Unnamed: 0,geo\TIME_PERIOD,year,sex,age,A-R_V-Y: All causes of death (A00-Y89) excluding S00-T98,A15-A19_B90: Tuberculosis,"ACC: Accidents (V01-X59, Y85, Y86)","ACC_OTH: Other accidents (W20-W64, W75-X39, X50-X59, Y86)",A_B: Certain infectious and parasitic diseases (A00-B99),A_B_OTH: Other infectious and parasitic diseases (remainder of A00-B99),...,Didn't know any good doctor or specialist,"Fear of doctor, hospital, examination or treatment",No time,No unmet needs to declare,Other reason,Too expensive,Too expensive or too far to travel or waiting list,Too far to travel,Waiting list,Wanted to wait and see if problem got better on its own
0,AT,2011,F,TOTAL,920.47,0.46,24.45,11.02,8.27,4.16,...,,,,,,,,,,
1,AT,2011,F,Y_GE65,4051.37,1.96,101.39,51.15,34.18,17.9,...,,,,,,,,,,
2,AT,2011,F,Y_LT65,131.62,0.09,5.06,0.9,1.75,0.7,...,,,,,,,,,,
3,AT,2011,M,TOTAL,880.37,0.83,35.93,12.84,7.88,3.06,...,,,,,,,,,,
4,AT,2011,M,Y_GE65,4402.11,4.03,125.77,53.46,32.37,16.43,...,,,,,,,,,,


### Adding the code names to the countries and regions

In [36]:
# Importing the code list for the regions and countries
list_country = pd.read_csv(os.path.join(path,'Original Data','Code lists','ESTAT_GEO_22.0.tsv'), sep='\t', usecols=['CODE','Label - English'])

# Merging the list to the geo codes in df_merged_all.
# Only df_all is assigned: the former chained assignment (df_all = df_merged_all = ...)
# also overwrote the merge input, so running the cell twice merged the labels a second
# time and produced suffixed duplicate columns.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_all = pd.merge(
    left=df_merged_all,
    right=list_country[['CODE', 'Label - English']],
    left_on=r'geo\TIME_PERIOD',
    right_on='CODE',
    how='left',
)

df_all.head()

Unnamed: 0,geo\TIME_PERIOD,year,sex,age,A-R_V-Y: All causes of death (A00-Y89) excluding S00-T98,A15-A19_B90: Tuberculosis,"ACC: Accidents (V01-X59, Y85, Y86)","ACC_OTH: Other accidents (W20-W64, W75-X39, X50-X59, Y86)",A_B: Certain infectious and parasitic diseases (A00-B99),A_B_OTH: Other infectious and parasitic diseases (remainder of A00-B99),...,No time,No unmet needs to declare,Other reason,Too expensive,Too expensive or too far to travel or waiting list,Too far to travel,Waiting list,Wanted to wait and see if problem got better on its own,CODE,Label - English
0,AT,2011,F,TOTAL,920.47,0.46,24.45,11.02,8.27,4.16,...,,,,,,,,,AT,Austria
1,AT,2011,F,Y_GE65,4051.37,1.96,101.39,51.15,34.18,17.9,...,,,,,,,,,AT,Austria
2,AT,2011,F,Y_LT65,131.62,0.09,5.06,0.9,1.75,0.7,...,,,,,,,,,AT,Austria
3,AT,2011,M,TOTAL,880.37,0.83,35.93,12.84,7.88,3.06,...,,,,,,,,,AT,Austria
4,AT,2011,M,Y_GE65,4402.11,4.03,125.77,53.46,32.37,16.43,...,,,,,,,,,AT,Austria


In [37]:
# Dropping the helper 'CODE' column, renaming the geo code and the English label of the
# countries and regions, and placing the label in the second position.
# errors='ignore' keeps the cell re-runnable: on a second run 'CODE' is already gone and
# the renames are no-ops.
# The region column is written as a raw string because "\T" is not a valid escape sequence.
df_all = (
    df_all
    .rename(columns={'Label - English': 'country_region', r'geo\TIME_PERIOD': 'geo_code'})
    .drop(columns='CODE', errors='ignore')
)

# pop() removes the column and returns it; insert() puts it back at position 1
df_all.insert(1, 'country_region', df_all.pop('country_region'))

df_all.head()

Unnamed: 0,geo_code,country_region,year,sex,age,A-R_V-Y: All causes of death (A00-Y89) excluding S00-T98,A15-A19_B90: Tuberculosis,"ACC: Accidents (V01-X59, Y85, Y86)","ACC_OTH: Other accidents (W20-W64, W75-X39, X50-X59, Y86)",A_B: Certain infectious and parasitic diseases (A00-B99),...,Didn't know any good doctor or specialist,"Fear of doctor, hospital, examination or treatment",No time,No unmet needs to declare,Other reason,Too expensive,Too expensive or too far to travel or waiting list,Too far to travel,Waiting list,Wanted to wait and see if problem got better on its own
0,AT,Austria,2011,F,TOTAL,920.47,0.46,24.45,11.02,8.27,...,,,,,,,,,,
1,AT,Austria,2011,F,Y_GE65,4051.37,1.96,101.39,51.15,34.18,...,,,,,,,,,,
2,AT,Austria,2011,F,Y_LT65,131.62,0.09,5.06,0.9,1.75,...,,,,,,,,,,
3,AT,Austria,2011,M,TOTAL,880.37,0.83,35.93,12.84,7.88,...,,,,,,,,,,
4,AT,Austria,2011,M,Y_GE65,4402.11,4.03,125.77,53.46,32.37,...,,,,,,,,,,


In [43]:
# Quick cleaning before the export; the shape is printed before and after for comparison
print(df_all.shape)

# The merge key has served its purpose and is not exported
df_without_key = df_all.drop(columns='key')

# Rows in which every numeric variable is NaN carry no information and are removed
num_cols = list(df_without_key.select_dtypes(include=np.number).columns)
df_all_cl = df_without_key.dropna(how='all', subset=num_cols)
print(df_all_cl.shape)

(48609, 115)
(45692, 114)


## 5. Exporting the final merged dataframe
<div id='5'></div>

In [45]:
# Writing the final dataframe as a pickle into the prepared-data folder
export_file = os.path.join(path, 'Prepared Data', 'cod_merged_full.pkl')
df_all_cl.to_pickle(export_file)