In [2]:
import requests
import pandas as pd


In [3]:
url = "https://api.worldbank.org/v2/country/all/indicator/SL.EMP.SMGT.FE.ZS?format=json&per_page=20000"

response = requests.get(url)
data = response.json()


In [4]:
type(data), len(data)


(list, 2)

In [5]:
records = data[1]
df = pd.DataFrame(records)

df.head()


Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'SL.EMP.SMGT.FE.ZS', 'value': 'Female s...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2024,,,,0
1,"{'id': 'SL.EMP.SMGT.FE.ZS', 'value': 'Female s...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2023,,,,0
2,"{'id': 'SL.EMP.SMGT.FE.ZS', 'value': 'Female s...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2022,,,,0
3,"{'id': 'SL.EMP.SMGT.FE.ZS', 'value': 'Female s...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2021,,,,0
4,"{'id': 'SL.EMP.SMGT.FE.ZS', 'value': 'Female s...","{'id': 'ZH', 'value': 'Africa Eastern and Sout...",AFE,2020,,,,0


In [6]:
df_clean = df[[
    "countryiso3code",
    "country",
    "date",
    "value"
]].copy()

df_clean.rename(columns={
    "countryiso3code": "country_code",
    "date": "year",
    "value": "female_share_management"
}, inplace=True)

df_clean.head()


Unnamed: 0,country_code,country,year,female_share_management
0,AFE,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2024,
1,AFE,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2023,
2,AFE,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2022,
3,AFE,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2021,
4,AFE,"{'id': 'ZH', 'value': 'Africa Eastern and Sout...",2020,


In [7]:
output_path = "../data/raw/women_management_worldbank.csv"
df_clean.to_csv(output_path, index=False)

output_path


'../data/raw/women_management_worldbank.csv'

### Paid Parental Leave Dataset – Initial Assessment

- The dataset was successfully loaded from the World Bank source.
- It contains country-year observations measuring the length of paid parental leave in calendar days.
- Coverage spans approximately 1970–2023, overlapping with the management representation dataset.
- Missing data is present in several country-year observations, particularly for earlier years.
- This dataset will be used to construct a composite parental leave policy strength score in later stages.


In [13]:
#Length of paid leave data
import pandas as pd

leave_df = pd.read_csv("../data/raw/length_paid_leave.csv")

leave_df.head()

leave_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30618 entries, 0 to 30617
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Indicator Name  30618 non-null  object 
 1   Indicator Code  30618 non-null  object 
 2   Country Name    30618 non-null  object 
 3   Country Code    30618 non-null  object 
 4   Year            30618 non-null  int64  
 5   Value           30618 non-null  float64
 6   Disaggregation  30618 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.6+ MB


In [14]:
leave_df.isna().mean().sort_values(ascending=False) * 100


Indicator Name    0.0
Indicator Code    0.0
Country Name      0.0
Country Code      0.0
Year              0.0
Value             0.0
Disaggregation    0.0
dtype: float64

In [16]:
leave_df.columns



Index(['Indicator Name', 'Indicator Code', 'Country Name', 'Country Code',
       'Year', 'Value', 'Disaggregation'],
      dtype='object')

In [17]:
leave_df['Year'].describe()


count    30618.000000
mean      1996.500000
std         15.586039
min       1970.000000
25%       1983.000000
50%       1996.500000
75%       2010.000000
max       2023.000000
Name: Year, dtype: float64

In [18]:
leave_df['Indicator Name'].value_counts()


Indicator Name
Length of paid maternity leave (calendar days)          10206
Length of paid paternity leave (calendar days)          10206
Length of paid shared parental leave (calendar days)    10206
Name: count, dtype: int64

In [19]:
leave_df['Disaggregation'].value_counts()


Disaggregation
female    10206
male      10206
total     10206
Name: count, dtype: int64

In [20]:
leave_filtered = leave_df[
    leave_df['Indicator Name'].isin([
        'Length of paid maternity leave (calendar days)',
        'Length of paid paternity leave (calendar days)',
        'Length of paid shared parental leave (calendar days)'
    ])
]


In [21]:
leave_filtered.shape
leave_filtered[['Indicator Name', 'Disaggregation']].value_counts()


Indicator Name                                        Disaggregation
Length of paid maternity leave (calendar days)        female            10206
Length of paid paternity leave (calendar days)        male              10206
Length of paid shared parental leave (calendar days)  total             10206
Name: count, dtype: int64

In [22]:
leave_filtered = leave_filtered[
    leave_filtered['Year'].between(2000, 2023)
]


In [23]:
#This standardizes column names

leave_clean = leave_filtered.rename(columns={
    'Country Name': 'country',
    'Country Code': 'country_code',
    'Year': 'year',
    'Value': 'leave_days',
    'Indicator Name': 'leave_type'
})


In [24]:
leave_clean = leave_filtered.rename(columns={
    'Country Name': 'country',
    'Country Code': 'country_code',
    'Year': 'year',
    'Value': 'leave_days',
    'Indicator Name': 'leave_type'
})


In [26]:
leave_wide = leave_clean.pivot_table(
    index=['country', 'country_code', 'year'],
    columns='leave_type',
    values='leave_days'
).reset_index()


In [27]:
leave_wide = leave_wide.rename(columns={
    'Length of paid maternity leave (calendar days)': 'maternity_leave_days',
    'Length of paid paternity leave (calendar days)': 'paternity_leave_days',
    'Length of paid shared parental leave (calendar days)': 'shared_parental_leave_days'
})


In [28]:
leave_wide.head()
leave_wide.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4536 entries, 0 to 4535
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   country                     4536 non-null   object 
 1   country_code                4536 non-null   object 
 2   year                        4536 non-null   int64  
 3   maternity_leave_days        4536 non-null   float64
 4   paternity_leave_days        4536 non-null   float64
 5   shared_parental_leave_days  4536 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 212.8+ KB


In [30]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

policy_cols = [
    'maternity_leave_days',
    'paternity_leave_days',
    'shared_parental_leave_days'
]

leave_wide[[c + '_norm' for c in policy_cols]] = scaler.fit_transform(
    leave_wide[policy_cols]
)

leave_wide.describe()


leave_type,year,maternity_leave_days,paternity_leave_days,shared_parental_leave_days,maternity_leave_days_norm,paternity_leave_days_norm,shared_parental_leave_days_norm
count,4536.0,4536.0,4536.0,4536.0,4536.0,4536.0,4536.0
mean,2011.5,99.634039,3.701499,67.401235,0.156904,0.018885,0.046165
std,6.92295,66.950459,10.733474,204.917032,0.105434,0.054763,0.140354
min,2000.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2005.75,84.0,0.0,0.0,0.132283,0.0,0.0
50%,2011.5,91.0,0.0,0.0,0.143307,0.0,0.0
75%,2017.25,112.0,3.0,0.0,0.176378,0.015306,0.0
max,2023.0,635.0,196.0,1460.0,1.0,1.0,1.0


In [31]:
leave_wide['leave_policy_strength'] = (
    leave_wide['maternity_leave_days_norm'] +
    leave_wide['paternity_leave_days_norm'] +
    leave_wide['shared_parental_leave_days_norm']
)


In [32]:
leave_wide['leave_policy_strength'] = (
    leave_wide['leave_policy_strength'] / 3
)


In [35]:
df_clean[['year']].dtypes
leave_wide[['year']].dtypes


leave_type
year    int64
dtype: object

In [36]:
df_clean['year'] = pd.to_numeric(df_clean['year'], errors='coerce')


In [37]:
df_clean['year'] = df_clean['year'].astype(int)
leave_wide['year'] = leave_wide['year'].astype(int)


In [38]:
merged = pd.merge(
    df_clean,
    leave_wide,
    on=['country_code', 'year'],
    how='inner'
)


In [39]:
merged.shape
merged[['country_code','year','female_share_management','leave_policy_strength']].head()


Unnamed: 0,country_code,year,female_share_management,leave_policy_strength
0,AFG,2023,,0.071054
1,AFG,2022,,0.071054
2,AFG,2021,,0.071054
3,AFG,2020,5.889,0.071054
4,AFG,2019,,0.071054


In [41]:
analysis_df = merged.dropna(subset=['female_share_management'])
analysis_df.shape
analysis_df.describe()



Unnamed: 0,year,female_share_management,maternity_leave_days,paternity_leave_days,shared_parental_leave_days,maternity_leave_days_norm,paternity_leave_days_norm,shared_parental_leave_days_norm,leave_policy_strength
count,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0,1650.0
mean,2013.281818,30.633521,117.95697,7.798788,108.272727,0.185759,0.03979,0.074159,0.099903
std,6.581944,10.090034,71.505664,16.500679,225.143807,0.112607,0.084187,0.154208,0.075441
min,2000.0,1.194,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2008.0,24.16025,84.0,0.0,0.0,0.132283,0.0,0.0,0.050645
50%,2014.0,31.367,112.0,2.0,0.0,0.176378,0.010204,0.0,0.067999
75%,2019.0,37.36675,126.0,13.0,112.0,0.198425,0.066327,0.076712,0.135282
max,2023.0,74.193,410.0,196.0,1460.0,0.645669,1.0,1.0,0.663519


In [42]:
analysis_df.to_csv("../data/processed/analysis_dataset.csv", index=False)
