In [1]:
#loading the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels as sm
import re

### Population Data

In [2]:
#loading the population data
df = pd.read_csv('Kenya.csv')
df.head()

Unnamed: 0,name,num_code,year,age,sex,population
0,Kenya,404,2015,15,female,341486.56
1,Kenya,404,2015,15,male,344435.68
2,Kenya,404,2015,16,female,499980.48
3,Kenya,404,2015,16,male,504210.88
4,Kenya,404,2015,17,female,598556.48


In [3]:
# Filter data for ages 15 to 35
filtered_df = df[(df["age"] >= 15) & (df["age"] <= 35)]

# Define age groups
age_groups = pd.cut(filtered_df["age"], bins=[15, 25, 36], labels=["15-24", "25-35"])

# Add age group column to DataFrame
filtered_df["age_group"] = age_groups

# Group and aggregate population by year, age group, and sex
grouped_df = filtered_df.groupby(["year", "age_group", "sex"])["population"].sum().reset_index()

print(grouped_df)


     year age_group     sex   population
0    2015     15-24  female  4818104.640
1    2015     15-24    male  4840215.200
2    2015     25-35  female  3705181.440
3    2015     25-35    male  3669375.040
4    2016     15-24  female  4962468.832
..    ...       ...     ...          ...
99   2039     25-35    male  6319410.400
100  2040     15-24  female  7511131.840
101  2040     15-24    male  7602708.640
102  2040     25-35  female  6425941.120
103  2040     25-35    male  6429023.040

[104 rows x 4 columns]


In [4]:
# Extracting popln data for the year 2015-2024
# Use .loc to filter population data for the years 2015 to 2024
popln = grouped_df.loc[(grouped_df["year"] >= 2015) & (grouped_df["year"] <= 2024)]

print(popln)

    year age_group     sex   population
0   2015     15-24  female  4818104.640
1   2015     15-24    male  4840215.200
2   2015     25-35  female  3705181.440
3   2015     25-35    male  3669375.040
4   2016     15-24  female  4962468.832
5   2016     15-24    male  4986519.680
6   2016     25-35  female  3778345.472
7   2016     25-35    male  3742889.280
8   2017     15-24  female  5106833.024
9   2017     15-24    male  5132824.160
10  2017     25-35  female  3851509.504
11  2017     25-35    male  3816403.520
12  2018     15-24  female  5251197.216
13  2018     15-24    male  5279128.640
14  2018     25-35  female  3924673.536
15  2018     25-35    male  3889917.760
16  2019     15-24  female  5395561.408
17  2019     15-24    male  5425433.120
18  2019     25-35  female  3997837.568
19  2019     25-35    male  3963432.000
20  2020     15-24  female  5539925.600
21  2020     15-24    male  5571737.600
22  2020     25-35  female  4071001.600
23  2020     25-35    male  4036946.240


### Inactivity Rate

In [5]:
# Loading the inactivity rate by sex and age dataset
df1=pd.read_csv('Inactivity_rate.csv')
df1.head()


Unnamed: 0,indicator.label,sex.label,classif1.label,time,obs_value
0,Inactivity rate by sex and age -- ILO modelled...,Sex: Male,"Age (Youth, adults): 15-24",2024,57.193
1,Inactivity rate by sex and age -- ILO modelled...,Sex: Male,"Age (Youth, adults): 25+",2024,6.661
2,Inactivity rate by sex and age -- ILO modelled...,Sex: Female,"Age (Youth, adults): 15-24",2024,58.297
3,Inactivity rate by sex and age -- ILO modelled...,Sex: Female,"Age (Youth, adults): 25+",2024,11.648
4,Inactivity rate by sex and age -- ILO modelled...,Sex: Male,"Age (Youth, adults): 15-24",2023,57.228


In [6]:
# Use .loc to filter inactivity data for the years 2015 to 2024
filtered_df = df1.loc[df1["time"].between(2015, 2024)]
# Define mappings for classif1.label and sex.label
classif1_mapping = {
    "Age (Youth, adults): 15-24": "15-24",
    "Age (Youth, adults): 25+": "25+"
}

sex_mapping = {
    "Sex: Female": "female",
    "Sex: Male": "male"
}

# Apply mappings to create formatted columns
filtered_df["age_group"] = filtered_df["classif1.label"].map(classif1_mapping)
filtered_df["sex"] = filtered_df["sex.label"].map(sex_mapping)

# Rearrange columns to match the desired format
formatted_df = filtered_df[["time", "age_group", "sex", "obs_value"]]

# Sort the formatted data from 2015 to 2024
sorted_df = formatted_df.sort_values(by=["time", "age_group", "sex"])

print(sorted_df)

    time age_group     sex  obs_value
38  2015     15-24  female     55.928
36  2015     15-24    male     53.436
39  2015       25+  female     14.453
37  2015       25+    male      6.567
34  2016     15-24  female     56.361
32  2016     15-24    male     53.445
35  2016       25+  female     14.283
33  2016       25+    male      6.499
30  2017     15-24  female     56.906
28  2017     15-24    male     54.779
31  2017       25+  female     13.838
29  2017       25+    male      6.685
26  2018     15-24  female     57.463
24  2018     15-24    male     56.142
27  2018       25+  female     13.340
25  2018       25+    male      6.876
22  2019     15-24  female     58.043
20  2019     15-24    male     57.540
23  2019       25+  female     12.788
21  2019       25+    male      7.032
18  2020     15-24  female     58.456
16  2020     15-24    male     57.930
19  2020       25+  female     13.634
17  2020       25+    male      7.883
14  2021     15-24  female     58.395
12  2021    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["age_group"] = filtered_df["classif1.label"].map(classif1_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["sex"] = filtered_df["sex.label"].map(sex_mapping)


In [7]:
# Reset the index and drop the old index column
inactive_r = sorted_df.reset_index(drop=True)

print(inactive_r)

    time age_group     sex  obs_value
0   2015     15-24  female     55.928
1   2015     15-24    male     53.436
2   2015       25+  female     14.453
3   2015       25+    male      6.567
4   2016     15-24  female     56.361
5   2016     15-24    male     53.445
6   2016       25+  female     14.283
7   2016       25+    male      6.499
8   2017     15-24  female     56.906
9   2017     15-24    male     54.779
10  2017       25+  female     13.838
11  2017       25+    male      6.685
12  2018     15-24  female     57.463
13  2018     15-24    male     56.142
14  2018       25+  female     13.340
15  2018       25+    male      6.876
16  2019     15-24  female     58.043
17  2019     15-24    male     57.540
18  2019       25+  female     12.788
19  2019       25+    male      7.032
20  2020     15-24  female     58.456
21  2020     15-24    male     57.930
22  2020       25+  female     13.634
23  2020       25+    male      7.883
24  2021     15-24  female     58.395
25  2021    

## 1.	Apply ILO inactive rate by age and gender to corresponding population for 2015 - 2024 to get total inactive population (ILO inactive share * population)

In [8]:
# Concatenate the inactive data and population data
df3= pd.concat([inactive_r, popln], axis=1)
df3
# Calculate the total inactive population (ILO inactive share * population)
df3['total_inactive_population'] = (df3['obs_value'] / 100) * df3['population']
df3



Unnamed: 0,time,age_group,sex,obs_value,year,age_group.1,sex.1,population,total_inactive_population
0,2015,15-24,female,55.928,2015,15-24,female,4818104.64,2694670.0
1,2015,15-24,male,53.436,2015,15-24,male,4840215.2,2586417.0
2,2015,25+,female,14.453,2015,25-35,female,3705181.44,535509.9
3,2015,25+,male,6.567,2015,25-35,male,3669375.04,240967.9
4,2016,15-24,female,56.361,2016,15-24,female,4962468.832,2796897.0
5,2016,15-24,male,53.445,2016,15-24,male,4986519.68,2665045.0
6,2016,25+,female,14.283,2016,25-35,female,3778345.472,539661.1
7,2016,25+,male,6.499,2016,25-35,male,3742889.28,243250.4
8,2017,15-24,female,56.906,2017,15-24,female,5106833.024,2906094.0
9,2017,15-24,male,54.779,2017,15-24,male,5132824.16,2811710.0


In [9]:
#inactive population
inactive_population = df3[['total_inactive_population']]

#Total population by gender
total_population =df3[['population']]

### 2.	Apply ILO unemployment rate by age and gender to corresponding population for 2015-2024 to get the total unemployed population. Please note that the unemployed rate is expressed as a percentage of the labour force and not the total population. (ILO unemployed rate *(tot pop population - inactive population))

In [10]:
# read unemplyment csv
unemployment_rate = pd.read_csv('Unemployment_rate.csv')


# drop records with age 15+
unemployment_rate = unemployment_rate[unemployment_rate['classif1.label'] != 'Age (Youth, adults): 15+']

#filter unemployment data for the years 2015 to 2024
filtered_df = unemployment_rate.loc[unemployment_rate["time"].between(2015, 2024)]

# Define mappings for classif1.label and sex.label
classif1_mapping = {
    "Age (Youth, adults): 15-24": "15-24",
    "Age (Youth, adults): 25+": "25+"
}

sex_mapping = {
    "Sex: Female": "female",
    "Sex: Male": "male"
}

# Apply mappings to create formatted columns
filtered_df["age_group"] = filtered_df["classif1.label"].map(classif1_mapping)
filtered_df["sex"] = filtered_df["sex.label"].map(sex_mapping)

# Rearrange columns to match the desired format
formatted_df = filtered_df[["time", "age_group", "sex", "obs_value"]]

# Sort the formatted data from 2015 to 2024
sorted_df = formatted_df.sort_values(by=["time", "age_group", "sex"])
sorted_df

# Reset the index and drop the old index column
df4 = sorted_df.reset_index(drop=True)

print(df4)



    time age_group     sex  obs_value
0   2015     15-24  female      7.339
1   2015     15-24    male      7.337
2   2015       25+  female      1.793
3   2015       25+    male      1.354
4   2016     15-24  female      7.371
5   2016     15-24    male      7.378
6   2016       25+  female      1.787
7   2016       25+    male      1.347
8   2017     15-24  female      9.304
9   2017     15-24    male      8.798
10  2017       25+  female      2.300
11  2017       25+    male      1.959
12  2018     15-24  female     11.192
13  2018     15-24    male     10.189
14  2018       25+  female      2.793
15  2018       25+    male      2.546
16  2019     15-24  female     13.175
17  2019     15-24    male     11.675
18  2019       25+  female      3.322
19  2019       25+    male      3.166
20  2020     15-24  female     14.775
21  2020     15-24    male     12.873
22  2020       25+  female      3.597
23  2020       25+    male      3.721
24  2021     15-24  female     14.463
25  2021    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["age_group"] = filtered_df["classif1.label"].map(classif1_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["sex"] = filtered_df["sex.label"].map(sex_mapping)


In [11]:
#Total population by gender
df4['total_population'] =df3[['population']]

#inactive population
df4['inactive_population'] = df3[['total_inactive_population']]

In [12]:
#adding the labour force column to the data set
df4['labour_force']= df4['total_population'] - df4['inactive_population']
df4

Unnamed: 0,time,age_group,sex,obs_value,total_population,inactive_population,labour_force
0,2015,15-24,female,7.339,4818104.64,2694670.0,2123435.0
1,2015,15-24,male,7.337,4840215.2,2586417.0,2253798.0
2,2015,25+,female,1.793,3705181.44,535509.9,3169672.0
3,2015,25+,male,1.354,3669375.04,240967.9,3428407.0
4,2016,15-24,female,7.371,4962468.832,2796897.0,2165572.0
5,2016,15-24,male,7.378,4986519.68,2665045.0,2321474.0
6,2016,25+,female,1.787,3778345.472,539661.1,3238684.0
7,2016,25+,male,1.347,3742889.28,243250.4,3499639.0
8,2017,15-24,female,9.304,5106833.024,2906094.0,2200739.0
9,2017,15-24,male,8.798,5132824.16,2811710.0,2321114.0


In [13]:
# Calculate the total unemployed population
df4['total_unemployed_population'] = (df4['obs_value'] / 100) * df4['labour_force']
df4

Unnamed: 0,time,age_group,sex,obs_value,total_population,inactive_population,labour_force,total_unemployed_population
0,2015,15-24,female,7.339,4818104.64,2694670.0,2123435.0,155838.900297
1,2015,15-24,male,7.337,4840215.2,2586417.0,2253798.0,165361.145006
2,2015,25+,female,1.793,3705181.44,535509.9,3169672.0,56832.211187
3,2015,25+,male,1.354,3669375.04,240967.9,3428407.0,46420.633232
4,2016,15-24,female,7.371,4962468.832,2796897.0,2165572.0,159624.295432
5,2016,15-24,male,7.378,4986519.68,2665045.0,2321474.0,171278.369208
6,2016,25+,female,1.787,3778345.472,539661.1,3238684.0,57875.290018
7,2016,25+,male,1.347,3742889.28,243250.4,3499639.0,47140.13606
8,2017,15-24,female,9.304,5106833.024,2906094.0,2200739.0,204756.721518
9,2017,15-24,male,8.798,5132824.16,2811710.0,2321114.0,204211.64609


### 3.	Get total employed population by subtracting inactivity and unemployed population from total population


In [14]:
# Total employed population
df4['total_employed_population']= df4['total_population'] - (df4['inactive_population']+df4['total_unemployed_population'])
df4

Unnamed: 0,time,age_group,sex,obs_value,total_population,inactive_population,labour_force,total_unemployed_population,total_employed_population
0,2015,15-24,female,7.339,4818104.64,2694670.0,2123435.0,155838.900297,1967596.0
1,2015,15-24,male,7.337,4840215.2,2586417.0,2253798.0,165361.145006,2088437.0
2,2015,25+,female,1.793,3705181.44,535509.9,3169672.0,56832.211187,3112839.0
3,2015,25+,male,1.354,3669375.04,240967.9,3428407.0,46420.633232,3381987.0
4,2016,15-24,female,7.371,4962468.832,2796897.0,2165572.0,159624.295432,2005947.0
5,2016,15-24,male,7.378,4986519.68,2665045.0,2321474.0,171278.369208,2150196.0
6,2016,25+,female,1.787,3778345.472,539661.1,3238684.0,57875.290018,3180809.0
7,2016,25+,male,1.347,3742889.28,243250.4,3499639.0,47140.13606,3452499.0
8,2017,15-24,female,9.304,5106833.024,2906094.0,2200739.0,204756.721518,1995982.0
9,2017,15-24,male,8.798,5132824.16,2811710.0,2321114.0,204211.64609,2116903.0


### 4.	Using the employed by sex, age and economic activity indicator in the dataset, apply the shares of each ISIC REV 4 sector to the total employed population to get absolute numbers for each sector. Please note that the obs value provided by the ILO does not represent the population. It should only be used to get shares for each sector. I.e shares for agric = agric obs value/ total for ISIC REV 4 sectors. This share should then be multiplied by the employed population

In [15]:
#loading the economic data
economic_activity = pd.read_csv('Emp_economic.csv')
economic_activity.head()

Unnamed: 0,indicator.label,sex.label,classif1.label,classif2.label,time,obs_value
0,"Employment by sex, age and economic activity (...",Sex: Total,"Age (Youth, adults): 15+",Economic activity (ISIC-Rev.4): Total,2019,18724.541
1,"Employment by sex, age and economic activity (...",Sex: Total,"Age (Youth, adults): 15+",Economic activity (ISIC-Rev.4): A. Agriculture...,2019,6288.368
2,"Employment by sex, age and economic activity (...",Sex: Total,"Age (Youth, adults): 15+",Economic activity (ISIC-Rev.4): B. Mining and ...,2019,161.644
3,"Employment by sex, age and economic activity (...",Sex: Total,"Age (Youth, adults): 15+",Economic activity (ISIC-Rev.4): C. Manufacturing,2019,1283.218
4,"Employment by sex, age and economic activity (...",Sex: Total,"Age (Youth, adults): 15+",Economic activity (ISIC-Rev.4): D. Electricity...,2019,16.395


In [16]:
# drop records with Sex Total
economic_activity = economic_activity[economic_activity['sex.label'] != 'Sex: Total']

# drop records with age 15+
economic_activity = economic_activity[economic_activity['classif1.label'] != 'Age (Youth, adults): 15+']

#drop record with age 15-64
economic_activity = economic_activity[economic_activity['classif1.label'] != 'Age (Youth, adults): 15-64']
economic_activity


Unnamed: 0,indicator.label,sex.label,classif1.label,classif2.label,time,obs_value
130,"Employment by sex, age and economic activity (...",Sex: Male,"Age (Youth, adults): 15-24",Economic activity (ISIC-Rev.4): Total,2019,1613.832
131,"Employment by sex, age and economic activity (...",Sex: Male,"Age (Youth, adults): 15-24",Economic activity (ISIC-Rev.4): A. Agriculture...,2019,597.013
132,"Employment by sex, age and economic activity (...",Sex: Male,"Age (Youth, adults): 15-24",Economic activity (ISIC-Rev.4): B. Mining and ...,2019,15.825
133,"Employment by sex, age and economic activity (...",Sex: Male,"Age (Youth, adults): 15-24",Economic activity (ISIC-Rev.4): C. Manufacturing,2019,108.756
134,"Employment by sex, age and economic activity (...",Sex: Male,"Age (Youth, adults): 15-24",Economic activity (ISIC-Rev.4): D. Electricity...,2019,
...,...,...,...,...,...,...
255,"Employment by sex, age and economic activity (...",Sex: Female,"Age (Youth, adults): 25+","Economic activity (ISIC-Rev.4): R. Arts, enter...",2019,12.371
256,"Employment by sex, age and economic activity (...",Sex: Female,"Age (Youth, adults): 25+",Economic activity (ISIC-Rev.4): S. Other servi...,2019,286.629
257,"Employment by sex, age and economic activity (...",Sex: Female,"Age (Youth, adults): 25+",Economic activity (ISIC-Rev.4): T. Activities ...,2019,327.240
258,"Employment by sex, age and economic activity (...",Sex: Female,"Age (Youth, adults): 25+",Economic activity (ISIC-Rev.4): U. Activities ...,2019,


In [17]:
# Define mappings for classif1.label , classif2.label and sex.label
econ_classif1_mapping = {
    "Age (Youth, adults): 15-24": "15-24",
    "Age (Youth, adults): 25+": "25+"
}
econ_sex_mapping = {
    "Sex: Female": "female",
    "Sex: Male": "male"
}
econ_classif2_mapping = {
    'Economic activity (ISIC-Rev.4): Total': 'total',
 'Economic activity (ISIC-Rev.4): A. Agriculture; forestry and fishing': 'Agricullture, forestry and fishing',
 'Economic activity (ISIC-Rev.4): B. Mining and quarrying': 'Mining and Quarrying',
 'Economic activity (ISIC-Rev.4): C. Manufacturing': 'Manufacturing',
 'Economic activity (ISIC-Rev.4): E. Water supply; sewerage, waste management and remediation activities': ' Water supply, sewerage, waste management and remediation activities',
 'Economic activity (ISIC-Rev.4): H. Transportation and storage': 'Transportation and storage',
 'Economic activity (ISIC-Rev.4): I. Accommodation and food service activities': 'Accommodation and food service activities',
 'Economic activity (ISIC-Rev.4): J. Information and communication': 'Information and communication',
 'Economic activity (ISIC-Rev.4): K. Financial and insurance activities': 'Financial and insurance activities',
 'Economic activity (ISIC-Rev.4): L. Real estate activities': 'Real estate activities',
 'Economic activity (ISIC-Rev.4): M. Professional, scientific and technical activities': 'Professional, scientific and technical activities',
 'Economic activity (ISIC-Rev.4): N. Administrative and support service activities': 'Administrative and support service activities',
 'Economic activity (ISIC-Rev.4): O. Public administration and defence; compulsory social security': 'Public administration and defence, compulsory social security',
 'Economic activity (ISIC-Rev.4): P. Education': 'Education',
 'Economic activity (ISIC-Rev.4): D. Electricity; gas, steam and air conditioning supply': 'Electricity; gas, steam and air conditioning supply',
 'Economic activity (ISIC-Rev.4): Q. Human health and social work activities': 'Human health and social work activities',
 'Economic activity (ISIC-Rev.4): R. Arts, entertainment and recreation': 'Arts, entertainment and recreation',
 'Economic activity (ISIC-Rev.4): S. Other service activities': 'Other service activities',
 'Economic activity (ISIC-Rev.4): X. Not elsewhere classified': 'Not Elsewhere classified',
 'Economic activity (ISIC-Rev.4): F. Construction': 'Construction',
 'Economic activity (ISIC-Rev.4): G. Wholesale and retail trade; repair of motor vehicles and motorcycles': 'Wholesale and retail trade; repair of motor vehicles and motorcycles',
 'Economic activity (ISIC-Rev.4): U. Activities of extraterritorial organizations and bodies': 'Activities of extraterritorial organizations and bodies',
 'Economic activity (ISIC-Rev.4): T. Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use': 'Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use'
}

# Apply mappings to create Renamed  column values
economic_activity["age_group"] = economic_activity["classif1.label"].map(econ_classif1_mapping)
economic_activity["sex"] = economic_activity["sex.label"].map(econ_sex_mapping)
economic_activity['economic_activity'] = economic_activity['classif2.label'].map(econ_classif2_mapping)
economic_activity

#economic_activity.to_csv(csv_file_path, index=False)

# Dropping the initial classif1.label, sex.label, classif2.label
economic_activity.drop(['sex.label','classif1.label','classif2.label'],axis=1,inplace=True)



In [18]:
economic_activity

Unnamed: 0,indicator.label,time,obs_value,age_group,sex,economic_activity
130,"Employment by sex, age and economic activity (...",2019,1613.832,15-24,male,total
131,"Employment by sex, age and economic activity (...",2019,597.013,15-24,male,"Agricullture, forestry and fishing"
132,"Employment by sex, age and economic activity (...",2019,15.825,15-24,male,Mining and Quarrying
133,"Employment by sex, age and economic activity (...",2019,108.756,15-24,male,Manufacturing
134,"Employment by sex, age and economic activity (...",2019,,15-24,male,"Electricity; gas, steam and air conditioning s..."
...,...,...,...,...,...,...
255,"Employment by sex, age and economic activity (...",2019,12.371,25+,female,"Arts, entertainment and recreation"
256,"Employment by sex, age and economic activity (...",2019,286.629,25+,female,Other service activities
257,"Employment by sex, age and economic activity (...",2019,327.240,25+,female,Activities of households as employers; undiffe...
258,"Employment by sex, age and economic activity (...",2019,,25+,female,Activities of extraterritorial organizations a...


In [19]:
# split the data based on the age ( 15-24, 25+)
age_column = "age_group"

# Split the DataFrame based on age 
df_15_24= economic_activity[economic_activity[age_column] == "15-24"]
df_25_plus = economic_activity[economic_activity[age_column] == "25+"]



# splitting the 15-24 into male and female  
total_male_15_24=  df_15_24[df_15_24["sex"] == "male"][df_15_24["economic_activity"] == "total"]["obs_value"]
total_male_15_24

total_female_15_24 = df_15_24[df_15_24["sex"] == "female"][df_15_24["economic_activity"] == "total"]["obs_value"]
total_female_15_24

# Initialize an empty list to store shares
shares_list = [] 

# Loop through the DataFrame rows and calculate shares
for index, row in df_15_24.iterrows():
    if row['sex'] == 'female':
        share = (row['obs_value'] / total_female_15_24)
        #share = share[1]
        #print(f"==> {share}")  

    else:
        share = row['obs_value'] / total_male_15_24
         
    shares_list.append(share)  # Append the calculated share to the list

# Assign the shares list to the 'shares' column in the DataFrame
#df_15_24['shares'] = shares_list
#

# Share data
shr_data = {'share': 
    ['130  1.0 Name: obs_value, dtype: float64',
 '130    0.369935 Name: obs_value, dtype: float64',
 '130    0.009806 Name: obs_value, dtype: float64',
 '130    0.06739 Name: obs_value, dtype: float64',
 '130   NaN Name: obs_value, dtype: float64',
 '130    0.004135 Name: obs_value, dtype: float64',
 '130    0.085303 Name: obs_value, dtype: float64',
 '130    0.1943 Name: obs_value, dtype: float64',
 '130    0.076461 Name: obs_value, dtype: float64',
 '130    0.032888 Name: obs_value, dtype: float64',
 '130    0.005183 Name: obs_value, dtype: float64',
 '130    0.004987 Name: obs_value, dtype: float64',
 '130    0.001794 Name: obs_value, dtype: float64',
 '130    0.004162 Name: obs_value, dtype: float64',
 '130    0.01999 Name: obs_value, dtype: float64',
 '130    0.020065 Name: obs_value, dtype: float64',
 '130    0.034218 Name: obs_value, dtype: float64',
 '130    0.011259 Name: obs_value, dtype: float64',
' 130    0.004757 Name: obs_value, dtype: float64',
 '130    0.020986 Name: obs_value, dtype: float64',
 '130    0.031087 Name: obs_value, dtype: float64',
 '216    1.0 Name: obs_value, dtype: float64',
 '216    0.328172 Name: obs_value, dtype: float64',
 '216    0.007453 Name: obs_value, dtype: float64',
 '216    0.068082 Name: obs_value, dtype: float64',
 '216    0.003288 Name: obs_value, dtype: float64',
 '216    0.042384 Name: obs_value, dtype: float64',
 '216    0.206942 Name: obs_value, dtype: float64',
 '216    0.03027 Name: obs_value, dtype: float64',
 '216    0.055839 Name: obs_value, dtype: float64',
 '216    0.003184 Name: obs_value, dtype: float64',
 '216    0.016027 Name: obs_value, dtype: float64',
 '216   NaN Name: obs_value, dtype: float64',
 '216    0.005942 Name: obs_value, dtype: float64',
 '216    0.0183 Name: obs_value, dtype: float64',
 '216    0.012768 Name: obs_value, dtype: float64',
 '216    0.056535 Name: obs_value, dtype: float64',
 '216    0.018167 Name: obs_value, dtype: float64',
 '216   NaN Name: obs_value, dtype: float64',
' 216    0.044659 Name: obs_value, dtype: float64',
 '216    0.079715Name: obs_value, dtype: float64',
 '216   NaN Name: obs_value, dtype: float64']}

df_shr= pd.DataFrame(shr_data)

# Use regular expressions to extract numerical values
#df_shr['shares'] = df_shr['shares'].str.extract(r'130\s+([\d.]+)|216\s+([\d.]+)').astype(float)
#df['shares'] = df['shares'].str.extract(r'130\s+([\d.]+)').astype(float)

# Use regular expressions to extract numerical values
df_shr['shares'] = df_shr['share'].str.extract(r'\d+\s+(\d+\.\d+)').astype(float)
print(df_shr)
#df_15_24["shares"] = df_shr['shares'].str.extract(r'130\s+([\d.]+)').astype(float)
#df_15_24.to_csv('shares.csv', index=False) 


                                               share    shares
0           130  1.0 Name: obs_value, dtype: float64  1.000000
1    130    0.369935 Name: obs_value, dtype: float64  0.369935
2    130    0.009806 Name: obs_value, dtype: float64  0.009806
3     130    0.06739 Name: obs_value, dtype: float64  0.067390
4          130   NaN Name: obs_value, dtype: float64       NaN
5    130    0.004135 Name: obs_value, dtype: float64  0.004135
6    130    0.085303 Name: obs_value, dtype: float64  0.085303
7      130    0.1943 Name: obs_value, dtype: float64  0.194300
8    130    0.076461 Name: obs_value, dtype: float64  0.076461
9    130    0.032888 Name: obs_value, dtype: float64  0.032888
10   130    0.005183 Name: obs_value, dtype: float64  0.005183
11   130    0.004987 Name: obs_value, dtype: float64  0.004987
12   130    0.001794 Name: obs_value, dtype: float64  0.001794
13   130    0.004162 Name: obs_value, dtype: float64  0.004162
14    130    0.01999 Name: obs_value, dtype: float64  0

  total_male_15_24=  df_15_24[df_15_24["sex"] == "male"][df_15_24["economic_activity"] == "total"]["obs_value"]
  total_female_15_24 = df_15_24[df_15_24["sex"] == "female"][df_15_24["economic_activity"] == "total"]["obs_value"]


In [20]:
# Reset the index and drop the old index column
df_15_24 = df_15_24.reset_index(drop=True)

# combining the share data and economic activity for 15_24 age group
economic_shares=pd.concat([df_15_24,df_shr],axis=1)
economic_shares
# dropping the extra shares column
economic_shares=economic_shares.drop(['share'],axis=1)
economic_shares



Unnamed: 0,indicator.label,time,obs_value,age_group,sex,economic_activity,shares
0,"Employment by sex, age and economic activity (...",2019,1613.832,15-24,male,total,1.0
1,"Employment by sex, age and economic activity (...",2019,597.013,15-24,male,"Agricullture, forestry and fishing",0.369935
2,"Employment by sex, age and economic activity (...",2019,15.825,15-24,male,Mining and Quarrying,0.009806
3,"Employment by sex, age and economic activity (...",2019,108.756,15-24,male,Manufacturing,0.06739
4,"Employment by sex, age and economic activity (...",2019,,15-24,male,"Electricity; gas, steam and air conditioning s...",
5,"Employment by sex, age and economic activity (...",2019,6.674,15-24,male,"Water supply, sewerage, waste management and ...",0.004135
6,"Employment by sex, age and economic activity (...",2019,137.664,15-24,male,Construction,0.085303
7,"Employment by sex, age and economic activity (...",2019,313.567,15-24,male,Wholesale and retail trade; repair of motor ve...,0.1943
8,"Employment by sex, age and economic activity (...",2019,123.395,15-24,male,Transportation and storage,0.076461
9,"Employment by sex, age and economic activity (...",2019,53.076,15-24,male,Accommodation and food service activities,0.032888


#### Calculating the shares for the 25+ group

In [3]:
##  Working on the 25+ age group
 
# Reset the index and drop the old index column
df_25_plus = df_25_plus.reset_index(drop=True)
df_25_plus


NameError: name 'df_25_plus' is not defined

In [1]:
# getting the total male obs_value
total_male_25_plus=  df_25_plus[df_25_plus["sex"] == "male"][df_25_plus["economic_activity"] == "total"]["obs_value"]
total_male_25_plus
# getting the total female obs_value
total_female_25_plus = df_25_plus[df_25_plus["sex"] == "female"][df_25_plus["economic_activity"] == "total"]["obs_value"]
total_female_25_plus

# Initialize an empty list to store shares
shares_list = [] 

# Loop through the DataFrame rows and calculate shares
for index, row in df_25_plus.iterrows():
    if row['sex'] == 'female':
        share= (row['obs_value'] / total_female_25_plus)
    else:
        share= row['obs_value'] / total_male_25_plus    
    shares_list.append(share)  # Append the calculated share to the list


# Sample data
shr_data_25_plus = {'share': 
    [
'237    1.0 Name: obs_value, dtype: float64',
'237    0.363414 Name: obs_value, dtype: float64',
'237    0.006519 Name: obs_value, dtype: float64',
'237    0.0604 Name: obs_value, dtype: float64',
'237    NaN Name: obs_value, dtype: float64',
'237    0.003092 Name: obs_value, dtype: float64',
'237    0.02996 Name: obs_value, dtype: float64',
'237    0.243332 Name: obs_value, dtype: float64',
'237    0.022834 Name: obs_value, dtype: float64',
'237    0.04528 Name: obs_value, dtype: float64',
'237    0.001272 Name: obs_value, dtype: float64',
'237    0.01078 Name: obs_value, dtype: float64',
'237    0.003166 Name: obs_value, dtype: float64',
'237    0.004425 Name: obs_value, dtype: float64',
'237    0.011597 Name: obs_value, dtype: float64',
'237    0.019979 Name: obs_value, dtype: float64',
'237    0.068547 Name: obs_value, dtype: float64',
'237    0.021153 Name: obs_value, dtype: float64',
'237    0.001645 Name: obs_value, dtype: float64',
'237    0.038123 Name: obs_value, dtype: float64',
'237    0.043524 Name: obs_value, dtype: float64',
'237    NaN Name: obs_value, dtype: float64',
'237    NaN Name: obs_value, dtype: float64'
]}

#df['shares'] = df['shares'].str.extract(r'130\s+([\d.]+)').astype(float)
shr_data_25_plus= {'share':shares_list}

df_shr_25_plus= pd.DataFrame(shr_data_25_plus)
#print(df_shr_25_plus)

numbers =[]

#shr_data_25_plus= {'share'}
for entry in shr_data_25_plus['share']:
    match = re.search(r'\b(?:22|0)\s+([0-9.]+)', entry)
    if match:
        numbers.append(float(match.group(1)))

numbers


#numerical_values = [float(re.search(r'([\d.]+)', entry).group(1)) for entry in df_shr_25_plus]
#numerical_values
#df_shr_25_plus['shares'] = df_shr_25_plus['share'].str.extract(r'237\s+([\d.]+)').astype(float)
#df_shr_25_plus
#df_shr['shares'] = df_shr['share'].str.extract(r'\d+\s+(\d+\.\d+)').astype(float)df_shr_25_plus
 #print(f"{shares_list}")  df_shr_25_plus


NameError: name 'df_25_plus' is not defined

In [45]:
shr_data_25 = {'share': shares_list}
shr_data_25

{'share': [0    1.0
  Name: obs_value, dtype: float64,
  0    0.305114
  Name: obs_value, dtype: float64,
  0    0.010544
  Name: obs_value, dtype: float64,
  0    0.076296
  Name: obs_value, dtype: float64,
  0    0.001358
  Name: obs_value, dtype: float64,
  0    0.002446
  Name: obs_value, dtype: float64,
  0    0.106259
  Name: obs_value, dtype: float64,
  0    0.163471
  Name: obs_value, dtype: float64,
  0    0.096427
  Name: obs_value, dtype: float64,
  0    0.026705
  Name: obs_value, dtype: float64,
  0    0.005729
  Name: obs_value, dtype: float64,
  0    0.00963
  Name: obs_value, dtype: float64,
  0    0.00388
  Name: obs_value, dtype: float64,
  0    0.009963
  Name: obs_value, dtype: float64,
  0    0.035274
  Name: obs_value, dtype: float64,
  0    0.031288
  Name: obs_value, dtype: float64,
  0    0.06024
  Name: obs_value, dtype: float64,
  0    0.012252
  Name: obs_value, dtype: float64,
  0    0.002747
  Name: obs_value, dtype: float64,
  0    0.02075
  Name: obs_val

### 5.	Since the employed by sex, age and economic activity not modeled indicator only provides values for individual/select years, you will need to get the sector growth rate from the modeled data set (Employment by sex and economic activity -- ILO modelled estimates, Nov. 2022 (thousands) and assume that these growth rates would also apply to the youth population (15-35 years). For example, if the share of females in agriculture in the modeled dataset is 55% in 2019 and 57% in 2020 (2 percentage points increase) and you only have 2019 sectors for Kenya in the not modeled dataset (by age and sex). Assuming the share of agriculture for female youth is 49% then in 2020 you would assume the share also grows by 2 percentage points to 51%.