Author: Niamh Hogan

This notebook analyses population differences between the sexes by age in Ireland, using data collected in the 2022 census.

## Step 1: Cleaning Data

In [367]:
# imports

import pandas as pd 

In [368]:
# Load the raw population CSV from the local data folder
raw_csv_path = './data/Irish_population_cso.csv'
df = pd.read_csv(raw_csv_path)

# sanity check
# df.sample(10)

In [369]:
# Columns that are dropped before the analysis
drop_col_list = ["Statistic Label", "CensusYear", "Administrative Counties", "UNIT"]

# Reassign instead of using inplace=True: the resulting frame is the same,
# but no existing object is mutated in place, which keeps the cell easier
# to reason about and allows method chaining.
df = df.drop(columns=drop_col_list)

# sanity check
print(df.head(3))

          Sex Single Year of Age    VALUE
0  Both sexes           All ages  5149139
1  Both sexes           All ages    61968
2  Both sexes           All ages   592713


In [370]:
# Exclude the rows labelled "All ages"
all_ages_rows = df["Single Year of Age"] == "All ages"
df = df[~all_ages_rows]

In [371]:
# Exclude the rows labelled "Both sexes"; rows for the other Sex values are kept
both_sexes_rows = df["Sex"] == "Both sexes"
df = df[~both_sexes_rows]

In [372]:
# Recode the "Under 1 year" label as "0" (still a string at this point)
age_labels = df["Single Year of Age"]
df["Single Year of Age"] = age_labels.str.replace("Under 1 year", "0")


In [373]:
# Remove every non-digit character (spaces, "year"/"years") from the age labels
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.replace.html
# The pattern is a raw string: "\D" in a normal string literal is an invalid
# escape sequence and makes recent Python versions emit a warning.
df["Single Year of Age"] = df["Single Year of Age"].str.replace(r"\D", "", regex=True)


  df["Single Year of Age"] = df["Single Year of Age"].str.replace("\D", "", regex=True)


In [374]:
# Check data types
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6464 entries, 3296 to 9791
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Sex                 6464 non-null   object
 1   Single Year of Age  6464 non-null   object
 2   VALUE               6464 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 202.0+ KB
None


In [375]:
# Cast the digit-only age labels to 64-bit integers
age_as_text = df["Single Year of Age"]
df["Single Year of Age"] = age_as_text.astype("int64")

## Step 2: Converting to Pivot Table 

In [376]:
# Pivot so that each age is a row and each sex is a column
# https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html#pandas.pivot_table
# NOTE(review): aggfunc="mean" is the pandas default, spelled out here so the
# averaging is visible. The frame holds several rows per (age, sex) pair --
# presumably one per administrative area, since that column was dropped
# earlier -- so these values are means, not head-counts. Confirm this is intended.
df_anal = pd.pivot_table(
    df,
    values="VALUE",
    index="Single Year of Age",
    columns="Sex",
    aggfunc="mean",
)
print(df_anal.head(10))
# df_anal.to_csv("population_for_analysis.csv")

Sex                    Female       Male
Single Year of Age                      
0                   1761.6250  1850.6250
1                   1721.5625  1804.6875
2                   1810.8750  1889.7500
3                   1842.6875  1937.5625
4                   1863.6875  1980.3750
5                   1958.8750  2042.7500
6                   2038.8750  2130.7500
7                   2098.2500  2214.3125
8                   2152.3125  2268.5000
9                   2201.6875  2310.5625


## Task 1: Weighted mean age (by sex) and the difference between the sexes by age

In [None]:
# Load the per-age, per-sex table from the data folder.
# NOTE(review): presumably this is the pivot table built in Step 2, but the
# to_csv call there is commented out and targets a different path -- confirm
# the file exists before running on a fresh kernel.
# Relative vs absolute paths: https://phoenixnap.com/kb/absolute-path-vs-relative-path
file_path = "./data/population_for_analysis.csv"
df = pd.read_csv(file_path)

# sanity check: first three rows
print(df.head(n=3))

   Single Year of Age     Female       Male
0                   0  1761.6250  1850.6250
1                   1  1721.5625  1804.6875
2                   2  1810.8750  1889.7500
