# Assignment05 Population Analysis in Ireland
# Author: Michal Gondek 

In [34]:
# Import and Load Data

# Reference for Pandas: https://www.w3schools.com/python/pandas/default.asp
# Reference for Matplotlib: https://www.w3schools.com/python/matplotlib_pyplot.asp
import pandas as pd
import numpy as np

# Load dataset 
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/FY006A/CSV/1.0/en"
df = pd.read_csv(url)


In [35]:
# Keep only relevant columns and rename for convenience 
df = df[['Sex', 'Single Year of Age', 'Administrative Counties', 'VALUE']]


# Part 1 Weighted mean age by sex
# Clean Data 

In [36]:
# Clean the 'Single Year of Age' column 
df['Single Year of Age'] = df['Single Year of Age'].str.replace(' years', '', regex=False)
df['Single Year of Age'] = df['Single Year of Age'].str.replace('All ages', '', regex=False)
df['Single Year of Age'] = df['Single Year of Age'].str.replace('Under 1 year', '0', regex=False)
df['Single Year of Age'] = pd.to_numeric(df['Single Year of Age'], errors='coerce')
df = df.dropna(subset=['Single Year of Age'])
df['Single Year of Age'] = df['Single Year of Age'].astype(int)

In [37]:
# Filter national data only for Part 1 
df_national = df[df['Administrative Counties'] == 'Ireland']

## Calculate weighted mean age using age as value and population as weight

In [38]:
# Weighted mean accounts for the fact that some age groups have more individuals that others
# The population for each age influences how much it contributes to the final mean 

# I will begin with each variable listed as none so the script can later check whether both values were computed
female_mean_age = None
male_mean_age = None

# For each sex I created a subset containing only rows for that sex
for sex in df_national['Sex'].unique():
    subset = df_national[df_national['Sex'] == sex]
    w_mean = np.average(subset['Single Year of Age'], weights=subset['VALUE'])
    print(f'Weighted mean age for {sex}: {w_mean:.2f}')
    
    if sex == 'Female':
        female_mean_age = w_mean
    elif sex == 'Male':
        male_mean_age = w_mean

# Calculate difference if both sexes are present
# Print and store results
if female_mean_age is not None and male_mean_age is not None:
    difference = female_mean_age - male_mean_age
    print(f'Difference (Female - Male): {difference:.2f} years')
else:
    print("Both Male and Female data not found for national level")






Weighted mean age for Both sexes: 38.75
Weighted mean age for Male: 38.16
Weighted mean age for Female: 39.33
Difference (Female - Male): 1.17 years


# Part 2
## Age Group Write that code that would group the people within 5 years of that age together
## Using age 35

In [39]:

# Using age 35 I used target function to specify that age 
# Using age range sets the 5+- window creating an age group from 30 to 40 years old
target_age = 35
age_range = 5   

# Filter out Both Sexes row
age_group_filtered = age_group[age_group['Sex'].isin(['Male', 'Female'])]


# Sum population by sex in this age group
pivot_age_group = age_group_filtered.pivot_table(
    index='Sex',
    values='VALUE',
    aggfunc='sum'
)  

# Print population by sex in this age group first
print('Population by sex in this age group:')
print(pivot_age_group)



# Calculate difference between female and male 
male_pop = pivot_age_group.loc['Male', 'VALUE']
female_pop = pivot_age_group.loc['Female', 'VALUE']

pop_diff = female_pop - male_pop

# Print Results
print(f"\nPopulation difference for ages {target_age-age_range} to {target_age+age_range} (Female - Male): {pop_diff}")

Population by sex in this age group:
         VALUE
Sex           
Female  414506
Male    384030

Population difference for ages 30 to 40 (Female - Male): 30476


# Part 3
## Region with biggest sex difference in this age group


In [53]:
# Convert age from string like "013" to integer 13
df['Single Year of Age'] = df['Single Year of Age'].astype(int)

# Filter to your age range, example: 30â€“40
df_national = df[(df['Single Year of Age'] >= 30) & (df['Single Year of Age'] <= 40)]

# Remove national total ("Ireland")
df_regions = df_national[df_national['Administrative Counties'] != 'Ireland']

# Keep Male + Female
df_regions = df_regions[df_regions['Sex'].isin(['Male', 'Female'])]

if df_regions.empty:
    print("No regional population data exists for this age range.")
else:
    pivot_region = df_regions.pivot_table(
        index='Administrative Counties',
        columns='Sex',
        values='VALUE',
        aggfunc='sum',
        fill_value=0
    )

    pivot_region['Difference'] = pivot_region['Female'] - pivot_region['Male']
    pivot_region['AbsDiff'] = pivot_region['Difference'].abs()

    biggest_diff_region = pivot_region.sort_values('AbsDiff', ascending=False).head(1)

    print("\nRegion with the biggest population difference between the sexes:")
    print(biggest_diff_region[['Male', 'Female', 'Difference']])



Region with the biggest population difference between the sexes:
Sex                       Male  Female  Difference
Administrative Counties                           
Fingal County Council    26150   29092        2942
