In [2]:
##IMPORTING LIBRARIES
import pandas as pd
import statistics as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Creating a DataFrame with data from file with Estimated Population statistic
df_pop = pd.read_csv("PEA04.csv")
df_pop.sample(8)

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,Region,UNIT,VALUE
277,Estimated Population (Persons in April),2011,50 - 54 years,Both sexes,Mid-East,Thousand,38.2
4505,Estimated Population (Persons in April),2019,70 - 74 years,Female,South-West,Thousand,14.7
2004,Estimated Population (Persons in April),2014,85 years and over,Both sexes,Dublin,Thousand,16.6
4793,Estimated Population (Persons in April),2020,30 - 34 years,Male,South-West,Thousand,22.0
4373,Estimated Population (Persons in April),2019,45 - 49 years,Female,Midland,Thousand,10.8
4236,Estimated Population (Persons in April),2019,20 - 24 years,Female,Dublin,Thousand,45.9
5715,Estimated Population (Persons in April),2022,10 - 14 years,Female,State,Thousand,183.4
2128,Estimated Population (Persons in April),2015,10 - 14 years,Female,South-East,Thousand,14.6


In [4]:
#Explore information about the DataFrame, i.e. nr of rows, nr of not null values, data types for columns, etc...
df_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6669 entries, 0 to 6668
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  6669 non-null   object 
 1   Year             6669 non-null   int64  
 2   Age Group        6669 non-null   object 
 3   Sex              6669 non-null   object 
 4   Region           6669 non-null   object 
 5   UNIT             6669 non-null   object 
 6   VALUE            6669 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 364.8+ KB


In [5]:
#using the transpose view for more comfortable reading
df_pop.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,6669.0,2017.0,3.741938,2011.0,2014.0,2017.0,2020.0,2023.0
VALUE,6669.0,75.67046,281.512172,1.2,12.5,20.7,42.2,5281.6


In [6]:
#Checking available age groups in the DataFrame with Estimated Population
df_pop["Age Group"].unique()

array(['0 - 4 years', '5 - 9 years', '10 - 14 years', '15 - 19 years',
       '20 - 24 years', '25 - 29 years', '30 - 34 years', '35 - 39 years',
       '40 - 44 years', '45 - 49 years', '50 - 54 years', '55 - 59 years',
       '60 - 64 years', '65 - 69 years', '70 - 74 years', '75 - 79 years',
       '80 - 84 years', '85 years and over', 'All ages'], dtype=object)

In [7]:
df_f_age = pd.read_csv("ICA108.csv")

In [8]:
df_f_age["Age Group"].unique() 

array(['16 - 29 years', '30 - 44 years', '45 - 59 years', '60 - 74 years',
       '75 years and over'], dtype=object)

In [9]:
#Function that converts Age Groups by combining several age groups in bigger one
def age_conv (x):
    if(x["Age Group"]=="0 - 4 years" or x["Age Group"]=="5 - 9 years" or x["Age Group"]=="10 - 14 years"):
        age = "0 - 14 years" 
    elif (x["Age Group"]=="15 - 19 years" or x["Age Group"]=="20 - 24 years" or x["Age Group"]=="25 - 29 years"):
        age = "16 - 29 years" 
    elif (x["Age Group"]=="30 - 34 years" or x["Age Group"]=="35 - 39 years" or x["Age Group"]=="40 - 44 years"):
        age = "30 - 44 years" 
    elif (x["Age Group"]=="45 - 49 years" or x["Age Group"]=="50 - 54 years" or x["Age Group"]=="55 - 59 years"):
        age = "45 - 59 years" 
    elif (x["Age Group"]=="60 - 64 years" or x["Age Group"]=="65 - 69 years" or x["Age Group"]=="70 - 74 years"):
        age = "60 - 74 years" 
    elif (x["Age Group"]=="80 - 84 years" or x["Age Group"]=="85 years and over"):
        age = "75 years and over" 
    else:
        age = "'All ages'"
    return age

In [10]:
#Creating new column with converted age groups in the existing DataFrame and checking if column is created
df_pop["Converted Age"] = df_pop.apply(age_conv,axis=1)
df_pop.head()

Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,Region,UNIT,VALUE,Converted Age
0,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,State,Thousand,356.0,0 - 14 years
1,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,Border,Thousand,30.7,0 - 14 years
2,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,West,Thousand,32.6,0 - 14 years
3,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,Mid-West,Thousand,35.0,0 - 14 years
4,Estimated Population (Persons in April),2011,0 - 4 years,Both sexes,South-East,Thousand,32.0,0 - 14 years


In [11]:
#quick verification that we have only new age groups
df_pop["Converted Age"].unique()

array(['0 - 14 years', '16 - 29 years', '30 - 44 years', '45 - 59 years',
       '60 - 74 years', "'All ages'", '75 years and over'], dtype=object)

In [12]:
# Boolean indexing - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
#removing unnecessary rows with irrelevant age groups
#and verifying nr of remaining rows

df_pop = df_pop.drop(df_pop[(df_pop["Converted Age"]=="'All ages'") | (df_pop["Converted Age"]=="0 - 14 years")].index)

df_pop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4914 entries, 81 to 6641
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  4914 non-null   object 
 1   Year             4914 non-null   int64  
 2   Age Group        4914 non-null   object 
 3   Sex              4914 non-null   object 
 4   Region           4914 non-null   object 
 5   UNIT             4914 non-null   object 
 6   VALUE            4914 non-null   float64
 7   Converted Age    4914 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 345.5+ KB


In [13]:
#quick check that only relevant age groups are in the DataFrame
df_pop["Converted Age"].unique()

array(['16 - 29 years', '30 - 44 years', '45 - 59 years', '60 - 74 years',
       '75 years and over'], dtype=object)

In [14]:
#removing rows that reflect data splitted by sex and leaving only data related to both sexes

df_pop = df_pop.drop(df_pop[(df_pop["Sex"] != "Both sexes")].index)

df_pop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1638 entries, 81 to 6623
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATISTIC Label  1638 non-null   object 
 1   Year             1638 non-null   int64  
 2   Age Group        1638 non-null   object 
 3   Sex              1638 non-null   object 
 4   Region           1638 non-null   object 
 5   UNIT             1638 non-null   object 
 6   VALUE            1638 non-null   float64
 7   Converted Age    1638 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 115.2+ KB


In [15]:
#quick check for left data
df_pop["Sex"].unique()

array(['Both sexes'], dtype=object)

In [16]:
df_pop["Age Group"] = df_pop["Converted Age"].values
df_pop.sample(5)


Unnamed: 0,STATISTIC Label,Year,Age Group,Sex,Region,UNIT,VALUE,Converted Age
3165,Estimated Population (Persons in April),2017,16 - 29 years,Both sexes,Dublin,Thousand,78.3,16 - 29 years
248,Estimated Population (Persons in April),2011,45 - 59 years,Both sexes,South-West,Thousand,45.5,45 - 59 years
2814,Estimated Population (Persons in April),2016,45 - 59 years,Both sexes,Dublin,Thousand,85.7,45 - 59 years
2788,Estimated Population (Persons in April),2016,30 - 44 years,Both sexes,Mid-East,Thousand,56.9,30 - 44 years
817,Estimated Population (Persons in April),2012,45 - 59 years,Both sexes,Mid-East,Thousand,34.1,45 - 59 years


In [17]:
df_pop = df_pop.rename(columns={'Age Group': 'AgeGroup'})

In [18]:
df_pop = df_pop.drop(columns=["Converted Age"])

In [24]:
#one last check
df_pop.head(8)

Unnamed: 0,STATISTIC Label,Year,AgeGroup,Sex,Region,UNIT,VALUE
81,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,State,Thousand,281.0
82,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,Border,Thousand,25.9
83,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,West,Thousand,27.9
84,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,Mid-West,Thousand,29.9
85,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,South-East,Thousand,25.9
86,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,South-West,Thousand,40.8
87,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,Dublin,Thousand,72.0
88,Estimated Population (Persons in April),2011,16 - 29 years,Both sexes,Mid-East,Thousand,40.7


In [25]:
#Export DataFrame to a csv File
df_pop.to_csv("e_population_cleared.csv", index = False)