In [1]:
##IMPORTING LIBRARIES
import pandas as pd
import statistics as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress the warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_pop = pd.read_csv("F2002.csv")
df_pop.head()

Unnamed: 0,Statistic Label,CensusYear,County,UNIT,VALUE
0,Average Number of Persons per Private Househol...,1971,Co. Clare,Number,3.81
1,Average Number of Persons per Private Househol...,1971,Co. Cork,Number,3.93
2,Average Number of Persons per Private Househol...,1971,Co. Cavan,Number,3.64
3,Average Number of Persons per Private Househol...,1971,Co. Carlow,Number,4.17
4,Average Number of Persons per Private Househol...,1971,Co. Donegal,Number,3.9


In [3]:
df_reg = pd.read_csv("Regions.csv")

In [4]:
#Using 'ffill' method in the column with Regions to fill NAs with the last valid value 
df_reg["Name of region "] = df_reg["Name of region "].fillna(method='ffill')
df_reg.sample(8)

Unnamed: 0,Name of region,Constituent counties,Type of area
30,South-East,Waterford,City
7,Dublin,Dublin,City
34,South-West,Cork,City
20,Midlands,,
17,Midlands,Longford,Administrative county
2,Border,Leitrim,Administrative county
6,Border,,
10,Dublin,South Dublin,Administrative county


In [5]:
df_reg = df_reg.dropna()

In [6]:
# North Tiperrary and South Tiperrary contain additional space at the end, just remove it
df_reg["Constituent counties"] = df_reg["Constituent counties"].str.replace(f'Tipperary ', 'Tipperary')

In [7]:
#final check
df_reg["Constituent counties"].unique()

array(['Cavan', 'Donegal', 'Leitrim', 'Louth', 'Monaghan', 'Sligo',
       'Dublin', 'Dún Laoghaire-Rathdown', 'Fingal', 'South Dublin',
       'Kildare', 'Meath', 'Wicklow', 'Laois', 'Longford', 'Offaly',
       'Westmeath', 'Clare', 'Limerick', 'North Tipperary', 'Tipperary',
       'Carlow', 'Kilkenny', 'South Tipperary', 'Waterford', 'Wexford',
       'Cork', 'Kerry', 'Galway', 'Mayo', 'Roscommon'], dtype=object)

In [8]:
df_pop = pd.read_csv("F2002.csv")

In [9]:
df_pop['County'] = df_pop['County'].str.replace('.', '')
df_pop['County'] = df_pop['County'].str.replace('Co ', '')
df_pop.County.unique()

array(['Clare', 'Cork', 'Cavan', 'Carlow', 'Donegal', 'Dublin', 'Galway',
       'Kildare', 'Kilkenny', 'Kerry', 'Longford', 'Louth', 'Limerick',
       'Leitrim', 'Laois', 'Meath', 'Monaghan', 'Mayo', 'Offaly',
       'Roscommon', 'Sligo', 'Tipperary', 'Waterford', 'Wicklow',
       'Westmeath', 'Wexford', 'Ireland'], dtype=object)

In [10]:
#column Region  will be created in the dataFrame

#Function is working with 2 dataframes as arguments:
#for eah element from column "County" in the df_it will check if there is according name in 
#"Constituent counties" column from df_reg
#and if so value from column "Name of region" will be taken for new column

def create_region_column(df_it, df_reg):
    df_it['Region'] = df_it['County'].apply(
        lambda x: df_reg.loc[df_reg['Constituent counties'] == x, 'Name of region '].iloc[0] 
        if x in df_reg['Constituent counties'].to_list() else x)
    return df_it

In [11]:
#applying fuction to current dataframe and checking if new column is created with correct values
df_pop = create_region_column(df_pop, df_reg)

In [12]:
#replacing the counties with regions and checking 
df_pop["County"] = df_pop["Region"]

In [13]:
df_pop = df_pop.rename(columns={'County': 'Name of Region'})
df_pop = df_pop.drop(columns=["Region"])
df_pop["Name of Region"].unique()

array(['Mid-West', 'South-West', 'Border', 'South-East', 'Dublin', 'West',
       'Mid-East', 'Midlands', 'Ireland'], dtype=object)

In [14]:
# Group the DataFrame by the 'Statistic Label', 'CensusYear', 'Name of Region', and 'UNIT' columns
df_grouped = df_pop.groupby(['Statistic Label','CensusYear', 'Name of Region','UNIT'])

# Calculate the mean of the 'VALUE' column for each group
df_agg = df_grouped['VALUE'].mean()

In [15]:
df_agg.head()

Statistic Label                                                             CensusYear  Name of Region  UNIT  
Average Number of Persons per Private Household in Permanent Housing Units  1971        Border          Number    3.726667
                                                                                        Dublin          Number    3.940000
                                                                                        Ireland         Number    3.940000
                                                                                        Mid-East        Number    4.126667
                                                                                        Mid-West        Number    3.923333
Name: VALUE, dtype: float64

In [16]:
df_agg = df_agg.reset_index()
df_agg.head()

Unnamed: 0,Statistic Label,CensusYear,Name of Region,UNIT,VALUE
0,Average Number of Persons per Private Househol...,1971,Border,Number,3.726667
1,Average Number of Persons per Private Househol...,1971,Dublin,Number,3.94
2,Average Number of Persons per Private Househol...,1971,Ireland,Number,3.94
3,Average Number of Persons per Private Househol...,1971,Mid-East,Number,4.126667
4,Average Number of Persons per Private Househol...,1971,Mid-West,Number,3.923333


In [17]:
df_agg.to_csv("F2002_m.csv", index = False)

# second part 
#### interpolating NAs

In [18]:
df_pop = pd.read_csv("F2002_m.csv")

In [19]:
# Create a set to store the unique names of the regions
region_names = set()

# Iterate over the data frame and add each region name to the set
for region in df_pop['Name of Region']:
    region_names.add(region)

# Convert the set to a list
unique_region_names = list(region_names)

In [20]:
unique_region_names

['Midlands',
 'Mid-West',
 'South-West',
 'Border',
 'South-East',
 'Dublin',
 'Ireland',
 'Mid-East',
 'West']

In [21]:
df_pop.CensusYear.unique()

array([1971, 1981, 1991, 2002, 2006, 2011, 2016, 2022], dtype=int64)

In [22]:
df_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  72 non-null     object 
 1   CensusYear       72 non-null     int64  
 2   Name of Region   72 non-null     object 
 3   UNIT             72 non-null     object 
 4   VALUE            72 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 2.9+ KB


In [23]:
years_to_interpolate = [2007, 2008, 2009, 2010, 2012, 2013, 2014, 2015, 2017, 2018, 2019, 2020, 2021]

# Create a list of lists, where each sublist contains the values for a single row in the new data frame
data = []
for year in years_to_interpolate:
    for region in unique_region_names:
        row = [np.nan, year, region, np.nan, np.nan]
        data.append(row)

In [24]:
df = pd.DataFrame(data, columns=['Statistic Label', 'CensusYear', 'Name of Region', 'UNIT', 'VALUE'])

In [25]:
df.head()

Unnamed: 0,Statistic Label,CensusYear,Name of Region,UNIT,VALUE
0,,2007,Midlands,,
1,,2007,Mid-West,,
2,,2007,South-West,,
3,,2007,Border,,
4,,2007,South-East,,


In [26]:
df_pop = pd.concat([df_pop, df], ignore_index=True)

In [27]:
df_pop.head()

Unnamed: 0,Statistic Label,CensusYear,Name of Region,UNIT,VALUE
0,Average Number of Persons per Private Househol...,1971,Border,Number,3.726667
1,Average Number of Persons per Private Househol...,1971,Dublin,Number,3.94
2,Average Number of Persons per Private Househol...,1971,Ireland,Number,3.94
3,Average Number of Persons per Private Househol...,1971,Mid-East,Number,4.126667
4,Average Number of Persons per Private Househol...,1971,Mid-West,Number,3.923333


In [28]:
df_pop['Statistic Label'] = df_pop['Statistic Label'].fillna('Average Number of Persons per Private Household in Permanent Housing Units')
df_pop['UNIT'] = df_pop['UNIT'].fillna('Number')

In [29]:
# Sort the DataFrame 'df_pop' by 'year' in ascending order
df_pop = df_pop.sort_values(by=['CensusYear'], ascending=True)

# Reset the index of the DataFrame 'df_pop'
df_pop = df_pop.reset_index(drop=True)

In [30]:
# Group the DataFrame by region
grouped_dataframe = df_pop.groupby("Name of Region")

# Interpolate the missing values in each region
interpolated_dataframe = grouped_dataframe.apply(lambda group: group.interpolate(method="linear", axis=0))

# Ungroup the DataFrame
interpolated_dataframe = interpolated_dataframe.reset_index(drop=True)


In [31]:
# Sort the DataFrame 'df_pop' by 'year' in ascending order
df_pop = interpolated_dataframe.sort_values(by=['CensusYear'], ascending=True)

# Reset the index of the DataFrame 'df_pop'
df_pop = df_pop.reset_index(drop=True)

In [32]:
df_pop.head()

Unnamed: 0,Statistic Label,CensusYear,Name of Region,UNIT,VALUE
0,Average Number of Persons per Private Househol...,1971,Border,Number,3.726667
1,Average Number of Persons per Private Househol...,1971,South-West,Number,3.91
2,Average Number of Persons per Private Househol...,1971,Ireland,Number,3.94
3,Average Number of Persons per Private Househol...,1971,South-East,Number,4.0475
4,Average Number of Persons per Private Househol...,1971,Mid-East,Number,4.126667


In [33]:
df_pop['VALUE'] = df_pop['VALUE'].apply(lambda x: '{:.3f}'.format(x))

In [34]:
df_pop = df_pop.rename(columns={'VALUE': 'Average Nr of Persons Household','CensusYear': 'Year','Name of Region': 'Region'})

In [35]:
df_pop = pd.DataFrame(df_pop, columns=['Year', 'Region', 'Average Nr of Persons Household'])

In [36]:
df_pop.head()

Unnamed: 0,Year,Region,Average Nr of Persons Household
0,1971,Border,3.727
1,1971,South-West,3.91
2,1971,Ireland,3.94
3,1971,South-East,4.047
4,1971,Mid-East,4.127


In [37]:
df_pop.to_csv("F2002_c.csv", index = False)