# Step 5.  Aggregation of all cleaned and processed data into a resulting dataset. 
This dataset will be the basis for solving the tasks.

In [1]:
##IMPORTING LIBRARIES
import pandas as pd
import statistics as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Silence all Python warnings from this point on (global filter).
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) — confirm that is intended.
import warnings
warnings.simplefilter('ignore')

In [19]:
# Load the three cleaned CSV files: population estimates, household size, internet access
df_pop, df_f, df_ica = (
    pd.read_csv(csv_name)
    for csv_name in ("PEA04_c.csv", "F2002_c.csv", "ICA_c.csv")
)
# Quick look at the population table
df_pop.head()

Unnamed: 0,Year,Region,Estimated Population nr
0,2011,Border,784000.0
1,2011,Dublin,2523000.0
2,2011,Ireland,9149700.0
3,2011,Mid-East,1314900.0
4,2011,Mid-West,933600.0


In [20]:
# (rows, columns) of the population table
df_pop.shape

(117, 3)

In [21]:
# List the distinct region labels in the population table (9 are expected)
df_pop["Region"].unique()

array(['Border', 'Dublin', 'Ireland', 'Mid-East', 'Mid-West', 'Midlands',
       'South-East', 'South-West', 'West'], dtype=object)

In [22]:
# Preview the first rows of the household-size table
df_f.head()

Unnamed: 0,Year,Region,Average Nr of Persons Household
0,2011,Border,2.728
1,2011,Midlands,2.792
2,2011,Mid-East,2.917
3,2011,South-East,2.748
4,2011,South-West,2.69


In [23]:
# (rows, columns) of the household-size table
df_f.shape

(108, 3)

In [24]:
# List the distinct region labels in the household-size table (9 are expected)
df_f["Region"].unique()

array(['Border', 'Midlands', 'Mid-East', 'South-East', 'South-West',
       'Dublin', 'West', 'Ireland', 'Mid-West'], dtype=object)

In [25]:
# Preview the first rows of the internet-access table
df_ica.head()

Unnamed: 0,Year,Region,% Households with Internet Access
0,2007,Border,43.0
1,2007,Mid-West,58.0
2,2007,Midlands,54.0
3,2007,Mid-East,66.0
4,2007,South-East,49.0


In [26]:
# (rows, columns) of the internet-access table
df_ica.shape

(144, 3)

In [27]:
# List the distinct region labels in the internet-access table (9 are expected)
df_ica["Region"].unique()

array(['Border', 'Mid-West', 'Midlands', 'Mid-East', 'South-East',
       'Ireland', 'South-West', 'Dublin', 'West'], dtype=object)

In [28]:
# Left-join the average number of persons per household onto the estimated
# population table, matching rows on year and region
df_agg1 = pd.merge(df_pop, df_f, how='left', on=['Year', 'Region'])
df_agg1.head()

Unnamed: 0,Year,Region,Estimated Population nr,Average Nr of Persons Household
0,2011,Border,784000.0,2.728
1,2011,Dublin,2523000.0,2.65
2,2011,Ireland,9149700.0,2.73
3,2011,Mid-East,1314900.0,2.917
4,2011,Mid-West,933600.0,2.703


In [29]:
# Left-join the % of households with internet access onto the aggregate,
# again matching rows on year and region
df_agg1 = pd.merge(df_agg1, df_ica, how='left', on=['Year', 'Region'])
df_agg1.head(10)

Unnamed: 0,Year,Region,Estimated Population nr,Average Nr of Persons Household,% Households with Internet Access
0,2011,Border,784000.0,2.728,66.0
1,2011,Dublin,2523000.0,2.65,84.0
2,2011,Ireland,9149700.0,2.73,78.0
3,2011,Mid-East,1314900.0,2.917,87.0
4,2011,Mid-West,933600.0,2.703,77.0
5,2011,Midlands,567600.0,2.792,79.0
6,2011,South-East,820600.0,2.748,71.0
7,2011,South-West,1324600.0,2.69,76.0
8,2011,West,881600.0,2.7,79.0
9,2012,West,878700.0,2.694,82.0


##### Calculating the number of people with Internet access using the following formulas

Nr of Households= Estimated Population nr/ Average Nr of Persons Household

Nr of Households with Internet Access = Nr of Households * % Households with Internet Access / 100

Nr of Persons with Internet Access = Nr of Households with Internet Access * Average Nr of Persons Household

% of Persons with Internet Access = Nr of Persons with Internet Access/ Estimated Population nr*100


In [44]:
# Households = estimated population divided by average household size
df_agg1['Nr of Households'] = df_agg1['Estimated Population nr'].div(
    df_agg1['Average Nr of Persons Household']
)

In [45]:
# Households with access = households scaled by the access percentage
df_agg1['Nr of Households with Internet Access'] = (
    df_agg1['Nr of Households']
    .mul(df_agg1['% Households with Internet Access'])
    .div(100)
)

In [46]:
# Persons with access = households with access times average household size
df_agg1['Nr of Persons with Internet Access'] = (
    df_agg1['Nr of Households with Internet Access']
    .mul(df_agg1['Average Nr of Persons Household'])
)

In [47]:
# Share of the population with access, expressed as a percentage.
# NOTE(review): with the formulas used here the household size cancels out, so
# this equals '% Households with Internet Access' up to rounding — confirm intended.
df_agg1['% of Persons with Internet Access'] = (
    df_agg1['Nr of Persons with Internet Access']
    .div(df_agg1['Estimated Population nr'])
    .mul(100)
)

In [48]:
# Preview the aggregate with the newly derived columns
df_agg1.head()

Unnamed: 0,Year,Region,Estimated Population nr,Average Nr of Persons Household,% Households with Internet Access,Nr of Households,Nr of Households with Internet Access,Nr of Persons with Internet Access,% of Persons with Internet Access
0,2011,Border,784000.0,2.728,66.0,287390.0,189677.4,517440.0,66.0
1,2011,Dublin,2523000.0,2.65,84.0,952075.5,799743.4,2119320.0,84.0
2,2011,Ireland,9149700.0,2.73,78.0,3351538.0,2614200.0,7136766.0,78.0
3,2011,Mid-East,1314900.0,2.917,87.0,450771.3,392171.1,1143963.0,87.0
4,2011,Mid-West,933600.0,2.703,77.0,345394.0,265953.4,718872.0,77.0


In [49]:
# Inspect dtypes and non-null counts of the aggregate
df_agg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 9 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Year                                   108 non-null    int64  
 1   Region                                 108 non-null    object 
 2   Estimated Population nr                108 non-null    float64
 3   Average Nr of Persons Household        108 non-null    float64
 4   % Households with Internet Access      108 non-null    float64
 5   Nr of Households                       108 non-null    float64
 6   Nr of Households with Internet Access  108 non-null    float64
 7   Nr of Persons with Internet Access     108 non-null    float64
 8   % of Persons with Internet Access      108 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 7.7+ KB


In [35]:
# Discard every row that has at least one missing value
df_agg1 = df_agg1.dropna(axis=0, how='any')

In [36]:
# (rows, columns) of the aggregate after dropping incomplete rows
df_agg1.shape

(108, 9)

In [50]:
# Round to 2 decimals for a readable view. Rounding (instead of formatting each
# value into a string) keeps the column numeric, so it can still be used in
# arithmetic, sorting and aggregation.
df_agg1['Nr of Households'] = df_agg1['Nr of Households'].round(2)

In [51]:
# Round to 2 decimals for a readable view. Rounding (instead of formatting each
# value into a string) keeps the column numeric, so it can still be used in
# arithmetic, sorting and aggregation.
df_agg1['Nr of Households with Internet Access'] = df_agg1['Nr of Households with Internet Access'].round(2)

In [67]:
# The existing column is '% Households with Internet Access'. DataFrame.rename
# silently ignores keys that do not match any column, so the mapping must use
# the exact current name for the rename to take effect.
df_agg1 = df_agg1.rename(columns={'% Households with Internet Access': '% of Households with Internet Access'})

In [52]:
# Preview the first ten rows of the aggregate
df_agg1.head(10)

Unnamed: 0,Year,Region,Estimated Population nr,Average Nr of Persons Household,% Households with Internet Access,Nr of Households,Nr of Households with Internet Access,Nr of Persons with Internet Access,% of Persons with Internet Access
0,2011,Border,784000.0,2.728,66.0,287390.03,189677.42,517440.0,66.0
1,2011,Dublin,2523000.0,2.65,84.0,952075.47,799743.4,2119320.0,84.0
2,2011,Ireland,9149700.0,2.73,78.0,3351538.46,2614200.0,7136766.0,78.0
3,2011,Mid-East,1314900.0,2.917,87.0,450771.34,392171.07,1143963.0,87.0
4,2011,Mid-West,933600.0,2.703,77.0,345394.01,265953.39,718872.0,77.0
5,2011,Midlands,567600.0,2.792,79.0,203295.13,160603.15,448404.0,79.0
6,2011,South-East,820600.0,2.748,71.0,298617.18,212018.2,582626.0,71.0
7,2011,South-West,1324600.0,2.69,76.0,492416.36,374236.43,1006696.0,76.0
8,2011,West,881600.0,2.7,79.0,326518.52,257949.63,696464.0,79.0
9,2012,Border,784200.0,2.724,75.0,287885.46,215914.1,588150.0,75.0


In [53]:
# Order the aggregated frame 'df_agg1' by year, then region (ascending),
# and renumber the index from 0 without keeping the old one
df_agg1 = (
    df_agg1
    .sort_values(by=['Year', "Region"], ascending=True)
    .reset_index(drop=True)
)

In [54]:
# Keep only the columns that make up the final dataset. Label-based selection
# raises a KeyError if a column is missing, whereas pd.DataFrame(df, columns=...)
# would silently add it as an all-NaN column. .copy() makes df_final independent
# of df_agg1.
df_final = df_agg1[['Year', "Region", 'Estimated Population nr',
                    'Nr of Persons with Internet Access',
                    '% of Persons with Internet Access']].copy()

In [55]:
# Inspect dtypes and non-null counts of the final frame
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                108 non-null    int64  
 1   Region                              108 non-null    object 
 2   Estimated Population nr             108 non-null    float64
 3   Nr of Persons with Internet Access  108 non-null    float64
 4   % of Persons with Internet Access   108 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 4.3+ KB


In [56]:
# Write the final dataset to disk; the row index is not written to the file
df_final.to_csv(path_or_buf="FINAL.csv", index=False)