In [38]:
##IMPORTING LIBRARIES
import pandas as pd
import statistics as stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress the warnings
import warnings
warnings.filterwarnings('ignore')

In [39]:
# Load the first source table (ICA29.csv) and preview its first rows.
df_ica1 = pd.read_csv("ICA29.csv")
df_ica1.head()

Unnamed: 0,Region,Year,Statistic Label,UNIT,VALUE
0,State,2007,Households with Computer,%,65.0
1,State,2007,Households with Computer Unweighted Sample,Number,4361.0
2,State,2007,Households with Computer connected to the Inte...,%,57.0
3,State,2007,Households with Computer connected to the Inte...,Number,4361.0
4,State,2008,Households with Computer,%,70.0


In [40]:
# Dimensions of the ICA29 table as (rows, columns).
df_ica1.shape

(360, 5)

In [41]:
# Distinct years present in the ICA29 table.
df_ica1.Year.unique()

array([2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016],
      dtype=int64)

In [42]:
# Load the second source table (ICA172.csv) and preview its first rows.
df_ica2 = pd.read_csv("ICA172.csv")
df_ica2.head()

Unnamed: 0,STATISTIC Label,Year,NUTS 3 Region,UNIT,VALUE
0,Households with internet access,2022,State,%,94
1,Households with internet access,2022,Border,%,90
2,Households with internet access,2022,Midland,%,93
3,Households with internet access,2022,West,%,91
4,Households with internet access,2022,Dublin,%,97


In [43]:
# Dimensions of the ICA172 table as (rows, columns).
df_ica2.shape

(27, 5)

In [44]:
# Distinct years present in the ICA172 table.
df_ica2.Year.unique()

array([2022, 2021, 2020], dtype=int64)

In [45]:
# Arrange the columns as label, year, region, unit, value
# (the same ordering the ICA172 table uses).
df_ica1 = df_ica1.loc[:, ['Statistic Label', 'Year', 'Region', 'UNIT', 'VALUE']]
df_ica1.head()

Unnamed: 0,Statistic Label,Year,Region,UNIT,VALUE
0,Households with Computer,2007,State,%,65.0
1,Households with Computer Unweighted Sample,2007,State,Number,4361.0
2,Households with Computer connected to the Inte...,2007,State,%,57.0
3,Households with Computer connected to the Inte...,2007,State,Number,4361.0
4,Households with Computer,2008,State,%,70.0


In [46]:
# Preview the re-ordered table (repeats the preview shown by the previous cell).
df_ica1.head()

Unnamed: 0,Statistic Label,Year,Region,UNIT,VALUE
0,Households with Computer,2007,State,%,65.0
1,Households with Computer Unweighted Sample,2007,State,Number,4361.0
2,Households with Computer connected to the Inte...,2007,State,%,57.0
3,Households with Computer connected to the Inte...,2007,State,Number,4361.0
4,Households with Computer,2008,State,%,70.0


In [47]:
# Keep only the "Households with Computer connected to the Internet" rows;
# every other statistic in this table is discarded. A direct boolean filter
# is used instead of dropping the index labels of the non-matching rows, so
# the result does not depend on the index labels being unique.
df_ica1 = df_ica1.loc[
    df_ica1["Statistic Label"] == "Households with Computer connected to the Internet"
]


In [48]:
# Dimensions of the ICA29 table after filtering to a single statistic.
df_ica1.shape

(90, 5)

In [49]:
# Preview the ICA172 table again before renaming its columns.
df_ica2.head()

Unnamed: 0,STATISTIC Label,Year,NUTS 3 Region,UNIT,VALUE
0,Households with internet access,2022,State,%,94
1,Households with internet access,2022,Border,%,90
2,Households with internet access,2022,Midland,%,93
3,Households with internet access,2022,West,%,91
4,Households with internet access,2022,Dublin,%,97


In [50]:
# Align the ICA172 column names with those used by df_ica1 so the two
# tables can be stacked.
df_ica2 = df_ica2.rename(
    columns={'STATISTIC Label': 'Statistic Label', 'NUTS 3 Region': 'Region'}
)

In [51]:
# Stack the two tables row-wise with a fresh 0..n-1 index, then summarise
# the combined frame (dtypes and non-null counts).
df_ica = pd.concat([df_ica1, df_ica2], ignore_index=True)
df_ica.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  117 non-null    object 
 1   Year             117 non-null    int64  
 2   Region           117 non-null    object 
 3   UNIT             117 non-null    object 
 4   VALUE            117 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.7+ KB


In [52]:
# Order the rows of 'df_ica' by 'Year' (ascending) and renumber the index from 0
df_ica = df_ica.sort_values(by=['Year'], ascending=True).reset_index(drop=True)

In [53]:
# Dimensions of the combined table as (rows, columns).
df_ica.shape

(117, 5)

In [54]:
# Distinct years present in the combined table.
df_ica.Year.unique()

array([2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2020,
       2021, 2022], dtype=int64)

In [55]:
# Save the merged table to ICA_m.csv without writing the index column.
df_ica.to_csv("ICA_m.csv", index = False)

# Interpolating missing values (NAs) for the years 2017-2019

In [56]:
# Reload the merged table from the ICA_m.csv file.
df_ica = pd.read_csv("ICA_m.csv")

In [57]:
# Collect the distinct region names in order of first appearance.
# Series.unique() keeps that order, so the list is the same on every run
# (iterating a set of strings can change order between interpreter sessions).
unique_region_names = df_ica['Region'].unique().tolist()

# Set form of the same names, kept under its original variable name
region_names = set(unique_region_names)

In [58]:
# Years to add placeholder rows for
years_to_interpolate = [2017, 2018, 2019]

# One placeholder row per (year, region) pair, laid out as
# [Statistic Label, Year, Region, UNIT, VALUE]; everything except the
# year and the region is NaN for now.
data = [
    [np.nan, year, region, np.nan, np.nan]
    for year in years_to_interpolate
    for region in unique_region_names
]

In [59]:
# Wrap the placeholder rows in a DataFrame with the five named columns.
df = pd.DataFrame(data, columns=['Statistic Label', 'Year', 'Region', 'UNIT', 'VALUE'])

In [60]:
# Append the placeholder rows to df_ica and renumber the index.
df_ica = pd.concat([df_ica, df], ignore_index=True)

In [61]:
# Preview the first rows of the extended table.
df_ica.head()

Unnamed: 0,Statistic Label,Year,Region,UNIT,VALUE
0,Households with Computer connected to the Inte...,2007,State,%,57.0
1,Households with Computer connected to the Inte...,2007,Dublin,%,66.0
2,Households with Computer connected to the Inte...,2007,Midland,%,54.0
3,Households with Computer connected to the Inte...,2007,South-West,%,56.0
4,Households with Computer connected to the Inte...,2007,Border,%,43.0


In [62]:
# Check non-null counts: the placeholder rows show up as missing values.
df_ica.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  117 non-null    object 
 1   Year             144 non-null    int64  
 2   Region           144 non-null    object 
 3   UNIT             117 non-null    object 
 4   VALUE            117 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 5.8+ KB


In [63]:
# Fill the text columns of the placeholder rows. The label is spelled with
# single spaces so it matches the 'Households with internet access' label
# already present in the ICA172 rows. VALUE is left as NaN here.
df_ica = df_ica.fillna(
    value={'Statistic Label': 'Households with internet access', 'UNIT': '%'}
)

In [64]:
# Confirm the text columns are complete; only VALUE should still have gaps.
df_ica.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Statistic Label  144 non-null    object 
 1   Year             144 non-null    int64  
 2   Region           144 non-null    object 
 3   UNIT             144 non-null    object 
 4   VALUE            117 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 5.8+ KB


In [65]:
# Put the rows of 'df_ica' (now including the placeholder years) in ascending
# 'Year' order and renumber the index from 0
df_ica = df_ica.sort_values(by=['Year'], ascending=True).reset_index(drop=True)

In [66]:
# Order rows by region and then year so that each region's values form a
# chronological series and the resulting row order is deterministic.
df_sorted = df_ica.sort_values(by=["Region", "Year"]).reset_index(drop=True)

# Group the DataFrame by region
grouped_dataframe = df_sorted.groupby("Region")

# Linearly interpolate the missing values within each region. Only the
# numeric VALUE column is passed to interpolate(); the text columns and the
# grouping column are left untouched. Interpolation is positional, i.e. it
# treats consecutive rows of a region as equally spaced.
interpolated_dataframe = df_sorted.assign(
    VALUE=grouped_dataframe["VALUE"].transform(
        lambda values: values.interpolate(method="linear")
    )
)


In [67]:
# Order the interpolated rows by 'Year' (ascending), store them back in
# 'df_ica', and renumber the index from 0
df_ica = interpolated_dataframe.sort_values(by=['Year'], ascending=True).reset_index(drop=True)

In [68]:
# Preview the first rows of the interpolated, year-sorted table.
df_ica.head()

Unnamed: 0,Statistic Label,Year,Region,UNIT,VALUE
0,Households with Computer connected to the Inte...,2007,Border,%,43.0
1,Households with Computer connected to the Inte...,2007,Midland,%,54.0
2,Households with Computer connected to the Inte...,2007,South-East,%,49.0
3,Households with Computer connected to the Inte...,2007,Mid-West,%,58.0
4,Households with Computer connected to the Inte...,2007,South-West,%,56.0


In [69]:
# In column 'Region', replace the value 'State' with 'Ireland'.
# The replacement is limited to 'Region' so a matching value in any other
# column is never altered.
df_ica = df_ica.assign(Region=df_ica['Region'].replace('State', 'Ireland'))

In [70]:
# In column 'Region', replace the value 'Midland' with 'Midlands'.
# The replacement is limited to 'Region' so a matching value in any other
# column is never altered.
df_ica = df_ica.assign(Region=df_ica['Region'].replace('Midland', 'Midlands'))

In [71]:
# Give the VALUE column a descriptive name
df_ica = df_ica.rename(columns={'VALUE': 'Households with Internet Access'})

In [72]:
# Keep just the three columns needed for the output file. Selecting with
# [[...]] raises a KeyError if a column is missing, instead of silently
# producing an all-NaN column.
df = df_ica[['Year', 'Region', 'Households with Internet Access']].copy()

In [73]:
# Save the cleaned table to ICA_c.csv without writing the index column.
df.to_csv("ICA_c.csv", index = False)