In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Imports
import pandas as pd
import numpy as np

## Data import

In [3]:
# Load the four raw input tables (temperatures, bank holidays, electricity demand
# and population) from the Raw_Data folder, using relative paths.
df_temperatures=pd.read_csv("../../Data/Raw_Data/temperatures.csv")
df_bank_holidays=pd.read_csv("../../Data/Raw_Data/bank_holidays.csv")
df_electricity_demand=pd.read_csv("../../Data/Raw_Data/electricity_demand.csv")
df_population=pd.read_csv("../../Data/Raw_Data/population.csv")

In [4]:
# Drop the 'Unnamed: 0' column from the raw tables (presumably an index that was
# saved together with the CSV files).
# The axis is selected with the `columns=` keyword: passing it positionally, as in
# `drop(label, 1)`, is deprecated since pandas 1.3 and raises a TypeError on pandas 2.x.
df_temperatures = df_temperatures.drop(columns='Unnamed: 0')
df_population = df_population.drop(columns='Unnamed: 0')
df_bank_holidays = df_bank_holidays.drop(columns='Unnamed: 0')

## Defining region master so cities and regions can be related

In [5]:
# Start from an empty frame; the Region and City columns are added in the next cells.
df_regions=pd.DataFrame()

In [6]:
# One row per distinct region found in the population table, in order of first appearance.
df_regions['Region'] = list(df_population['Region'].unique())

In [7]:
# One reference city per region, matched to the 'Region' column purely by position.
# NOTE(review): the pairing relies on the order returned by
# df_population['Region'].unique() — check it against the table displayed below
# whenever the population file changes.
df_regions['City']=['Sevilla','Zaragoza','Oviedo','Santander','Valladolid','Albacete','Barcelona','Valencia',
                   'Badajoz','Vigo','Madrid','Murcia','Pamplona','Bilbao','Logroño']

In [8]:
# Display the region/city master table to verify the pairing visually.
df_regions

Unnamed: 0,Region,City
0,Andalucía,Sevilla
1,Aragón,Zaragoza
2,Asturias,Oviedo
3,Cantabria,Santander
4,Castilla y León,Valladolid
5,Castilla-La Mancha,Albacete
6,Cataluña,Barcelona
7,Comunidad Valenciana,Valencia
8,Extremadura,Badajoz
9,Galicia,Vigo


In [9]:
# Persist the region master; with the default settings the DataFrame index is
# written as an unnamed first column.
df_regions.to_csv("../../Data/Intermediate_Data/regions.csv")

## Calculate population percentage of each region

In [10]:
# Total population per year: the sum over all regional rows of that year.
df_county_population = (
    df_population
    .groupby('Year', as_index=False)
    .agg(Total_Population=('Population', 'sum'))
)
df_county_population

Unnamed: 0,Year,Total_Population
0,2015,43249750.0
1,2016,43177319.0
2,2017,43176933.0
3,2018,43294859.0
4,2019,43552095.0
5,2020,43932022.0
6,2021,43869377.0


In [11]:
# Attach the yearly total to every regional row (left join on Year).
df_population = df_population.merge(df_county_population, how='left', on='Year')
df_population

Unnamed: 0,Region,Year,Population,Total_Population
0,Andalucía,2021,8472407.0,43869377.0
1,Andalucía,2020,8464411.0,43932022.0
2,Andalucía,2019,8414240.0,43552095.0
3,Andalucía,2018,8384408.0,43294859.0
4,Andalucía,2017,8379820.0,43176933.0
...,...,...,...,...
100,La Rioja,2019,316798.0,43552095.0
101,La Rioja,2018,315675.0,43294859.0
102,La Rioja,2017,315381.0,43176933.0
103,La Rioja,2016,315794.0,43177319.0


In [12]:
# Share of the yearly total population that lives in each region.
df_population['Population_Ratio'] = df_population['Population'].div(df_population['Total_Population'])
df_population

Unnamed: 0,Region,Year,Population,Total_Population,Population_Ratio
0,Andalucía,2021,8472407.0,43869377.0,0.193128
1,Andalucía,2020,8464411.0,43932022.0,0.192671
2,Andalucía,2019,8414240.0,43552095.0,0.193199
3,Andalucía,2018,8384408.0,43294859.0,0.193658
4,Andalucía,2017,8379820.0,43176933.0,0.194081
...,...,...,...,...,...
100,La Rioja,2019,316798.0,43552095.0,0.007274
101,La Rioja,2018,315675.0,43294859.0,0.007291
102,La Rioja,2017,315381.0,43176933.0,0.007304
103,La Rioja,2016,315794.0,43177319.0,0.007314


## Temperature weighting by region population

In [13]:
# Tag every temperature reading with the region of its city (left join on City).
df_temperatures_region = df_temperatures.merge(df_regions, how='left', on='City')
df_temperatures_region

Unnamed: 0,Time,Date,Year,Month,Day,Hour,City,Temp,Region
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Albacete,268.948527,Castilla-La Mancha
1,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Badajoz,272.818933,Extremadura
2,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Barcelona,276.065316,Cataluña
3,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Bilbao,272.969665,País Vasco
4,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Logroño,266.218819,La Rioja
...,...,...,...,...,...,...,...,...,...
920515,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Sevilla,281.146932,Andalucía
920516,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Valencia,279.901981,Comunidad Valenciana
920517,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Valladolid,275.407475,Castilla y León
920518,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Vigo,278.176621,Galicia


In [14]:
# Bring in the population figures of the matching region and year (left join).
df_temperatures_population = df_temperatures_region.merge(
    df_population, how='left', on=['Region', 'Year']
)
df_temperatures_population

Unnamed: 0,Time,Date,Year,Month,Day,Hour,City,Temp,Region,Population,Total_Population,Population_Ratio
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Albacete,268.948527,Castilla-La Mancha,2059191.0,43249750.0,0.047612
1,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Badajoz,272.818933,Extremadura,1092997.0,43249750.0,0.025272
2,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Barcelona,276.065316,Cataluña,7508106.0,43249750.0,0.173599
3,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Bilbao,272.969665,País Vasco,2189257.0,43249750.0,0.050619
4,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Logroño,266.218819,La Rioja,317053.0,43249750.0,0.007331
...,...,...,...,...,...,...,...,...,...,...,...,...
920515,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Sevilla,281.146932,Andalucía,8472407.0,43869377.0,0.193128
920516,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Valencia,279.901981,Comunidad Valenciana,5058138.0,43869377.0,0.115300
920517,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Valladolid,275.407475,Castilla y León,2383139.0,43869377.0,0.054324
920518,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Vigo,278.176621,Galicia,2695645.0,43869377.0,0.061447


In [15]:
# Weight each city temperature by its region's share of the total population.
df_temperatures_population['Temp_Ponderation'] = (
    df_temperatures_population['Temp'].mul(df_temperatures_population['Population_Ratio'])
)

In [16]:
# Display the table with the new Temp_Ponderation column.
df_temperatures_population

Unnamed: 0,Time,Date,Year,Month,Day,Hour,City,Temp,Region,Population,Total_Population,Population_Ratio,Temp_Ponderation
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Albacete,268.948527,Castilla-La Mancha,2059191.0,43249750.0,0.047612,12.805077
1,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Badajoz,272.818933,Extremadura,1092997.0,43249750.0,0.025272,6.894613
2,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Barcelona,276.065316,Cataluña,7508106.0,43249750.0,0.173599,47.924616
3,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Bilbao,272.969665,País Vasco,2189257.0,43249750.0,0.050619,13.817438
4,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,Logroño,266.218819,La Rioja,317053.0,43249750.0,0.007331,1.951583
...,...,...,...,...,...,...,...,...,...,...,...,...,...
920515,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Sevilla,281.146932,Andalucía,8472407.0,43869377.0,0.193128,54.297357
920516,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Valencia,279.901981,Comunidad Valenciana,5058138.0,43869377.0,0.115300,32.272691
920517,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Valladolid,275.407475,Castilla y León,2383139.0,43869377.0,0.054324,14.961104
920518,2021-12-31 23:00:00,2021-12-31,2021,12,31,23,Vigo,278.176621,Galicia,2695645.0,43869377.0,0.061447,17.093140


## Including bank holiday days

In [17]:
# Display the raw bank-holiday table (one row per date and region).
df_bank_holidays

Unnamed: 0,Date,Year,Month,Day,Region
0,2015-01-01,2015,1,1,Nacional
1,2015-01-06,2015,1,6,Nacional
2,2015-02-28,2015,2,28,Andalucía
3,2015-03-19,2015,3,19,Comunidad Valenciana
4,2015-03-19,2015,3,19,Madrid
...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana
399,2021-10-12,2021,10,12,Nacional
400,2021-11-01,2021,11,1,Nacional
401,2021-12-06,2021,12,6,Nacional


In [18]:
# Add the reference city of each holiday's region (left join on Region).
df_bank_holidays_city = df_bank_holidays.merge(df_regions, how='left', on='Region')
df_bank_holidays_city

Unnamed: 0,Date,Year,Month,Day,Region,City
0,2015-01-01,2015,1,1,Nacional,
1,2015-01-06,2015,1,6,Nacional,
2,2015-02-28,2015,2,28,Andalucía,Sevilla
3,2015-03-19,2015,3,19,Comunidad Valenciana,Valencia
4,2015-03-19,2015,3,19,Madrid,Madrid
...,...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana,Valencia
399,2021-10-12,2021,10,12,Nacional,
400,2021-11-01,2021,11,1,Nacional,
401,2021-12-06,2021,12,6,Nacional,


In [19]:
# Count the missing cells left in the merged table.
df_bank_holidays_city.isnull().values.sum()

53

In [20]:
# Regions whose holidays did not receive a city from the merge.
df_bank_holidays_city.loc[df_bank_holidays_city['City'].isnull(), 'Region'].unique()

array(['Nacional'], dtype=object)

In [21]:
# Holidays without a matching city (only 'Nacional', per the check above) get 'Nacional' as city label.
df_bank_holidays_city['City']=df_bank_holidays_city['City'].fillna('Nacional')

In [22]:
def national_holiday(row):
    """Return 1 when the given label equals 'Nacional', otherwise 0.

    Despite its name, ``row`` is a single value (not a DataFrame row) that is
    compared against the string 'Nacional'.
    """
    return 1 if row == 'Nacional' else 0

In [23]:
# Flag nationwide holidays: 1 when the region is 'Nacional', 0 otherwise.
df_bank_holidays_city['Country_Bank_Holiday'] = df_bank_holidays_city['Region'].apply(national_holiday)

In [24]:
# Display the holiday table with the new Country_Bank_Holiday flag.
df_bank_holidays_city

Unnamed: 0,Date,Year,Month,Day,Region,City,Country_Bank_Holiday
0,2015-01-01,2015,1,1,Nacional,Nacional,1
1,2015-01-06,2015,1,6,Nacional,Nacional,1
2,2015-02-28,2015,2,28,Andalucía,Sevilla,0
3,2015-03-19,2015,3,19,Comunidad Valenciana,Valencia,0
4,2015-03-19,2015,3,19,Madrid,Madrid,0
...,...,...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana,Valencia,0
399,2021-10-12,2021,10,12,Nacional,Nacional,1
400,2021-11-01,2021,11,1,Nacional,Nacional,1
401,2021-12-06,2021,12,6,Nacional,Nacional,1


In [25]:
# Attach the population figures of each holiday's region and year (left join).
df_bank_holidays_population = df_bank_holidays_city.merge(
    df_population, how='left', on=['Region', 'Year']
)
df_bank_holidays_population

Unnamed: 0,Date,Year,Month,Day,Region,City,Country_Bank_Holiday,Population,Total_Population,Population_Ratio
0,2015-01-01,2015,1,1,Nacional,Nacional,1,,,
1,2015-01-06,2015,1,6,Nacional,Nacional,1,,,
2,2015-02-28,2015,2,28,Andalucía,Sevilla,0,8399043.0,43249750.0,0.194199
3,2015-03-19,2015,3,19,Comunidad Valenciana,Valencia,0,4980689.0,43249750.0,0.115161
4,2015-03-19,2015,3,19,Madrid,Madrid,0,6436996.0,43249750.0,0.148833
...,...,...,...,...,...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana,Valencia,0,5058138.0,43869377.0,0.115300
399,2021-10-12,2021,10,12,Nacional,Nacional,1,,,
400,2021-11-01,2021,11,1,Nacional,Nacional,1,,,
401,2021-12-06,2021,12,6,Nacional,Nacional,1,,,


In [26]:
# Cities whose rows did not receive a population ratio from the merge.
df_bank_holidays_population.loc[
    df_bank_holidays_population['Population_Ratio'].isnull(), 'City'
].unique()

array(['Nacional'], dtype=object)

In [27]:
# Rows without a population match (only 'Nacional', per the check above) get a ratio of 1.
df_bank_holidays_population['Population_Ratio']=df_bank_holidays_population['Population_Ratio'].fillna(1)

In [28]:
# Display the table after filling the missing population ratios.
df_bank_holidays_population

Unnamed: 0,Date,Year,Month,Day,Region,City,Country_Bank_Holiday,Population,Total_Population,Population_Ratio
0,2015-01-01,2015,1,1,Nacional,Nacional,1,,,1.000000
1,2015-01-06,2015,1,6,Nacional,Nacional,1,,,1.000000
2,2015-02-28,2015,2,28,Andalucía,Sevilla,0,8399043.0,43249750.0,0.194199
3,2015-03-19,2015,3,19,Comunidad Valenciana,Valencia,0,4980689.0,43249750.0,0.115161
4,2015-03-19,2015,3,19,Madrid,Madrid,0,6436996.0,43249750.0,0.148833
...,...,...,...,...,...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana,Valencia,0,5058138.0,43869377.0,0.115300
399,2021-10-12,2021,10,12,Nacional,Nacional,1,,,1.000000
400,2021-11-01,2021,11,1,Nacional,Nacional,1,,,1.000000
401,2021-12-06,2021,12,6,Nacional,Nacional,1,,,1.000000


In [29]:
# Only the ratio is needed from here on, so the absolute population figures are dropped.
# The axis is selected with the `columns=` keyword: the positional form
# `drop(labels, 1)` is deprecated since pandas 1.3 and raises a TypeError on pandas 2.x.
df_bank_holidays_population = df_bank_holidays_population.drop(columns=['Population', 'Total_Population'])
df_bank_holidays_population

Unnamed: 0,Date,Year,Month,Day,Region,City,Country_Bank_Holiday,Population_Ratio
0,2015-01-01,2015,1,1,Nacional,Nacional,1,1.000000
1,2015-01-06,2015,1,6,Nacional,Nacional,1,1.000000
2,2015-02-28,2015,2,28,Andalucía,Sevilla,0,0.194199
3,2015-03-19,2015,3,19,Comunidad Valenciana,Valencia,0,0.115161
4,2015-03-19,2015,3,19,Madrid,Madrid,0,0.148833
...,...,...,...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana,Valencia,0,0.115300
399,2021-10-12,2021,10,12,Nacional,Nacional,1,1.000000
400,2021-11-01,2021,11,1,Nacional,Nacional,1,1.000000
401,2021-12-06,2021,12,6,Nacional,Nacional,1,1.000000


In [30]:
# Flag regional (non-national) holidays: 1 when the city label is not 'Nacional', else 0.
df_bank_holidays_population['Partial_Bank_Holiday'] = np.where(
    df_bank_holidays_population['City'].ne('Nacional'), 1, 0
)

In [31]:
# Weight of a regional holiday = population share of the region; national rows get 0
# because their Partial_Bank_Holiday flag is 0.
df_bank_holidays_population['Partial_Bank_Holiday_Weight'] = (
    df_bank_holidays_population['Partial_Bank_Holiday']
    .mul(df_bank_holidays_population['Population_Ratio'])
)
df_bank_holidays_population

Unnamed: 0,Date,Year,Month,Day,Region,City,Country_Bank_Holiday,Population_Ratio,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight
0,2015-01-01,2015,1,1,Nacional,Nacional,1,1.000000,0,0.000000
1,2015-01-06,2015,1,6,Nacional,Nacional,1,1.000000,0,0.000000
2,2015-02-28,2015,2,28,Andalucía,Sevilla,0,0.194199,1,0.194199
3,2015-03-19,2015,3,19,Comunidad Valenciana,Valencia,0,0.115161,1,0.115161
4,2015-03-19,2015,3,19,Madrid,Madrid,0,0.148833,1,0.148833
...,...,...,...,...,...,...,...,...,...,...
398,2021-10-09,2021,10,9,Comunidad Valenciana,Valencia,0,0.115300,1,0.115300
399,2021-10-12,2021,10,12,Nacional,Nacional,1,1.000000,0,0.000000
400,2021-11-01,2021,11,1,Nacional,Nacional,1,1.000000,0,0.000000
401,2021-12-06,2021,12,6,Nacional,Nacional,1,1.000000,0,0.000000


In [340]:
# Collapse to one row per calendar day: both flags are averaged over that day's
# entries and the regional population shares are added up.
df_bank_holidays_agg = (
    df_bank_holidays_population
    .groupby(['Date', 'Year', 'Month', 'Day'], as_index=False)
    .agg(
        Country_Bank_Holiday=('Country_Bank_Holiday', 'mean'),
        Partial_Bank_Holiday=('Partial_Bank_Holiday', 'mean'),
        Partial_Bank_Holiday_Weight=('Partial_Bank_Holiday_Weight', 'sum'),
    )
)

In [341]:
# Display the daily holiday aggregation.
df_bank_holidays_agg

Unnamed: 0,Date,Year,Month,Day,Country_Bank_Holiday,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight
0,2015-01-01,2015,1,1,1.0,0.0,0.000000
1,2015-01-06,2015,1,6,1.0,0.0,0.000000
2,2015-02-28,2015,2,28,0.0,1.0,0.194199
3,2015-03-19,2015,3,19,0.0,1.0,0.363348
4,2015-03-20,2015,3,20,0.0,1.0,0.063176
...,...,...,...,...,...,...,...
161,2021-10-09,2021,10,9,0.0,1.0,0.115300
162,2021-10-12,2021,10,12,1.0,0.0,0.000000
163,2021-11-01,2021,11,1,1.0,0.0,0.000000
164,2021-12-06,2021,12,6,1.0,0.0,0.000000


In [342]:
# Persist the daily holiday aggregation as an intermediate file.
df_bank_holidays_agg.to_csv("../../Data/Intermediate_Data/bank_holidays_agg.csv")

## Temperature weighting by region population: aggregation by country

In [343]:
# Country temperature
# Sum the population-weighted city temperatures of each hour to obtain one
# value per timestamp for the whole country.
df_county_temp=df_temperatures_population.groupby(['Time','Date','Year','Month','Day','Hour'], 
                                           as_index=False).agg({'Temp_Ponderation':'sum'})
df_county_temp

Unnamed: 0,Time,Date,Year,Month,Day,Hour,Temp_Ponderation
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,272.368163
1,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,272.047456
2,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,271.796548
3,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,271.602937
4,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,271.459464
...,...,...,...,...,...,...,...
61363,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,281.005748
61364,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,280.474065
61365,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,279.770309
61366,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,279.171545


## Check if we have all years, months, days and hours

In [344]:
# Months present in each year of the temperature series
for year in range(2015, 2022):
    rows_of_year = df_county_temp['Year'] == year
    print(df_county_temp.loc[rows_of_year, 'Month'].unique())

[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [345]:
# Distinct numbers of hourly rows per (year, month). A complete series only shows
# 24 x 28/29/30/31 = 672/696/720/744.
# The same groupby used to be evaluated twice; the first result was discarded because
# only the last expression of a cell is displayed, so the duplicate was removed.
df_county_temp.groupby(['Year','Month'],as_index=False).agg({'Temp_Ponderation':'count'})['Temp_Ponderation'].unique()

array([744, 672, 720, 696])

In [346]:
# Distinct numbers of hourly rows per day (a complete series shows only 24)
df_county_temp.groupby('Date')['Temp_Ponderation'].count().unique()


array([24])

In [347]:
# Persist the hourly country-level weighted temperature as an intermediate file.
df_county_temp.to_csv("../../Data/Intermediate_Data/county_temp.csv")

## Including bank holidays

In [348]:
# Attach the daily holiday indicators to every hourly row; days that are not
# holidays get NaN from the left join.
df_county_temp_holidays = df_county_temp.merge(
    df_bank_holidays_agg, how='left', on=['Date', 'Year', 'Month', 'Day']
)

In [349]:
# Days without a holiday entry came out of the left join as NaN; they are not holidays, so use 0.
for holiday_col in ('Country_Bank_Holiday', 'Partial_Bank_Holiday', 'Partial_Bank_Holiday_Weight'):
    df_county_temp_holidays[holiday_col] = df_county_temp_holidays[holiday_col].fillna(0)

In [350]:
# Display the hourly temperature table with the holiday indicators filled in.
df_county_temp_holidays

Unnamed: 0,Time,Date,Year,Month,Day,Hour,Temp_Ponderation,Country_Bank_Holiday,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,272.368163,1.0,0.0,0.0
1,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,272.047456,1.0,0.0,0.0
2,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,271.796548,1.0,0.0,0.0
3,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,271.602937,1.0,0.0,0.0
4,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,271.459464,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
61363,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,281.005748,0.0,0.0,0.0
61364,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,280.474065,0.0,0.0,0.0
61365,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,279.770309,0.0,0.0,0.0
61366,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,279.171545,0.0,0.0,0.0


In [351]:
# Persist the hourly temperature + holiday table as an intermediate file.
df_county_temp_holidays.to_csv("../../Data/Intermediate_Data/county_temp_holidays.csv")

## Check demand data

In [352]:
# Display the raw electricity demand table.
df_electricity_demand

Unnamed: 0.1,Unnamed: 0,utcDateTime,Time,Date,Year,Month,Day,Hour,DemandaElect_ES_MWh
0,336,01/01/2015 0:00,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,245115
1,337,01/01/2015 1:00,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,228661667
2,338,01/01/2015 2:00,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,213928333
3,339,01/01/2015 3:00,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,203196667
4,340,01/01/2015 4:00,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,19923
...,...,...,...,...,...,...,...,...,...
61363,61699,31/12/2021 19:00,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,276531667
61364,61700,31/12/2021 20:00,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,267465
61365,61701,31/12/2021 21:00,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,239526667
61366,61702,31/12/2021 22:00,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,223248333


In [353]:
# Months present in each year of the demand series
for year in range(2015, 2022):
    rows_of_year = df_electricity_demand['Year'] == year
    print(df_electricity_demand.loc[rows_of_year, 'Month'].unique())

[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]
[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [354]:
# Number of hourly demand records per (year, month)
df_electricity_demand.groupby(['Year', 'Month'], as_index=False)['DemandaElect_ES_MWh'].count()

Unnamed: 0,Year,Month,DemandaElect_ES_MWh
0,2015,1,744
1,2015,2,672
2,2015,3,744
3,2015,4,720
4,2015,5,744
...,...,...,...
79,2021,8,744
80,2021,9,720
81,2021,10,744
82,2021,11,720


In [355]:
# Distinct numbers of hourly demand records per (year, month)
df_electricity_demand.groupby(['Year', 'Month'])['DemandaElect_ES_MWh'].count().unique()

array([744, 672, 720, 696])

In [356]:
# Distinct numbers of hourly demand records per day (a complete series shows only 24)
df_electricity_demand.groupby('Date')['DemandaElect_ES_MWh'].count().unique()

array([24])

## Merging demand and temperature

In [357]:
# Display the temperature/holiday table again, as the right-hand side of the merge below.
df_county_temp_holidays

Unnamed: 0,Time,Date,Year,Month,Day,Hour,Temp_Ponderation,Country_Bank_Holiday,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,272.368163,1.0,0.0,0.0
1,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,272.047456,1.0,0.0,0.0
2,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,271.796548,1.0,0.0,0.0
3,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,271.602937,1.0,0.0,0.0
4,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,271.459464,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
61363,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,281.005748,0.0,0.0,0.0
61364,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,280.474065,0.0,0.0,0.0
61365,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,279.770309,0.0,0.0,0.0
61366,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,279.171545,0.0,0.0,0.0


In [358]:
# Display the demand table again, as the left-hand side of the merge below.
df_electricity_demand

Unnamed: 0.1,Unnamed: 0,utcDateTime,Time,Date,Year,Month,Day,Hour,DemandaElect_ES_MWh
0,336,01/01/2015 0:00,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,245115
1,337,01/01/2015 1:00,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,228661667
2,338,01/01/2015 2:00,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,213928333
3,339,01/01/2015 3:00,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,203196667
4,340,01/01/2015 4:00,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,19923
...,...,...,...,...,...,...,...,...,...
61363,61699,31/12/2021 19:00,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,276531667
61364,61700,31/12/2021 20:00,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,267465
61365,61701,31/12/2021 21:00,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,239526667
61366,61702,31/12/2021 22:00,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,223248333


In [359]:
# Join demand with temperature/holiday data on the full timestamp key; the inner
# join keeps only the hours present in both tables.
df_county_temp_demand=pd.merge(df_electricity_demand,df_county_temp_holidays,
                              on=['Time','Date','Year','Month','Day','Hour'], how='inner')
# Remove the helper columns carried over from the raw demand file. The axis is
# selected with the `columns=` keyword: the positional form `drop(labels, 1)` is
# deprecated since pandas 1.3 and raises a TypeError on pandas 2.x.
df_county_temp_demand = df_county_temp_demand.drop(columns=['Unnamed: 0','utcDateTime'])

In [360]:
# Display the merged demand/temperature/holiday table.
df_county_temp_demand

Unnamed: 0,Time,Date,Year,Month,Day,Hour,DemandaElect_ES_MWh,Temp_Ponderation,Country_Bank_Holiday,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,245115,272.368163,1.0,0.0,0.0
1,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,228661667,272.047456,1.0,0.0,0.0
2,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,213928333,271.796548,1.0,0.0,0.0
3,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,203196667,271.602937,1.0,0.0,0.0
4,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,19923,271.459464,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
61363,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,276531667,281.005748,0.0,0.0,0.0
61364,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,267465,280.474065,0.0,0.0,0.0
61365,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,239526667,279.770309,0.0,0.0,0.0
61366,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,223248333,279.171545,0.0,0.0,0.0


In [365]:
# Total population per year (sum over regions), keeping the column name 'Population'.
df_country_population = (
    df_population
    .groupby('Year', as_index=False)
    .agg(Population=('Population', 'sum'))
)
df_country_population

Unnamed: 0,Year,Population
0,2015,43249750.0
1,2016,43177319.0
2,2017,43176933.0
3,2018,43294859.0
4,2019,43552095.0
5,2020,43932022.0
6,2021,43869377.0


In [375]:
# Add the yearly country population to every hourly row (left join on Year).
# The result is bound to the name df_electricity_demand, replacing the raw demand table.
df_electricity_demand = df_county_temp_demand.merge(
    df_country_population, how='left', on='Year'
)
df_electricity_demand

Unnamed: 0,Time,Date,Year,Month,Day,Hour,DemandaElect_ES_MWh,Temp_Ponderation,Country_Bank_Holiday,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight,Population
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,245115,272.368163,1.0,0.0,0.0,43249750.0
1,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,228661667,272.047456,1.0,0.0,0.0,43249750.0
2,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,213928333,271.796548,1.0,0.0,0.0,43249750.0
3,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,203196667,271.602937,1.0,0.0,0.0,43249750.0
4,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,19923,271.459464,1.0,0.0,0.0,43249750.0
...,...,...,...,...,...,...,...,...,...,...,...,...
61363,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,276531667,281.005748,0.0,0.0,0.0,43869377.0
61364,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,267465,280.474065,0.0,0.0,0.0,43869377.0
61365,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,239526667,279.770309,0.0,0.0,0.0,43869377.0
61366,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,223248333,279.171545,0.0,0.0,0.0,43869377.0


In [376]:
# Check data types and non-null counts of the merged table.
df_electricity_demand.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61368 entries, 0 to 61367
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Time                         61368 non-null  object 
 1   Date                         61368 non-null  object 
 2   Year                         61368 non-null  int64  
 3   Month                        61368 non-null  int64  
 4   Day                          61368 non-null  int64  
 5   Hour                         61368 non-null  int64  
 6   DemandaElect_ES_MWh          61368 non-null  object 
 7   Temp_Ponderation             61368 non-null  float64
 8   Country_Bank_Holiday         61368 non-null  float64
 9   Partial_Bank_Holiday         61368 non-null  float64
 10  Partial_Bank_Holiday_Weight  61368 non-null  float64
 11  Population                   61368 non-null  float64
dtypes: float64(5), int64(4), object(3)
memory usage: 6.1+ MB


In [377]:
# The demand column is read as text with a decimal comma; turn it into floats.
# Casting to str first keeps this cell re-runnable (the `.str` accessor fails once the
# column is already numeric), and regex=False makes the comma a literal replacement.
df_electricity_demand['DemandaElect_ES_MWh'] = (
    df_electricity_demand['DemandaElect_ES_MWh']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

In [380]:
# Rename the demand and weighted-temperature columns to Demand_MWh and Temp_K.
df_electricity_demand = df_electricity_demand.rename(
    columns={'DemandaElect_ES_MWh': 'Demand_MWh', 'Temp_Ponderation': 'Temp_K'}
)

In [381]:
# Display the final table with numeric demand and renamed columns.
df_electricity_demand

Unnamed: 0,Time,Date,Year,Month,Day,Hour,Demand_MWh,Temp_K,Country_Bank_Holiday,Partial_Bank_Holiday,Partial_Bank_Holiday_Weight,Population
0,2015-01-01 00:00:00,2015-01-01,2015,1,1,0,24511.5000,272.368163,1.0,0.0,0.0,43249750.0
1,2015-01-01 01:00:00,2015-01-01,2015,1,1,1,22866.1667,272.047456,1.0,0.0,0.0,43249750.0
2,2015-01-01 02:00:00,2015-01-01,2015,1,1,2,21392.8333,271.796548,1.0,0.0,0.0,43249750.0
3,2015-01-01 03:00:00,2015-01-01,2015,1,1,3,20319.6667,271.602937,1.0,0.0,0.0,43249750.0
4,2015-01-01 04:00:00,2015-01-01,2015,1,1,4,19923.0000,271.459464,1.0,0.0,0.0,43249750.0
...,...,...,...,...,...,...,...,...,...,...,...,...
61363,2021-12-31 19:00:00,2021-12-31,2021,12,31,19,27653.1667,281.005748,0.0,0.0,0.0,43869377.0
61364,2021-12-31 20:00:00,2021-12-31,2021,12,31,20,26746.5000,280.474065,0.0,0.0,0.0,43869377.0
61365,2021-12-31 21:00:00,2021-12-31,2021,12,31,21,23952.6667,279.770309,0.0,0.0,0.0,43869377.0
61366,2021-12-31 22:00:00,2021-12-31,2021,12,31,22,22324.8333,279.171545,0.0,0.0,0.0,43869377.0


In [382]:
# Persist the final merged table (demand, weighted temperature, holidays, population).
df_electricity_demand.to_csv("../../Data/Intermediate_Data/electricity_demand.csv")