# Scrape

In [1]:
!pip install pandas matplotlib numpy openpyxl

Collecting pandas
  Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl (11.3 MB)
Collecting matplotlib
  Using cached matplotlib-3.10.7-cp311-cp311-win_amd64.whl (8.1 MB)
Collecting numpy
  Using cached numpy-2.3.4-cp311-cp311-win_amd64.whl (13.1 MB)
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Collecting pytz>=2020.1
  Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Collecting tzdata>=2022.7
  Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.3-cp311-cp311-win_amd64.whl (225 kB)
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.60.1-cp311-cp311-win_amd64.whl (2.3 MB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.9-cp311-cp311-win_amd64.whl (73 kB)
Collecting pillow>=8
  Using cached pillow-12.0.0-cp311-cp311-win_amd64.whl (7.0 MB)
Collecting pyparsing>=3
  Using cached pyparsing-3.2.


[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
import pandas as pd

def parse_custom_date(date_str):
    """Parse one settlement-date value into a pandas Timestamp.

    Three explicit layouts are recognised:

    * ``YYYY-MM-DD``  (e.g. ``2001-01-01``)
    * ``DD-MMM-YYYY`` (e.g. ``01-JAN-2019``)
    * ``DD-Mmm-YY``   (e.g. ``01-Jan-23``)

    Parameters
    ----------
    date_str : object
        Raw cell value. It is converted with ``str()`` and stripped before
        parsing, so non-string inputs are accepted.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``pd.NaT`` when the value is blank, a
        missing-value marker, or cannot be parsed.
    """
    text = str(date_str).strip()

    # str() turns None / NaN / NaT / pd.NA into these literal strings, so the
    # missing-value check has to compare against text rather than the objects.
    if text in ('', 'nan', 'NaN', 'None', 'NaT', '<NA>'):
        return pd.NaT

    known_formats = ('%Y-%m-%d', '%d-%b-%Y', '%d-%b-%y')
    # The string length is only a hint for which layout to try first: a date
    # without zero padding (e.g. "1-JAN-2019", 10 characters) has the "wrong"
    # length for its layout and must not be rejected on that basis alone.
    preferred_by_length = {10: '%Y-%m-%d', 11: '%d-%b-%Y', 9: '%d-%b-%y'}

    preferred = preferred_by_length.get(len(text)) if '-' in text else None
    if preferred is not None:
        ordered = [preferred] + [fmt for fmt in known_formats if fmt != preferred]
        for fmt in ordered:
            try:
                return pd.to_datetime(text, format=fmt)
            except (ValueError, TypeError, OverflowError):
                continue
        # None of the known layouts matched; report it as missing rather than
        # guessing an ambiguous day/month order.
        return pd.NaT

    # Anything else is handed to pandas' generic parser, best effort.
    try:
        return pd.to_datetime(text, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        return pd.NaT


In [43]:
import os

path = './demanddata/'

# os.listdir() makes no ordering guarantee; sort the file names so the
# concatenated row order (and any order-sensitive step applied to df
# afterwards, such as a backward fill) is reproducible across machines.
files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not files:
    # pd.concat([]) would otherwise fail with an unhelpful
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No .csv files found in {path!r}")

dfs = []

for file in files:
    temp_df = pd.read_csv(os.path.join(path, file))
    # Parse dates per file so each file's own date layout is handled
    # before the frames are combined.
    temp_df['SETTLEMENT_DATE'] = temp_df['SETTLEMENT_DATE'].apply(parse_custom_date)
    dfs.append(temp_df)

# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

print(f"Total rows combined: {len(df)}")
print(f"Missing dates after parsing: {df['SETTLEMENT_DATE'].isna().sum()}")

print(len(df))


Total rows combined: 434014
Missing dates after parsing: 0
434014


In [46]:
# Preview the first rows of the combined frame to sanity-check the load.
df.head()

Unnamed: 0,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,...,IFA_FLOW,IFA2_FLOW,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,VIKING_FLOW,GREENLINK_FLOW
0,2001-01-01,1,38631,,34060,,,,,0,...,1495,,,,,,,,,
1,2001-01-01,2,39808,,35370,,,,,0,...,1496,,,,,,,,,
2,2001-01-01,3,40039,,35680,,,,,0,...,1511,,,,,,,,,
3,2001-01-01,4,39339,,35029,,,,,0,...,1958,,,,,,,,,
4,2001-01-01,5,38295,,34047,,,,,0,...,1996,,,,,,,,,


In [47]:
# Normalise the headers: lower-case every name and turn spaces into underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

df.head()

Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa_flow,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow
0,2001-01-01,1,38631,,34060,,,,,0,...,1495,,,,,,,,,
1,2001-01-01,2,39808,,35370,,,,,0,...,1496,,,,,,,,,
2,2001-01-01,3,40039,,35680,,,,,0,...,1511,,,,,,,,,
3,2001-01-01,4,39339,,35029,,,,,0,...,1958,,,,,,,,,
4,2001-01-01,5,38295,,34047,,,,,0,...,1996,,,,,,,,,


In [48]:
# Count missing values per column to see which fields need filling.
print(df.isna().sum())

settlement_date                   0
settlement_period                 0
nd                                0
tsd                           70128
england_wales_demand              0
embedded_wind_generation     105168
embedded_wind_capacity       105168
embedded_solar_generation    140256
embedded_solar_capacity      140256
non_bm_stor                       0
pump_storage_pumping              0
scottish_transfer            385680
ifa_flow                          0
ifa2_flow                    140256
britned_flow                 140256
moyle_flow                    70128
east_west_flow               140256
nemo_flow                    140256
nsl_flow                     315552
eleclink_flow                315552
viking_flow                  315552
greenlink_flow               315552
dtype: int64


In [49]:
# Number of distinct values in each column.
df.nunique()

settlement_date               9042
settlement_period               50
nd                           39473
tsd                          36960
england_wales_demand         35730
embedded_wind_generation      5634
embedded_wind_capacity         186
embedded_solar_generation     9979
embedded_solar_capacity       3373
non_bm_stor                    583
pump_storage_pumping          1981
scottish_transfer             7813
ifa_flow                      4111
ifa2_flow                     2035
britned_flow                  2126
moyle_flow                     892
east_west_flow                1058
nemo_flow                     2046
nsl_flow                      2715
eleclink_flow                 2029
viking_flow                   2576
greenlink_flow                 966
dtype: int64

In [15]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434014 entries, 0 to 434013
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   settlement_date            434014 non-null  object 
 1   settlement_period          434014 non-null  int64  
 2   nd                         434014 non-null  int64  
 3   tsd                        363886 non-null  float64
 4   england_wales_demand       434014 non-null  int64  
 5   embedded_wind_generation   328846 non-null  float64
 6   embedded_wind_capacity     328846 non-null  float64
 7   embedded_solar_generation  293758 non-null  float64
 8   embedded_solar_capacity    293758 non-null  float64
 9   non_bm_stor                434014 non-null  int64  
 10  pump_storage_pumping       434014 non-null  int64  
 11  scottish_transfer          48334 non-null   float64
 12  ifa_flow                   434014 non-null  int64  
 13  ifa2_flow                  29

In [55]:
# Make sure settlement_date holds datetime values
df['settlement_date'] = pd.to_datetime(df['settlement_date'])

# Generation, capacity and flow columns whose missing entries are replaced by 0
cols_fill_zero = [
    'embedded_wind_generation',
    'embedded_wind_capacity',
    'embedded_solar_generation',
    'embedded_solar_capacity',
    'ifa2_flow',
    'britned_flow',
    'east_west_flow',
    'nemo_flow',
    'nsl_flow',
    'eleclink_flow',
    'viking_flow',
    'greenlink_flow',
    'moyle_flow',
]
for column in cols_fill_zero:
    df[column] = df[column].fillna(0)

In [58]:
# Backward-fill gaps in tsd. Series.bfill() replaces fillna(method='bfill'),
# which pandas has deprecated (it emits a FutureWarning).
# NOTE(review): a backward fill copies the next valid value into every
# preceding gap. The earlier missing-value count shows 70128 missing tsd rows;
# if they form one contiguous run they all receive the same value — confirm
# this is the intended imputation.
df['tsd'] = df['tsd'].bfill()

# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=['scottish_transfer'], errors='ignore')


  df['tsd'] = df['tsd'].fillna(method='bfill')


In [59]:
# Re-check the per-column missing counts after the fill and drop steps.
print(df.isna().sum())

settlement_date              0
settlement_period            0
nd                           0
tsd                          0
england_wales_demand         0
embedded_wind_generation     0
embedded_wind_capacity       0
embedded_solar_generation    0
embedded_solar_capacity      0
non_bm_stor                  0
pump_storage_pumping         0
ifa_flow                     0
ifa2_flow                    0
britned_flow                 0
moyle_flow                   0
east_west_flow               0
nemo_flow                    0
nsl_flow                     0
eleclink_flow                0
viking_flow                  0
greenlink_flow               0
dtype: int64


In [60]:
# Inspect the dtype of each column.
df.dtypes

settlement_date              datetime64[ns]
settlement_period                     int64
nd                                    int64
tsd                                 float64
england_wales_demand                  int64
embedded_wind_generation            float64
embedded_wind_capacity              float64
embedded_solar_generation           float64
embedded_solar_capacity             float64
non_bm_stor                           int64
pump_storage_pumping                  int64
ifa_flow                              int64
ifa2_flow                           float64
britned_flow                        float64
moyle_flow                          float64
east_west_flow                      float64
nemo_flow                           float64
nsl_flow                            float64
eleclink_flow                       float64
viking_flow                         float64
greenlink_flow                      float64
dtype: object