
### Data Collection and Preprocessing
  - Collect historical energy consumption, pricing, weather, and population data.
  - Preprocess the data, handle missing values, and engineer features.

In [45]:
## Import Libraries 

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
import io 
import os

In [46]:
## Download the csv files fro the web 

# url of the csv files

url = 'http://reports.ieso.ca/public/Demand/'

# List of the csv files

files_to_download = ['PUB_Demand.csv', 'PUB_Demand_2002.csv', 'PUB_Demand_2002_v1.csv', 'PUB_Demand_2003.csv']

# Create a directory to store the downloaded files
os.makedirs('data', exist_ok=True)

# Download each CSV file and save it in the "data" directory
for filename in files_to_download:
    file_url = url + filename
    response = requests.get(file_url)
    with open(os.path.join('data', filename), 'wb') as f:
        f.write(response.content)
        print(f"Downloaded {filename}")



Downloaded PUB_Demand.csv
Downloaded PUB_Demand_2002.csv
Downloaded PUB_Demand_2002_v1.csv
Downloaded PUB_Demand_2003.csv


In [42]:
# import pandas as pd
# import os

# # Function to clean and read CSV files into a DataFrame
# def read_csv(filename):
#     # Read the CSV file into a DataFrame, skipping the first row
#     df = pd.read_csv(filename, skiprows=[1])
#     # Set column names from the second row
#     df.columns = df.iloc[0]
#     # Drop the second row
#     df = df.drop(index=1).reset_index(drop=True)
#     return df


# df = read_csv('data/PUB_Demand.csv')


In [68]:
# df = pd.read_csv('data/PUB_Demand.csv')
# df.head()


def read_csv(filename):
    # Read the CSV file into a DataFrame, skipping the first three rows
    df = pd.read_csv(filename, skiprows=[0, 1])
    # Set column names from the fourth row
    df.columns = df.iloc[0]
    # Drop the fourth row
    df = df.drop(index=0).reset_index(drop=True)
    return df


df = read_csv('data/PUB_Demand.csv')
df.head()

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
0,2024-01-01,1,17091,14482
1,2024-01-01,2,16658,14180
2,2024-01-01,3,16233,13722
3,2024-01-01,4,15909,13637
4,2024-01-01,5,15998,13697


In [69]:
df2 = read_csv('data/PUB_Demand_2002.csv')
df2.head()


Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
0,2002-05-01,1,14141,14137
1,2002-05-01,2,13876,13872
2,2002-05-01,3,13974,13820
3,2002-05-01,4,13898,13744
4,2002-05-01,5,14378,14224


In [96]:
## Combine all csv files into one dataframe

def combine_csvs(directory):
# List to store DataFrames for each CSV file
    dfs = []

    # Iterate over each CSV file in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            # Clean and read the CSV file into a DataFrame
            df = read_csv(file_path)
            dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)

    # Sort by the 'Date' column
    combined_df = combined_df.sort_values('Date')
    return combined_df


combined_df = combine_csvs('data')
combined_df.head()

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
15672,2002-05-01,1,14141,14137
1052,2002-05-01,21,17906,17902
1051,2002-05-01,20,17903,17899
1050,2002-05-01,19,17231,17227
1049,2002-05-01,18,17292,17288


In [73]:
combined_df.tail()

Unnamed: 0,Date,Hour,Market Demand,Ontario Demand
1010,2024-02-12,3,15253,13979
1009,2024-02-12,2,15275,14159
1008,2024-02-12,1,15798,14747
1020,2024-02-12,13,18638,17038
1015,2024-02-12,8,19056,17474


In [75]:
hours = combined_df['Hour'].unique()
hours

array(['1', '21', '20', '19', '18', '17', '16', '15', '14', '13', '12',
       '22', '11', '9', '8', '7', '6', '5', '4', '3', '2', '10', '23',
       '24'], dtype=object)

In [77]:
Date_column = combined_df['Date'].unique()
Date_column

array(['2002-05-01', '2002-05-02', '2002-05-03', '2002-05-04',
       '2002-05-05', '2002-05-06', '2002-05-07', '2002-05-08',
       '2002-05-09', '2002-05-10', '2002-05-11', '2002-05-12',
       '2002-05-13', '2002-05-14', '2002-05-15', '2002-05-16',
       '2002-05-17', '2002-05-18', '2002-05-19', '2002-05-20',
       '2002-05-21', '2002-05-22', '2002-05-23', '2002-05-24',
       '2002-05-25', '2002-05-26', '2002-05-27', '2002-05-28',
       '2002-05-29', '2002-05-30', '2002-05-31', '2002-06-01',
       '2002-06-02', '2002-06-03', '2002-06-04', '2002-06-05',
       '2002-06-06', '2002-06-07', '2002-06-08', '2002-06-09',
       '2002-06-10', '2002-06-11', '2002-06-12', '2002-06-13',
       '2002-06-14', '2002-06-15', '2002-06-16', '2002-06-17',
       '2002-06-18', '2002-06-19', '2002-06-20', '2002-06-21',
       '2002-06-22', '2002-06-23', '2002-06-24', '2002-06-25',
       '2002-06-26', '2002-06-27', '2002-06-28', '2002-06-29',
       '2002-06-30', '2002-07-01', '2002-07-02', '2002-

In [97]:
# Sort out the date column (combine date and hour)


# Convert 'Hour' to 24-hour format as integer and subtract 1
combined_df['Hour'] = (combined_df['Hour'].astype(int) - 1) % 24  # Ensure hour values are within 0-23 range

# Combine 'Date' and 'Hour' strings
combined_df['DateTime'] = pd.to_datetime(combined_df['Date'] + ' ' + combined_df['Hour'].astype(str) + ':00:00')

# Drop 'Date' and 'Hour' columns
combined_df.drop(columns=['Date', 'Hour'], inplace=True)

# Sort DataFrame by the 'DateTime' column
combined_df.sort_values(by='DateTime', inplace=True)

# Reset index
combined_df.reset_index(drop=True, inplace=True)

# Display the DataFrame head
combined_df.head()




Unnamed: 0,Market Demand,Ontario Demand,DateTime
0,14141,14137,2002-05-01 00:00:00
1,14141,14137,2002-05-01 00:00:00
2,13876,13872,2002-05-01 01:00:00
3,13876,13872,2002-05-01 01:00:00
4,13974,13820,2002-05-01 02:00:00


In [94]:
combined_df.tail()

Unnamed: 0,Date,Market Demand,Ontario Demand
21547,2024-02-12 02:00:00,15253,13979
21548,2024-02-12 01:00:00,15275,14159
21549,2024-02-12 00:00:00,15798,14747
21550,2024-02-12 12:00:00,18638,17038
21551,2024-02-12 07:00:00,19056,17474


In [99]:
df=pd.read_csv('data/PUB_Demand_2002_v1.csv')
df.head(50)

Unnamed: 0,\\Hourly Demand Report,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,\\Created at 2018-05-22 08:00:00,,,
1,\\For 2002,,,
2,Date,Hour,Market Demand,Ontario Demand
3,2002-05-01,1,14141,14137
4,2002-05-01,2,13876,13872
5,2002-05-01,3,13974,13820
6,2002-05-01,4,13898,13744
7,2002-05-01,5,14378,14224
8,2002-05-01,6,15408,15404
9,2002-05-01,7,17070,17066


In [100]:
df=pd.read_csv('data/PUB_Demand_2002.csv')
df.head(50)

Unnamed: 0,\\Hourly Demand Report,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,\\Created at 2018-05-22 08:00:00,,,
1,\\For 2002,,,
2,Date,Hour,Market Demand,Ontario Demand
3,2002-05-01,1,14141,14137
4,2002-05-01,2,13876,13872
5,2002-05-01,3,13974,13820
6,2002-05-01,4,13898,13744
7,2002-05-01,5,14378,14224
8,2002-05-01,6,15408,15404
9,2002-05-01,7,17070,17066
