In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium 
import openpyxl
from datetime import datetime

In [278]:
# Define the sheet names in a list so that we can easily iterate through the excel sheets (tabs). 
# Each sheet is a year between 2002 and 2023

sheet_names = []
for n in range(2015, 2024, 1):
    sheet_names.append(str(n))

In [279]:
# Creating a dictionary of dataframes, one dataframe per excel sheet. 
# this will allow us to tailor our data cleaning to the individual sheet since the format of the sheet changes over the years.

DataFrame_dict = {}


for sheet in sheet_names:
    DataFrame_dict["sheet{0}".format(sheet)] = pd.read_excel('DOE_Electric_Disturbance_Events.xlsx', engine='openpyxl', header=1, sheet_name=sheet)

In [280]:
# Drop month column because it is redundant and also drop alert criteria column, we will not be analysing this column this time. 

for sheet in list(range(2015,2024,1)):
    column_titles = DataFrame_dict["sheet{0}".format(sheet)].columns.tolist()
    if "Month" in column_titles:
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(["Month"], axis=1)
    if "Event Month" in column_titles:
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(["Event Month"], axis=1)
    if "Event Year" in column_titles:
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(["Event Year"], axis=1)
    if "Alert Criteria" in column_titles:
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(["Alert Criteria"], axis=1)

In [281]:
#Let us make sure the first column contains the date only. initially some cells had date and time values which resulted in an error when I tried to merge columns later on

def date_only(start_year, stop_year, name_of_date_col):
    for sheet in list(range(start_year, stop_year+1, 1)):
        DataFrame_dict["sheet{0}".format(sheet)][name_of_date_col] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)][name_of_date_col]).dt.date
        

In [282]:
date_only(2015, 2023, "Date Event Began")

In [283]:
DataFrame_dict["sheet2023"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected
0,2023-01-01,00:06:00,01/01/2023,03:57:00,California: Riverside County;,WECC,Vandalism,0,0
1,2023-01-01,09:55:00,01/01/2023,10:30:00,California: Sacramento County;,WECC,System Operations,0,0
2,2023-01-01,14:47:00,01/01/2023,16:50:00,Florida: Citrus County;,SERC,Vandalism,19,3509
3,2023-01-02,16:12:00,01/03/2023,09:30:00,Texas: Dallas County;,TRE,Vandalism,0,0
4,2023-01-02,03:16:00,01/02/2023,16:46:00,New Mexico:,WECC,Vandalism,432,0
...,...,...,...,...,...,...,...,...,...
162,2023-06-29,15:42:00,Unknown,Unknown,Indiana,RF,- Weather or natural disaster,Unknown,140000
163,2023-06-28,11:01:00,2023-06-28 00:00:00,11:03:00,Georgia,SERC,- Vandalism - Theft,0,0
164,2023-06-22,02:30:00,2023-06-22 00:00:00,02:38:00,Louisiana,SERC,- Transmission equipment failure,17,6242
165,2023-06-20,17:48:00,2023-06-20 00:00:00,20:52:00,Louisiana,SERC,- Transmission equipment failure,0,0


In [242]:
# There was an issue with converting the Date of Restoration column to date only

In [284]:
for sheet in list(range(2015,2024,1)):
    datetime_col = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'], errors='coerce')
    print(sheet, datetime_col.isnull().sum())

    
# We have data issues in sheets 2015, 2016, 2017. The issue is that these sheets have many rows of NaNs at the very bottom of the spreadsheet that we need to remove

2015 48
2016 50
2017 41
2018 0
2019 0
2020 0
2021 0
2022 0
2023 0


In [285]:
# instead of dropping rows that have NaN, how about we keep rows that ar not NaN:
# df = df[df['EPS'].notna()]

for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)][DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'].notna()]

In [287]:
DataFrame_dict["sheet2016"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 11
0,2016-01-04,05:15:00,2016-01-05 00:00:00,08:00:00,Wisconsin: Milwaukee County;,MRO,Sabotage,0,0,
1,2016-01-10,20:46:00,2016-01-11 00:00:00,05:25:00,Maine: Connecticut: Massachusetts: Vermont: Ne...,NPCC,Weather,Unknown,59859,
2,2016-01-11,20:16:00,2016-01-11 00:00:00,23:00:00,Pennsylvania: Chester County;,RFC,Vandalism,0,0,
3,2016-01-14,08:27:00,2016-01-14 00:00:00,12:00:00,Delaware:,RFC,Vandalism,0,0,
4,2016-01-17,12:00:00,2016-01-17 00:00:00,13:00:00,Utah: Salt Lake County;,WECC,Sabotage,0,0,
...,...,...,...,...,...,...,...,...,...,...
136,2016-12-16,07:45:00,12/16/2016,08:45:00,Oregon: Multnomah County;,WECC,Vandalism,0,0,
137,2016-12-26,04:00:00,12/26/2016,06:00:00,Washington:,WECC,Vandalism,0,0,
138,2016-12-28,04:03:00,12/31/2016,06:00:00,California:,WECC,Fuel Supply Deficiency,0,0,
139,2016-12-30,08:55:00,12/30/2016,08:56:00,Vermont: Chittenden County;,NPCC,Other,0,0,


In [288]:
# keep rows that are not "Unknown"

for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)][DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'].str.strip(" ") !=  "Unknown"]

In [289]:
DataFrame_dict["sheet2016"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 11
0,2016-01-04,05:15:00,2016-01-05 00:00:00,08:00:00,Wisconsin: Milwaukee County;,MRO,Sabotage,0,0,
1,2016-01-10,20:46:00,2016-01-11 00:00:00,05:25:00,Maine: Connecticut: Massachusetts: Vermont: Ne...,NPCC,Weather,Unknown,59859,
2,2016-01-11,20:16:00,2016-01-11 00:00:00,23:00:00,Pennsylvania: Chester County;,RFC,Vandalism,0,0,
3,2016-01-14,08:27:00,2016-01-14 00:00:00,12:00:00,Delaware:,RFC,Vandalism,0,0,
4,2016-01-17,12:00:00,2016-01-17 00:00:00,13:00:00,Utah: Salt Lake County;,WECC,Sabotage,0,0,
...,...,...,...,...,...,...,...,...,...,...
136,2016-12-16,07:45:00,12/16/2016,08:45:00,Oregon: Multnomah County;,WECC,Vandalism,0,0,
137,2016-12-26,04:00:00,12/26/2016,06:00:00,Washington:,WECC,Vandalism,0,0,
138,2016-12-28,04:03:00,12/31/2016,06:00:00,California:,WECC,Fuel Supply Deficiency,0,0,
139,2016-12-30,08:55:00,12/30/2016,08:56:00,Vermont: Chittenden County;,NPCC,Other,0,0,


In [291]:
 for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)]["Time Event Began"].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [292]:
DataFrame_dict["sheet2016"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 11
0,2016-01-04 05:15:00,05:15:00,2016-01-05 00:00:00,08:00:00,Wisconsin: Milwaukee County;,MRO,Sabotage,0,0,
1,2016-01-10 20:46:00,20:46:00,2016-01-11 00:00:00,05:25:00,Maine: Connecticut: Massachusetts: Vermont: Ne...,NPCC,Weather,Unknown,59859,
2,2016-01-11 20:16:00,20:16:00,2016-01-11 00:00:00,23:00:00,Pennsylvania: Chester County;,RFC,Vandalism,0,0,
3,2016-01-14 08:27:00,08:27:00,2016-01-14 00:00:00,12:00:00,Delaware:,RFC,Vandalism,0,0,
4,2016-01-17 12:00:00,12:00:00,2016-01-17 00:00:00,13:00:00,Utah: Salt Lake County;,WECC,Sabotage,0,0,
...,...,...,...,...,...,...,...,...,...,...
136,2016-12-16 07:45:00,07:45:00,12/16/2016,08:45:00,Oregon: Multnomah County;,WECC,Vandalism,0,0,
137,2016-12-26 04:00:00,04:00:00,12/26/2016,06:00:00,Washington:,WECC,Vandalism,0,0,
138,2016-12-28 04:03:00,04:03:00,12/31/2016,06:00:00,California:,WECC,Fuel Supply Deficiency,0,0,
139,2016-12-30 08:55:00,08:55:00,12/30/2016,08:56:00,Vermont: Chittenden County;,NPCC,Other,0,0,


In [293]:
# Now we want to combine the restoration date and restoration time columns. 
# But first we need to get rid of two problematic rows that are missing critical data

In [294]:
DataFrame_dict["sheet2022"].drop(66, axis=0, inplace=True)
DataFrame_dict["sheet2023"].drop(13, axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [295]:
for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)]["Time of Restoration"].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
post_cleaning_column_titles = ["datetime_event_began", "NERC_region", "area_affected", "event_type", "demand_loss_(MW)", "number_of_customers_affected", "datetime_of_restoration"]

In [302]:
DataFrame_dict["sheet2023"]

Unnamed: 0,Date Event Began,Date of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected
0,2023-01-01 00:06:00,2023-01-01 03:57:00,California: Riverside County;,WECC,Vandalism,0,0
1,2023-01-01 09:55:00,2023-01-01 10:30:00,California: Sacramento County;,WECC,System Operations,0,0
2,2023-01-01 14:47:00,2023-01-01 16:50:00,Florida: Citrus County;,SERC,Vandalism,19,3509
3,2023-01-02 16:12:00,2023-01-03 09:30:00,Texas: Dallas County;,TRE,Vandalism,0,0
4,2023-01-02 03:16:00,2023-01-02 16:46:00,New Mexico:,WECC,Vandalism,432,0
...,...,...,...,...,...,...,...
160,2023-06-25 19:30:00,2023-06-26 17:45:00,"Arkansas, Mississippi",SERC,- Weather or natural disaster,Unknown,64732
161,2023-06-27 17:37:00,2023-06-27 21:30:00,"Washington, Idaho, Montana",WECC,- Failure at high voltage substation or switch...,0,0
163,2023-06-28 11:01:00,2023-06-28 11:03:00,Georgia,SERC,- Vandalism - Theft,0,0
164,2023-06-22 02:30:00,2023-06-22 02:38:00,Louisiana,SERC,- Transmission equipment failure,17,6242


In [299]:
# Now that the "Date Event Began" column and "Date of Restoration" columns include datetime we can drop the "Time Event Began" and "Time of Restoration" columns 
# from the dataframes containing data for years 2011 - 2014

for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(["Time Event Began", "Time of Restoration"], axis=1)

KeyError: "['Time Event Began' 'Time of Restoration'] not found in axis"

In [304]:
# Now we need to reorder the columns so they are in the same order as the initial sheets we cleaned.

post_cleaning_column_titles = ["datetime_event_began", "NERC_region", "area_affected", "event_type", "demand_loss_(MW)", "number_of_customers_affected", "datetime_of_restoration"]

for sheet in list(range(2015,2024,1)):
    old_col = DataFrame_dict["sheet{0}".format(sheet)].columns.tolist()
    DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)][[old_col[0], old_col[3], old_col[2], old_col[4], old_col[5], old_col[6], old_col[1]]]

In [305]:
for sheet in list(range(2015,2024,1)):
    old_col = DataFrame_dict["sheet{0}".format(sheet)].columns.tolist()
    for n in range(len(post_cleaning_column_titles)):
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].rename(columns={old_col[n]: post_cleaning_column_titles[n]})
        
    if len(old_col) > 7: #Drop all additional columns
        for x in range(7,len(old_col),1):
            DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(old_col[x], axis=1)

In [307]:
DataFrame_dict["sheet2016"]

Unnamed: 0,datetime_event_began,NERC_region,area_affected,event_type,demand_loss_(MW),number_of_customers_affected,datetime_of_restoration
0,2016-01-04 05:15:00,MRO,Wisconsin: Milwaukee County;,Sabotage,0,0,2016-01-05 08:00:00
1,2016-01-10 20:46:00,NPCC,Maine: Connecticut: Massachusetts: Vermont: Ne...,Weather,Unknown,59859,2016-01-11 05:25:00
2,2016-01-11 20:16:00,RFC,Pennsylvania: Chester County;,Vandalism,0,0,2016-01-11 23:00:00
3,2016-01-14 08:27:00,RFC,Delaware:,Vandalism,0,0,2016-01-14 12:00:00
4,2016-01-17 12:00:00,WECC,Utah: Salt Lake County;,Sabotage,0,0,2016-01-17 13:00:00
...,...,...,...,...,...,...,...
136,2016-12-16 07:45:00,WECC,Oregon: Multnomah County;,Vandalism,0,0,2016-12-16 08:45:00
137,2016-12-26 04:00:00,WECC,Washington:,Vandalism,0,0,2016-12-26 06:00:00
138,2016-12-28 04:03:00,WECC,California:,Fuel Supply Deficiency,0,0,2016-12-31 06:00:00
139,2016-12-30 08:55:00,NPCC,Vermont: Chittenden County;,Other,0,0,2016-12-30 08:56:00


In [167]:
# Where are the unknowns:

for sheet in list(range(2015,2024,1)):
    is_unknown = 0
    for ind, row in DataFrame_dict["sheet{0}".format(sheet)].iterrows():
        if DataFrame_dict["sheet{0}".format(sheet)]["Time of Restoration"][ind] == "Unknown ":
            is_unknown += 1
    print(sheet, is_unknown)
            
        

2015 0
2016 0
2017 0
2018 0
2019 0
2020 0
2021 0
2022 0
2023 0


In [170]:
DataFrame_dict["sheet2023"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected


In [273]:
DataFrame_dict["sheet2023"]['Date of Restoration'][13]

'Unkonwn'

In [None]:
## To try - clarify the date format you want

for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)]["Time of Restoration"].astype(str))

In [None]:
for sheet in list(range(2015,2024,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)]["Time of Restoration"].astype(str))

In [None]:
def combine_dt_drop_time(start_year, stop_year, name_of_date_col, name_of_time_col):
    for sheet in list(range(start_year,stop_year,1)):
        
        for ind, row in DataFrame_dict["sheet{0}".format(sheet)].iterrows(): #First confirm that the date col actually is a datetime with date info only
            if type(DataFrame_dict["sheet{0}".format(sheet)][name_of_date_col][ind]) != datetime:
                DataFrame_dict["sheet{0}".format(sheet)][name_of_date_col][ind] = datetime.now().date()
                
            if type(DataFrame_dict["sheet{0}".format(sheet)][name_of_time_col][ind]) == str: #Also confirm there are no unexpected values in the time col
                DataFrame_dict["sheet{0}".format(sheet)][name_of_time_col][ind] = "00:00:00"
            
            if type(DataFrame_dict["sheet{0}".format(sheet)][name_of_time_col][ind]) == None:
                DataFrame_dict["sheet{0}".format(sheet)][name_of_time_col][ind] = "00:00:00"
            
        
        DataFrame_dict["sheet{0}".format(sheet)][name_of_date_col] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)][name_of_date_col].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)][name_of_time_col].astype(str))
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop([name_of_time_col], axis=1)

In [None]:
combine_dt_drop_time(2015,2023, "Date Event Began", "Time Event Began")