In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium 
import openpyxl

In [97]:
# Define the sheet names in a list so that we can easily iterate through the excel sheets (tabs). 
# Each sheet is a year between 2002 and 2023

sheet_names = []
for n in range(2011, 2015, 1):
    sheet_names.append(str(n))

In [98]:
# Creating a dictionary of dataframes, one dataframe per excel sheet. 
# this will allow us to tailor our data cleaning to the individual sheet since the format of the sheet changes over the years.

DataFrame_dict = {}


for sheet in sheet_names:
    DataFrame_dict["sheet{0}".format(sheet)] = pd.read_excel('DOE_Electric_Disturbance_Events.xlsx', engine='openpyxl', header=1, sheet_name=sheet)

In [99]:
DataFrame_dict["sheet2012"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,January,,,,,,,,,,,
1,2012-01-04 00:00:00,12:14:00,2012-01-04 00:00:00,12:14:00,"Tacoma, Washington",WECC,Suspected physical attack,,,,,
2,2012-01-05 00:00:00,10:35:00,2012-01-05 00:00:00,12:25:00,"CSWS/AEP West territory, Oklahoma",SPP,Sabotage,0,0,,,
3,2012-01-05 00:00:00,10:28:00,2012-01-05 00:00:00,12:25:00,"Creek County, Oklahoma",SPP,Suspected physical attack,,,,,
4,2012-01-09 00:00:00,14:30:00,2012-01-09 00:00:00,15:30:00,"Watertown, Connecticut",NPCC,Vandalism,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
217,2012-12-25 00:00:00,00:45:00,2012-12-28 00:00:00,16:15:00,Arkansas; Louisiana; Mississippi; Texas,SPP,Severe Weather - Winter Storm,Unknown,242509,,,
218,2012-12-25 00:00:00,09:28:00,2012-12-26 00:00:00,16:28:00,"Houston, Texas",TRE,"Severe Weather - Cold Front, High Winds",294,262000,,,
219,2012-12-26 00:00:00,14:50:00,2012-12-26 00:00:00,19:40:00,"Stantonsburg, North Carolina",SERC,Severe Weather - Thunderstorm,3,1200,,,
220,2012-12-31 00:00:00,14:21:00,2012-12-31 00:00:00,16:30:00,North Carolina,SERC,Transmission Interruption,40,12000,,,


In [100]:
from datetime import datetime

def drop_non_datetime_values(dataframe_name, column_name):
    for ind, row in dataframe_name.iterrows():
        if type(dataframe_name[column_name][ind]) == datetime:
            pass
        else:
            dataframe_name.drop(axis=0, index=ind, inplace=True)
    return dataframe_name

In [101]:
for sheet in list(range(2011,2015,1)):
    drop_non_datetime_values(DataFrame_dict["sheet{0}".format(sheet)], "Date Event Began")

In [102]:
#Let us make sure the first column contains the date only. initially some cells had date and time values which resulted in an error when I tried to merge columns later on

for sheet in list(range(2011,2014,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began']).dt.date

In [103]:
DataFrame_dict["sheet2012"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 9,Unnamed: 10,Unnamed: 11
1,2012-01-04,12:14:00,2012-01-04 00:00:00,12:14:00,"Tacoma, Washington",WECC,Suspected physical attack,,,,,
2,2012-01-05,10:35:00,2012-01-05 00:00:00,12:25:00,"CSWS/AEP West territory, Oklahoma",SPP,Sabotage,0,0,,,
3,2012-01-05,10:28:00,2012-01-05 00:00:00,12:25:00,"Creek County, Oklahoma",SPP,Suspected physical attack,,,,,
4,2012-01-09,14:30:00,2012-01-09 00:00:00,15:30:00,"Watertown, Connecticut",NPCC,Vandalism,,,,,
5,2012-01-09,13:36:00,2012-01-11 00:00:00,01:05:00,Louisiana,SERC,Load Shed,150,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
216,2012-12-17,06:55:00,2012-12-17 00:00:00,07:00:00,"Tacoma, Washington",WECC,Suspected Physical Attack,0,0,,,
217,2012-12-25,00:45:00,2012-12-28 00:00:00,16:15:00,Arkansas; Louisiana; Mississippi; Texas,SPP,Severe Weather - Winter Storm,Unknown,242509,,,
218,2012-12-25,09:28:00,2012-12-26 00:00:00,16:28:00,"Houston, Texas",TRE,"Severe Weather - Cold Front, High Winds",294,262000,,,
219,2012-12-26,14:50:00,2012-12-26 00:00:00,19:40:00,"Stantonsburg, North Carolina",SERC,Severe Weather - Thunderstorm,3,1200,,,


In [104]:
for sheet in list(range(2011,2015,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date Event Began'].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)]["Time Event Began"].astype(str))

In [105]:
DataFrame_dict["sheet2012"]["Date of Restoration"][3]
DataFrame_dict["sheet2012"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 9,Unnamed: 10,Unnamed: 11
1,2012-01-04 12:14:00,12:14:00,2012-01-04 00:00:00,12:14:00,"Tacoma, Washington",WECC,Suspected physical attack,,,,,
2,2012-01-05 10:35:00,10:35:00,2012-01-05 00:00:00,12:25:00,"CSWS/AEP West territory, Oklahoma",SPP,Sabotage,0,0,,,
3,2012-01-05 10:28:00,10:28:00,2012-01-05 00:00:00,12:25:00,"Creek County, Oklahoma",SPP,Suspected physical attack,,,,,
4,2012-01-09 14:30:00,14:30:00,2012-01-09 00:00:00,15:30:00,"Watertown, Connecticut",NPCC,Vandalism,,,,,
5,2012-01-09 13:36:00,13:36:00,2012-01-11 00:00:00,01:05:00,Louisiana,SERC,Load Shed,150,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
216,2012-12-17 06:55:00,06:55:00,2012-12-17 00:00:00,07:00:00,"Tacoma, Washington",WECC,Suspected Physical Attack,0,0,,,
217,2012-12-25 00:45:00,00:45:00,2012-12-28 00:00:00,16:15:00,Arkansas; Louisiana; Mississippi; Texas,SPP,Severe Weather - Winter Storm,Unknown,242509,,,
218,2012-12-25 09:28:00,09:28:00,2012-12-26 00:00:00,16:28:00,"Houston, Texas",TRE,"Severe Weather - Cold Front, High Winds",294,262000,,,
219,2012-12-26 14:50:00,14:50:00,2012-12-26 00:00:00,19:40:00,"Stantonsburg, North Carolina",SERC,Severe Weather - Thunderstorm,3,1200,,,


In [106]:
#Let us make sure the third column contains the date only. initially some cells had date and time values which resulted in an error when I tried to merge columns later on

for sheet in list(range(2011,2015,1)):
    for ind, row in DataFrame_dict["sheet{0}".format(sheet)].iterrows():
        if type(DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'][ind]) != datetime:
            DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'][ind] = datetime.now().date()
            #DataFrame_dict["sheet{0}".format(sheet)]['Time of Restoration'][ind] = datetime.now().time()
            
        if type(DataFrame_dict["sheet{0}".format(sheet)]['Time of Restoration'][ind]) == str:
            DataFrame_dict["sheet{0}".format(sheet)]['Time of Restoration'][ind] = "00:00:00"
    
    DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration']).dt.date
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [107]:
DataFrame_dict["sheet2012"]["Time of Restoration"][3]

datetime.time(12, 25)

In [108]:
for sheet in list(range(2011,2015,1)):
    DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'] = pd.to_datetime(DataFrame_dict["sheet{0}".format(sheet)]['Date of Restoration'].astype(str) + ' ' + DataFrame_dict["sheet{0}".format(sheet)]["Time of Restoration"].astype(str))

In [109]:
DataFrame_dict["sheet2012"]

Unnamed: 0,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 9,Unnamed: 10,Unnamed: 11
1,2012-01-04 12:14:00,12:14:00,2012-01-04 12:14:00,12:14:00,"Tacoma, Washington",WECC,Suspected physical attack,,,,,
2,2012-01-05 10:35:00,10:35:00,2012-01-05 12:25:00,12:25:00,"CSWS/AEP West territory, Oklahoma",SPP,Sabotage,0,0,,,
3,2012-01-05 10:28:00,10:28:00,2012-01-05 12:25:00,12:25:00,"Creek County, Oklahoma",SPP,Suspected physical attack,,,,,
4,2012-01-09 14:30:00,14:30:00,2012-01-09 15:30:00,15:30:00,"Watertown, Connecticut",NPCC,Vandalism,,,,,
5,2012-01-09 13:36:00,13:36:00,2012-01-11 01:05:00,01:05:00,Louisiana,SERC,Load Shed,150,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
216,2012-12-17 06:55:00,06:55:00,2012-12-17 07:00:00,07:00:00,"Tacoma, Washington",WECC,Suspected Physical Attack,0,0,,,
217,2012-12-25 00:45:00,00:45:00,2012-12-28 16:15:00,16:15:00,Arkansas; Louisiana; Mississippi; Texas,SPP,Severe Weather - Winter Storm,Unknown,242509,,,
218,2012-12-25 09:28:00,09:28:00,2012-12-26 16:28:00,16:28:00,"Houston, Texas",TRE,"Severe Weather - Cold Front, High Winds",294,262000,,,
219,2012-12-26 14:50:00,14:50:00,2012-12-26 19:40:00,19:40:00,"Stantonsburg, North Carolina",SERC,Severe Weather - Thunderstorm,3,1200,,,


In [112]:
# Now that the "Date Event Began" column and "Date of Restoration" columns include datetime we can drop the "Time Event Began" and "Time of Restoration" columns 
# from the dataframes containing data for years 2011 - 2014

for sheet in list(range(2011,2015,1)):
    DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(["Time Event Began", "Time of Restoration"], axis=1)

KeyError: "['Time Event Began' 'Time of Restoration'] not found in axis"

In [113]:
DataFrame_dict["sheet2012"]

Unnamed: 0,Date Event Began,Date of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected
1,2012-01-04 12:14:00,2012-01-04 12:14:00,"Tacoma, Washington",WECC,Suspected physical attack,,
2,2012-01-05 10:35:00,2012-01-05 12:25:00,"CSWS/AEP West territory, Oklahoma",SPP,Sabotage,0,0
3,2012-01-05 10:28:00,2012-01-05 12:25:00,"Creek County, Oklahoma",SPP,Suspected physical attack,,
4,2012-01-09 14:30:00,2012-01-09 15:30:00,"Watertown, Connecticut",NPCC,Vandalism,,
5,2012-01-09 13:36:00,2012-01-11 01:05:00,Louisiana,SERC,Load Shed,150,1
...,...,...,...,...,...,...,...
216,2012-12-17 06:55:00,2012-12-17 07:00:00,"Tacoma, Washington",WECC,Suspected Physical Attack,0,0
217,2012-12-25 00:45:00,2012-12-28 16:15:00,Arkansas; Louisiana; Mississippi; Texas,SPP,Severe Weather - Winter Storm,Unknown,242509
218,2012-12-25 09:28:00,2012-12-26 16:28:00,"Houston, Texas",TRE,"Severe Weather - Cold Front, High Winds",294,262000
219,2012-12-26 14:50:00,2012-12-26 19:40:00,"Stantonsburg, North Carolina",SERC,Severe Weather - Thunderstorm,3,1200


In [93]:
post_cleaning_column_titles = ["datetime_event_began", "NERC_region", "area_affected", "event_type", "demand_loss_(MW)", "number_of_customers_affected", "datetime_of_restoration"]

In [115]:
# Now we need to reorder the columns so they are in the same order as the initial sheets we cleaned.

for sheet in list(range(2011,2015,1)):
    old_col = DataFrame_dict["sheet{0}".format(sheet)].columns.tolist()
    DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)][[old_col[0], old_col[3], old_col[2], old_col[4], old_col[5], old_col[6], old_col[1]]]

In [117]:
for sheet in list(range(2011,2015,1)):
    old_col = DataFrame_dict["sheet{0}".format(sheet)].columns.tolist()
    for n in range(len(post_cleaning_column_titles)):
        DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].rename(columns={old_col[n]: post_cleaning_column_titles[n]})
        
    if len(old_col) > 7: #Drop all additional columns
        for x in range(7,len(old_col),1):
            DataFrame_dict["sheet{0}".format(sheet)] = DataFrame_dict["sheet{0}".format(sheet)].drop(old_col[x], axis=1)

In [118]:
DataFrame_dict["sheet2012"]

Unnamed: 0,datetime_event_began,NERC_region,area_affected,event_type,demand_loss_(MW),number_of_customers_affected,datetime_of_restoration
1,2012-01-04 12:14:00,WECC,"Tacoma, Washington",Suspected physical attack,,,2012-01-04 12:14:00
2,2012-01-05 10:35:00,SPP,"CSWS/AEP West territory, Oklahoma",Sabotage,0,0,2012-01-05 12:25:00
3,2012-01-05 10:28:00,SPP,"Creek County, Oklahoma",Suspected physical attack,,,2012-01-05 12:25:00
4,2012-01-09 14:30:00,NPCC,"Watertown, Connecticut",Vandalism,,,2012-01-09 15:30:00
5,2012-01-09 13:36:00,SERC,Louisiana,Load Shed,150,1,2012-01-11 01:05:00
...,...,...,...,...,...,...,...
216,2012-12-17 06:55:00,WECC,"Tacoma, Washington",Suspected Physical Attack,0,0,2012-12-17 07:00:00
217,2012-12-25 00:45:00,SPP,Arkansas; Louisiana; Mississippi; Texas,Severe Weather - Winter Storm,Unknown,242509,2012-12-28 16:15:00
218,2012-12-25 09:28:00,TRE,"Houston, Texas","Severe Weather - Cold Front, High Winds",294,262000,2012-12-26 16:28:00
219,2012-12-26 14:50:00,SERC,"Stantonsburg, North Carolina",Severe Weather - Thunderstorm,3,1200,2012-12-26 19:40:00
