# Maven Power Outage Challenge

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium 
import openpyxl

## Step 0. Initial exploration of the data - it is messy!

In [2]:
# Each sheet (tab) of the workbook is named after the year it covers, 2002 through 2023.
# Keep those names in one list so that later cells can iterate over the sheets.

sheet_names = [str(year) for year in range(2002, 2024)]

In [3]:
# Build a dictionary of DataFrames, one per Excel sheet, keyed "sheet<year>".
# Keeping the sheets separate lets the cleaning be tailored to each one, since the sheet
# layout changes over the years.
#
# The workbook is opened once through pd.ExcelFile and every sheet is parsed from that
# handle, instead of re-opening and re-parsing the whole .xlsx file for each sheet.

DataFrame_dict = {}

with pd.ExcelFile('DOE_Electric_Disturbance_Events.xlsx', engine='openpyxl') as workbook:
    for sheet in sheet_names:
        # The 2002 and 2008 sheets are read with their header one row lower than the rest.
        header_value = 2 if sheet in ["2002", "2008"] else 1

        DataFrame_dict["sheet{0}".format(sheet)] = pd.read_excel(workbook, header=header_value, sheet_name=sheet)

In [4]:
# Now each Excel sheet has been transformed into a DataFrame, and I have taken into account the header rows of the original sheets. 

In [5]:
# Spot-check one of the DataFrames (the 2008 sheet) to see if we get the data we expect:
DataFrame_dict["sheet2008"].head()

Unnamed: 0,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,,,,,,,Date/Time,,,,
1,January,,,,,,,,,,,
2,2008-01-04 00:00:00,WECC,4:00 a.m.,Northern California,Winter Storm,500.0,2606931,5:00 p.m. January 14,,,,
3,2008-01-04 00:00:00,WECC,7:47 a.m.,Sacramento County,Severe Storm,300.0,150000,4:30 p.m. January 04,,,,
4,2008-01-29 00:00:00,WECC,5:00 a.m.,"San Francisco Bay Area, California",Exciter Faulted,,-,12:17 p.m. January 29,,,,


The data above is what I expect; however, it is still very messy! But before I start cleaning each individual dataframe, 
I would like to create one single dataframe which stores all the data. Doing this will involve a few steps:
- First, I need to make sure the columns in each dataframe match (i.e. contain the same feature information)
- Then I will concatenate the source dataframes to create a single data source

In [6]:
# Before comparing the real column names, try out DataFrame concatenation on two small
# toy frames. This is the first of them.

example_df1 = pd.DataFrame(
    dict(
        ID=[78, 88, 98],
        Name=["Nick", "Stan", "Ludwig"],
        CGPA=[2.3, 2.5, 3.2],
        Dept=["EEE", "BA", "LAW"],
        Region=["Puerto Rico", "Miami", "Malmo"],
    )
)

example_df1

Unnamed: 0,ID,Name,CGPA,Dept,Region
0,78,Nick,2.3,EEE,Puerto Rico
1,88,Stan,2.5,BA,Miami
2,98,Ludwig,3.2,LAW,Malmo


In [7]:
# Second toy frame: same columns as example_df1, but five rows.
example_df2 = pd.DataFrame(
    dict(
        ID=[15, 16, 17, 18, 19],
        Name=["Abid", "Matthew", "Nisha", "Natassha", "Nahla"],
        CGPA=[2.3, 3.0, 3.9, 2.5, 3.2],
        Dept=["EEE", "IT", "CS", "BA", "LAW"],
        Region=["Islamabad", "Ontario", "London", "Saba", "Denver"],
    )
)

example_df2

Unnamed: 0,ID,Name,CGPA,Dept,Region
0,15,Abid,2.3,EEE,Islamabad
1,16,Matthew,3.0,IT,Ontario
2,17,Nisha,3.9,CS,London
3,18,Natassha,2.5,BA,Saba
4,19,Nahla,3.2,LAW,Denver


In [8]:
# Stack the two example frames on top of each other.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; pd.concat is the
# supported replacement. Like append, it keeps each frame's original row labels, which is
# why the index of the result repeats 0, 1, 2.
combined_dataframe = pd.concat([example_df1, example_df2])
combined_dataframe

Unnamed: 0,ID,Name,CGPA,Dept,Region
0,78,Nick,2.3,EEE,Puerto Rico
1,88,Stan,2.5,BA,Miami
2,98,Ludwig,3.2,LAW,Malmo
0,15,Abid,2.3,EEE,Islamabad
1,16,Matthew,3.0,IT,Ontario
2,17,Nisha,3.9,CS,London
3,18,Natassha,2.5,BA,Saba
4,19,Nahla,3.2,LAW,Denver


In [9]:
# Collect the column titles of every sheet's DataFrame so the layouts can be compared
# side by side. sheet_names is the list of year strings defined at the top of this notebook.
column_title_dictionary = {
    f"sheet{name}": DataFrame_dict[f"sheet{name}"].columns.tolist()
    for name in sheet_names
}

# orient='index' makes each dictionary key a row label, so there is one row per sheet and
# one column per column position.
column_title_df = pd.DataFrame.from_dict(column_title_dictionary, orient='index')
column_title_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
sheet2002,Date,NERC Region,Time,Area,Type of Disturbance,Loss (megawatts),Number of Customers Affected,Restoration Time,,,,,,,,,,,
sheet2003,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,,,,,,,,,,,
sheet2004,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,,,,,,,,,,,
sheet2005,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,,,,,,,,,,,
sheet2006,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,,,,,,,,,,,
sheet2007,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1[1],Restoration,,,,,,,,,,,
sheet2008,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,,,,,,,
sheet2009,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,,,,,,,,,,,
sheet2010,Date,NERC Region,Time,Area Affected,Type of Disturbance,Loss (megawatts),Number of Customers Affected 1,Restoration,,,,,,,,,,,
sheet2011,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Area Affected,NERC Region,Event Type,Demand Loss (MW),Number of Customers Affected,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18


In [10]:
# Target schema: the column names every cleaned yearly DataFrame will share.
post_cleaning_column_titles = [
    "datetime_event_began",
    "NERC_region",
    "area_affected",
    "event_type",
    "demand_loss_(MW)",
    "number_of_customers_affected",
    "datetime_of_restoration",
]
print(len(post_cleaning_column_titles))  # the cleaned, concatenated DataFrame will have 7 columns

7


### How will I manipulate the dataframes so that I have seven consistent features across all dataframes?
From 2002 to 2010:
- reorder columns so that columns 1 and 2 switch places (i.e. "Time" comes before "NERC Region")
- concatenate columns 0 and 1. This value will overwrite the existing column 0 and should leave you with one column that contains datetime_event_began
- rename columns so that they match the names from post_cleaning_column_titles

From 2011 to 2014:
- 

In [11]:
# For the 2002-2010 DataFrames, swap the columns at positions 1 and 2 so that "Time" comes
# directly after "Date" and ahead of "NERC Region".
# NOTE: the swap is purely positional, so running this cell a second time swaps them back.
for sheet in range(2002, 2011):
    frame_key = "sheet{0}".format(sheet)
    temp_cols_list = list(DataFrame_dict[frame_key].columns)
    temp_cols_list[1], temp_cols_list[2] = temp_cols_list[2], temp_cols_list[1]
    DataFrame_dict[frame_key] = DataFrame_dict[frame_key][temp_cols_list]

DataFrame_dict["sheet2002"]

Unnamed: 0,Date,Time,NERC Region,Area,Type of Disturbance,Loss (megawatts),Number of Customers Affected,Restoration Time
0,January,,,,,,,
1,2002-01-30 00:00:00,06:00:00,SPP,Oklahoma,Ice Storm,500,1881134,2002-02-07 12:00:00
2,,,,,,,,
3,2002-01-29 00:00:00,Evening,SPP,Metropolitan Kansas City Area,Ice Storm,500-600,270000,
4,2002-01-30 00:00:00,16:00:00,SPP,Missouri,Ice Storm,210,95000,2002-02-10 21:00:00
5,February,,,,,,,
6,2002-02-27 00:00:00,10:48:00,WSCC,California,Interruption of Firm Load,300,255000,2002-02-27 11:35:00
7,March,,,,,,,
8,2002-03-09 00:00:00,00:00:00,ECAR,Lower Peninsula of Michigan,Severe Weather,190,190000,2002-03-11 12:00:00
9,April,,,,,,,


In [12]:
# Helper for removing rows whose "Date"-like cell is not an actual date/time. Such rows are
# in an unexpected format and are most likely human-readable markers (e.g. month names)
# rather than real records.

from datetime import datetime

def drop_non_datetime_values(dataframe_name, column_name):
    """Drop, in place, every row whose value in `column_name` is not a real datetime.

    A value counts as a real datetime when it is a `datetime.datetime` instance
    (including subclasses such as `pandas.Timestamp`) and is not missing (NaT).
    Using `isinstance` rather than an exact type comparison matters when the column
    has a datetime64 dtype: its elements are `pandas.Timestamp` objects, and an exact
    `type(...) == datetime` check would reject, and therefore drop, every row.

    Parameters
    ----------
    dataframe_name : pandas.DataFrame
        Frame to clean. It is modified in place.
    column_name : str
        Name of the column whose values are inspected.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after the rows were removed.

    Notes
    -----
    Rows are removed by index label, so if labels are duplicated, every row sharing a
    label with a rejected row is removed as well.
    """
    # Decide for every row first, then drop once; this avoids mutating the frame while it
    # is being iterated and avoids chained indexing.
    keep_row = [
        isinstance(value, datetime) and not pd.isna(value)
        for value in dataframe_name[column_name]
    ]
    labels_to_drop = [
        label for label, keep in zip(dataframe_name.index, keep_row) if not keep
    ]
    dataframe_name.drop(index=labels_to_drop, inplace=True)
    return dataframe_name

In [13]:
# Strip the rows that are redundant or that repeat the column names from each 2002-2010 sheet.
# The helper modifies each DataFrame in place, so its return value is not needed here.
for sheet in range(2002, 2011):
    drop_non_datetime_values(DataFrame_dict[f"sheet{sheet}"], "Date")

In [14]:
# Preview the 2002 sheet: only rows whose "Date" is a real datetime should remain.
DataFrame_dict["sheet2002"].head()

Unnamed: 0,Date,Time,NERC Region,Area,Type of Disturbance,Loss (megawatts),Number of Customers Affected,Restoration Time
1,2002-01-30 00:00:00,06:00:00,SPP,Oklahoma,Ice Storm,500,1881134,2002-02-07 12:00:00
3,2002-01-29 00:00:00,Evening,SPP,Metropolitan Kansas City Area,Ice Storm,500-600,270000,
4,2002-01-30 00:00:00,16:00:00,SPP,Missouri,Ice Storm,210,95000,2002-02-10 21:00:00
6,2002-02-27 00:00:00,10:48:00,WSCC,California,Interruption of Firm Load,300,255000,2002-02-27 11:35:00
8,2002-03-09 00:00:00,00:00:00,ECAR,Lower Peninsula of Michigan,Severe Weather,190,190000,2002-03-11 12:00:00


In [15]:
# Some "Time" values cannot be converted to a time datatype because of how they were
# entered in the source (for example "Evening" or "Ongoing"). They are replaced one by one,
# keyed by (DataFrame key, row label), to preserve as much information as possible.
manual_time_fixes = {
    ("sheet2002", 3): "20:00:00",   # original value = "Evening"
    ("sheet2003", 56): "15:00:00",
    ("sheet2003", 78): "12:00:00",
    ("sheet2004", 3): "00:00:00",
    ("sheet2004", 81): "09:52:00",
    ("sheet2005", 13): "17:28:00",
    ("sheet2005", 21): "00:00:00",
    ("sheet2006", 12): "00:00:00",  # original value = "Ongoing"
}

for (frame_key, row_label), fixed_time in manual_time_fixes.items():
    DataFrame_dict[frame_key].loc[row_label, "Time"] = fixed_time

# This is the code I used to find the index value of the problematic time data:
# DataFrame_dict["sheet2006"][DataFrame_dict["sheet2006"]["Time"] == "Ongoing"].index.values[0]

DataFrame_dict["sheet2002"].head()

Unnamed: 0,Date,Time,NERC Region,Area,Type of Disturbance,Loss (megawatts),Number of Customers Affected,Restoration Time
1,2002-01-30 00:00:00,06:00:00,SPP,Oklahoma,Ice Storm,500,1881134,2002-02-07 12:00:00
3,2002-01-29 00:00:00,20:00:00,SPP,Metropolitan Kansas City Area,Ice Storm,500-600,270000,
4,2002-01-30 00:00:00,16:00:00,SPP,Missouri,Ice Storm,210,95000,2002-02-10 21:00:00
6,2002-02-27 00:00:00,10:48:00,WSCC,California,Interruption of Firm Load,300,255000,2002-02-27 11:35:00
8,2002-03-09 00:00:00,00:00:00,ECAR,Lower Peninsula of Michigan,Severe Weather,190,190000,2002-03-11 12:00:00


In [16]:
# The date and time columns are now in a form pd.to_datetime can recognise, so merge them:
# the text of "Date" and "Time" is joined with a space, parsed, and written back to "Date".

for sheet in range(2002, 2011):
    yearly_frame = DataFrame_dict["sheet{0}".format(sheet)]
    date_text = yearly_frame['Date'].astype(str)
    time_text = yearly_frame['Time'].astype(str)
    yearly_frame['Date'] = pd.to_datetime(date_text + ' ' + time_text)

In [17]:
#Concatenated columns 0 and 1. The result overwrites the existing column 0, leaving one column that contains datetime_event_began

In [18]:
# Preview the 2002 sheet: "Date" should now carry the event's time of day as well.
DataFrame_dict["sheet2002"].head()

Unnamed: 0,Date,Time,NERC Region,Area,Type of Disturbance,Loss (megawatts),Number of Customers Affected,Restoration Time
1,2002-01-30 06:00:00,06:00:00,SPP,Oklahoma,Ice Storm,500,1881134,2002-02-07 12:00:00
3,2002-01-29 20:00:00,20:00:00,SPP,Metropolitan Kansas City Area,Ice Storm,500-600,270000,
4,2002-01-30 16:00:00,16:00:00,SPP,Missouri,Ice Storm,210,95000,2002-02-10 21:00:00
6,2002-02-27 10:48:00,10:48:00,WSCC,California,Interruption of Firm Load,300,255000,2002-02-27 11:35:00
8,2002-03-09 00:00:00,00:00:00,ECAR,Lower Peninsula of Michigan,Severe Weather,190,190000,2002-03-11 12:00:00


In [19]:
# Now that the time data is included in the first column of our dataframe we can drop the "Time" column from the dataframes containing data for years 2002 - 2010

In [20]:
# Remove the "Time" column from every 2002-2010 DataFrame.
for sheet in range(2002, 2011):
    frame_key = f"sheet{sheet}"
    DataFrame_dict[frame_key] = DataFrame_dict[frame_key].drop(columns="Time")

In [21]:
# Preview the 2002 sheet after the "Time" column has been dropped.
DataFrame_dict["sheet2002"].head()

Unnamed: 0,Date,NERC Region,Area,Type of Disturbance,Loss (megawatts),Number of Customers Affected,Restoration Time
1,2002-01-30 06:00:00,SPP,Oklahoma,Ice Storm,500,1881134,2002-02-07 12:00:00
3,2002-01-29 20:00:00,SPP,Metropolitan Kansas City Area,Ice Storm,500-600,270000,
4,2002-01-30 16:00:00,SPP,Missouri,Ice Storm,210,95000,2002-02-10 21:00:00
6,2002-02-27 10:48:00,WSCC,California,Interruption of Firm Load,300,255000,2002-02-27 11:35:00
8,2002-03-09 00:00:00,ECAR,Lower Peninsula of Michigan,Severe Weather,190,190000,2002-03-11 12:00:00


In [25]:
# Reminder of how many standard column names the cleaned DataFrames will use.
len(post_cleaning_column_titles)

7

In [34]:
# Now that the DataFrames for 2002 - 2010 are fairly clean, standardise their column names
# so that the DataFrames can be concatenated.
#
# Columns are matched to post_cleaning_column_titles by position: the first
# len(post_cleaning_column_titles) columns are kept and relabelled, and any additional
# columns (e.g. the "Unnamed: N" ones) are discarded. The column count is taken from the
# list itself rather than a hard-coded 7, and each frame is sliced and relabelled once
# instead of being copied for every single rename/drop.
# A frame with fewer columns than there are standard titles raises a ValueError when the
# new labels are assigned.

n_standard_columns = len(post_cleaning_column_titles)

for sheet in range(2002, 2011):
    frame_key = "sheet{0}".format(sheet)
    standardised = DataFrame_dict[frame_key].iloc[:, :n_standard_columns].copy()
    standardised.columns = post_cleaning_column_titles
    DataFrame_dict[frame_key] = standardised


In [35]:
# Preview the 2003 sheet to check the standardised column names.
DataFrame_dict["sheet2003"].head()

Unnamed: 0,datetime_event_began,NERC_region,area_affected,event_type,demand_loss_(MW),number_of_customers_affected,datetime_of_restoration
2,2003-01-25 14:00:00,ECAR,"Cincinnati, Ohio",Cyber Threat From Internet,,,"1/26/03, 2:00 a.m."
4,2003-02-27 11:32:00,SERC,"Piedmont, North Carolina",Winter Ice Storm,1000,"over 340,000","3/01/03, 8:00 a.m."
8,2003-04-03 19:00:00,ECAR,Lower Michigan Peninsula,Ice Storm,300,425000,"4/06/03, 5:00 p.m."
9,2003-04-04 03:11:00,NPCC,"New York,",Severe Storm,200-250,160000,"4/05/03, 2:00 p.m."
12,2003-04-15 11:00:00,ERCOT,"Cities of Bryan,",Relaying Malfunction,212,68530,"4/15/03, 2:06 p.m."


In [36]:
# Now that the DataFrames for 2002 - 2010 are fairly clean and standardised, stack them
# into a single DataFrame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0; concatenating once also avoids re-copying the growing frame on every loop
# iteration. Row labels from the individual sheets are kept as they are, just as append did.

cleaned_combined_df = pd.concat(
    [DataFrame_dict["sheet{0}".format(sheet)] for sheet in range(2002, 2011)]
)

cleaned_combined_df

Unnamed: 0,datetime_event_began,NERC_region,area_affected,event_type,demand_loss_(MW),number_of_customers_affected,datetime_of_restoration
1,2002-01-30 06:00:00,SPP,Oklahoma,Ice Storm,500,1881134,2002-02-07 12:00:00
3,2002-01-29 20:00:00,SPP,Metropolitan Kansas City Area,Ice Storm,500-600,270000,
4,2002-01-30 16:00:00,SPP,Missouri,Ice Storm,210,95000,2002-02-10 21:00:00
6,2002-02-27 10:48:00,WSCC,California,Interruption of Firm Load,300,255000,2002-02-27 11:35:00
8,2002-03-09 00:00:00,ECAR,Lower Peninsula of Michigan,Severe Weather,190,190000,2002-03-11 12:00:00
...,...,...,...,...,...,...,...
136,2010-12-14 07:20:00,WECC,California,Electrical System Separation - Islanding,9,6635,7:25 a.m. December 14
137,2010-12-14 07:36:00,WECC,Southern California,Transmission Equipment/Firm System Load,464,,9:00 a.m. December 15
138,2010-12-18 05:00:00,WECC,"Redmond, Washington",Severe Weather,184,92090,10:00 p.m. December 19
139,2010-12-26 08:15:00,SERC,Carolina,Severe Weather,,42000,4:15 p.m. December 26


In [6]:
# Inspect the column labels of `disturbances`.
# NOTE(review): no cell visible in this notebook creates `disturbances`; unless it is defined
# elsewhere, this cell raises NameError on a fresh kernel. Judging by its column labels it is
# presumably the 2002 sheet read with the default header row — TODO confirm and add the load.
disturbances.columns

Index(['Table B.2.', 'Major Disturbances and Unusual Occurrences, 2002',
       'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6',
       'Unnamed: 7'],
      dtype='object')

Initial notes about data:
1. the column headers seem to be located in row 1
2. date formats are inconsistent
3. I will need to check what NERC Region means
4. What does area refer to? Sometimes it is an entire state while other times it is a more specific part of a state.
5. lots of NaN that needs to be addressed
6. some columns have inconsistent data located in them - perhaps these rows need to be removed

In [7]:
#What is the shape of the data (rows, columns):
# NOTE(review): relies on `disturbances`, which no visible cell defines — TODO confirm where it is loaded.

disturbances.shape

(41, 8)

## Step 1. Creating a rough data cleaning plan in pseudocode 

After taking an initial look at the data I know I want to use the python package openpyxl to clean/ wrangle the data programmatically. Here is what I want to accomplish:
0. Load workbook so that we can read the data from the excel sheet with openpyxl
1. Initialise an empty pandas DataFrame. Define the column names: ["Date", "NERC Region", "Time", "Area Affected",	"Type of Disturbance", "Loss (megawatts)",	"Number of Customers Affected", "Restoration Date/Time"]
2. for sheets 2002 - 2010 we have columns A-H (in these sheets date and time of restoration are in one column called "Restoration Date/Time" which changes to two columns in 2011: "Date of Restoration", "Time of Restoration"):
    a. Select one sheet within the Excel spreadsheet. Loop through the sheet line by line
    b. If row starts with "Table", remove row (or skip row)
    c. If row starts with a month or the word "Date", skip row
    d. else, append row to the DataFrame
2. for 2011-2014 we have columns A-I (look at data description to find out the new structure of the timing record). In 2011 "Type of disturbance" changes to "Event type".
3. for sheets 2015 - 2023 we have columns A-K: two columns have been added: "Month" (which replaces the row markers within the sheets) and "Alert Criteria"
3. Repeat step 2 for all sheets in the file
4. Check the output, start proper exploratory data analysis

In [19]:
#Step 0: Exploring openpyxl
# Load the workbook; individual sheets can then be looked up by their name.
excel_workbook = openpyxl.load_workbook("DOE_Electric_Disturbance_Events.xlsx")

# This is an example of how to return data from a predefined sheet:
# it prints the value stored in cell F7 of the "2003" sheet.
print(excel_workbook["2003"]["F7"].value) 

1000


In [23]:
#slicing the data - getting all cells from a column. I could also use iter_rows or iter_cols
# Indexing the "2015" sheet with the column letter "A" gives back that column's Cell objects
# (as the printed output shows), not the values stored in them.
print(excel_workbook["2015"]["A"])

(<Cell '2015'.A1>, <Cell '2015'.A2>, <Cell '2015'.A3>, <Cell '2015'.A4>, <Cell '2015'.A5>, <Cell '2015'.A6>, <Cell '2015'.A7>, <Cell '2015'.A8>, <Cell '2015'.A9>, <Cell '2015'.A10>, <Cell '2015'.A11>, <Cell '2015'.A12>, <Cell '2015'.A13>, <Cell '2015'.A14>, <Cell '2015'.A15>, <Cell '2015'.A16>, <Cell '2015'.A17>, <Cell '2015'.A18>, <Cell '2015'.A19>, <Cell '2015'.A20>, <Cell '2015'.A21>, <Cell '2015'.A22>, <Cell '2015'.A23>, <Cell '2015'.A24>, <Cell '2015'.A25>, <Cell '2015'.A26>, <Cell '2015'.A27>, <Cell '2015'.A28>, <Cell '2015'.A29>, <Cell '2015'.A30>, <Cell '2015'.A31>, <Cell '2015'.A32>, <Cell '2015'.A33>, <Cell '2015'.A34>, <Cell '2015'.A35>, <Cell '2015'.A36>, <Cell '2015'.A37>, <Cell '2015'.A38>, <Cell '2015'.A39>, <Cell '2015'.A40>, <Cell '2015'.A41>, <Cell '2015'.A42>, <Cell '2015'.A43>, <Cell '2015'.A44>, <Cell '2015'.A45>, <Cell '2015'.A46>, <Cell '2015'.A47>, <Cell '2015'.A48>, <Cell '2015'.A49>, <Cell '2015'.A50>, <Cell '2015'.A51>, <Cell '2015'.A52>, <Cell '2015'.A53>, <

In [29]:
# Read rows 1-2, columns 1-11, of the "2015" sheet as plain values (values_only=True).
# NOTE: despite its name, row_dict is a list holding one tuple per row.
row_dict = list(
    excel_workbook["2015"].iter_rows(min_row=1, max_row=2, min_col=1, max_col=11, values_only=True)
)

print(row_dict)

[('OE-417 Electric Emergency and Disturbance Report - Calendar Year 2015', None, None, None, None, None, None, None, None, None, None), ('Month', 'Date Event Began', 'Time Event Began', 'Date of Restoration', 'Time of Restoration', 'Area Affected', 'NERC Region', 'Alert Criteria', 'Event Type', 'Demand Loss (MW)', 'Number of Customers Affected')]


In [30]:
# Walk the "2015" sheet row by row; each row is printed as a tuple of cell objects.
for row in excel_workbook["2015"].rows:
    print(row)

(<Cell '2015'.A1>, <MergedCell '2015'.B1>, <MergedCell '2015'.C1>, <MergedCell '2015'.D1>, <MergedCell '2015'.E1>, <MergedCell '2015'.F1>, <MergedCell '2015'.G1>, <MergedCell '2015'.H1>, <MergedCell '2015'.I1>, <MergedCell '2015'.J1>, <MergedCell '2015'.K1>, <Cell '2015'.L1>)
(<Cell '2015'.A2>, <Cell '2015'.B2>, <Cell '2015'.C2>, <Cell '2015'.D2>, <Cell '2015'.E2>, <Cell '2015'.F2>, <Cell '2015'.G2>, <Cell '2015'.H2>, <Cell '2015'.I2>, <Cell '2015'.J2>, <Cell '2015'.K2>, <Cell '2015'.L2>)
(<Cell '2015'.A3>, <Cell '2015'.B3>, <Cell '2015'.C3>, <Cell '2015'.D3>, <Cell '2015'.E3>, <Cell '2015'.F3>, <Cell '2015'.G3>, <Cell '2015'.H3>, <Cell '2015'.I3>, <Cell '2015'.J3>, <Cell '2015'.K3>, <Cell '2015'.L3>)
(<Cell '2015'.A4>, <Cell '2015'.B4>, <Cell '2015'.C4>, <Cell '2015'.D4>, <Cell '2015'.E4>, <Cell '2015'.F4>, <Cell '2015'.G4>, <Cell '2015'.H4>, <Cell '2015'.I4>, <Cell '2015'.J4>, <Cell '2015'.K4>, <Cell '2015'.L4>)
(<Cell '2015'.A5>, <Cell '2015'.B5>, <Cell '2015'.C5>, <Cell '2015'.D5>,

In [31]:
# Print every row of `disturbances` (each one is a Series) to inspect the raw layout.
# NOTE(review): `disturbances` is not defined in any visible cell — TODO confirm where it is loaded.
for index,row in disturbances.iterrows():
    print(row)

Table B.2.                                          NaN
Major Disturbances and Unusual Occurrences, 2002    NaN
Unnamed: 2                                          NaN
Unnamed: 3                                          NaN
Unnamed: 4                                          NaN
Unnamed: 5                                          NaN
Unnamed: 6                                          NaN
Unnamed: 7                                          NaN
Name: 0, dtype: object
Table B.2.                                                                  Date
Major Disturbances and Unusual Occurrences, 2002                     NERC Region
Unnamed: 2                                                                  Time
Unnamed: 3                                                                  Area
Unnamed: 4                                                   Type of Disturbance
Unnamed: 5                                                      Loss (megawatts)
Unnamed: 6                                 

In [11]:
#Step 1: Load workbook so that we can read the data from the excel sheet with openpyxl. Retrieve the sheet names.

# FYI There are additional reading options to keep in mind: read_only loads a spreadsheet in read-only mode allowing you to open very large Excel files.
# data_only ignores loading formulas and instead loads only the resulting values.

# NOTE: this rebinds excel_workbook, which an earlier cell already loaded from the same file
# via openpyxl.load_workbook.
from openpyxl import load_workbook
excel_workbook = load_workbook(filename='DOE_Electric_Disturbance_Events.xlsx')
# Names of all sheets in the workbook.
excel_sheetnames = excel_workbook.sheetnames
# The workbook's active sheet. NOTE: the name `sheet` is also used as a loop variable in
# earlier cells of this notebook.
sheet = excel_workbook.active

1000


In [None]:
# NOTE: I have lost some of the merged cell data from the area column of 2002 sheet, to revisit and look at data dictionary for better understanding
