In [None]:
#looking at outliers in the data. Checking for the negative durations noticed in Tableau. Will investigate the event start and end date
#to ensure that we get positive duration times
#Want to look at the possible correlations or trends to see if we can model predictions for outages to better prepare.

In [1]:
#Import libraries for EDA and data

import numpy as np
import pandas as pd

outage_df=pd.read_csv('cleaned_outagedata.csv')

In [2]:
outage_df.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration
0,Severe Weather - Thunderstorms,2014,2014-06-30,20:00:00,2014-07-02,18:30:00,Exelon Corporation/ComEd,Illinois,RFC,Unknown,420000,"severe weather, thunderstorm",2014-06-30 20:00:00,2014-07-02 18:30:00,1 days 22:30:00
1,Severe Weather - Thunderstorms,2014,2014-06-30,23:20:00,2014-07-01,17:00:00,Northern Indiana Public Service Company,North Central Indiana,RFC,Unknown,127000,"severe weather, thunderstorm",2014-06-30 23:20:00,2014-07-01 17:00:00,0 days 17:40:00
2,Severe Weather - Thunderstorms,2014,2014-06-30,17:55:00,2014-07-01,02:53:00,We Energies,Southeast Wisconsin,MRO,424,120000,"severe weather, thunderstorm",2014-06-30 17:55:00,2014-07-01 02:53:00,0 days 08:58:00
3,Physical Attack - Vandalism,2014,2014-06-24,14:54:00,2014-06-24,14:55:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-24 14:54:00,2014-06-24 14:55:00,0 days 00:01:00
4,Physical Attack - Vandalism,2014,2014-06-19,08:47:00,2014-06-19,08:48:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-19 08:47:00,2014-06-19 08:48:00,0 days 00:01:00


In [3]:
print(outage_df.dtypes)

Event Description               object
Year                             int64
Date Event Began                object
Time Event Began                object
Date of Restoration             object
Time of Restoration             object
Respondent                      object
Geographic Areas                object
NERC Region                     object
Demand Loss (MW)                object
Number of Customers Affected    object
Tags                            object
Event Start                     object
Event End                       object
Duration                        object
dtype: object


In [4]:
#check that date event began is less than or equal to the date of restoration
outage_df['Date Event Began']=pd.to_datetime(outage_df['Date Event Began'], errors='coerce')
outage_df['Time Event Began'] = pd.to_datetime(outage_df['Time Event Began'], format='%H:%M:%S', errors='coerce').dt.time

outage_df['Date of Restoration']=pd.to_datetime(outage_df['Date of Restoration'], errors='coerce')
outage_df['Time of Restoration'] = pd.to_datetime(outage_df['Time of Restoration'], format='%H:%M:%S', errors='coerce').dt.time


In [5]:
outage_df['check']=outage_df['Date Event Began']>outage_df['Date of Restoration']

outage_df.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,check
0,Severe Weather - Thunderstorms,2014,2014-06-30,20:00:00,2014-07-02,18:30:00,Exelon Corporation/ComEd,Illinois,RFC,Unknown,420000,"severe weather, thunderstorm",2014-06-30 20:00:00,2014-07-02 18:30:00,1 days 22:30:00,False
1,Severe Weather - Thunderstorms,2014,2014-06-30,23:20:00,2014-07-01,17:00:00,Northern Indiana Public Service Company,North Central Indiana,RFC,Unknown,127000,"severe weather, thunderstorm",2014-06-30 23:20:00,2014-07-01 17:00:00,0 days 17:40:00,False
2,Severe Weather - Thunderstorms,2014,2014-06-30,17:55:00,2014-07-01,02:53:00,We Energies,Southeast Wisconsin,MRO,424,120000,"severe weather, thunderstorm",2014-06-30 17:55:00,2014-07-01 02:53:00,0 days 08:58:00,False
3,Physical Attack - Vandalism,2014,2014-06-24,14:54:00,2014-06-24,14:55:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-24 14:54:00,2014-06-24 14:55:00,0 days 00:01:00,False
4,Physical Attack - Vandalism,2014,2014-06-19,08:47:00,2014-06-19,08:48:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-19 08:47:00,2014-06-19 08:48:00,0 days 00:01:00,False


In [6]:
#pull the true rows to update one of the dates to correct the duration and mark as 'Needs Review'
outage_df.loc[outage_df['check'], 'status'] = 'Needs Review'

rows_to_update=outage_df[outage_df['check']]

In [7]:
rows_to_update.head(7)

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,check,status
755,Fuel Supply Deficiency,2011,2011-03-08,08:00:00,2001-03-18,09:00:00,AES Somerset LLC,Western New York,NPCC,676,UNK,"fuel supply emergency, coal",2011-03-08 08:00:00,2001-03-18 09:00:00,-3642 days +01:00:00,True,Needs Review
797,Fuel Supply Deficiency,2010,2010-12-30,14:00:00,2010-01-12,06:00:00,AES Greenidge and Cayuga,New York,RFC,300,,fuel supply emergency,2010-12-30 14:00:00,2010-01-12 06:00:00,-353 days +16:00:00,True,Needs Review
1020,Wind Storm,2008,2008-12-27,16:00:00,2008-01-01,23:30:00,Detroit Edison Company-DTE,Southeastern Michigan,RFC,,247847,"severe weather, wind",2008-12-27 16:00:00,2008-01-01 23:30:00,-361 days +07:30:00,True,Needs Review
1244,Severe Weather,2006,2006-12-30,22:25:00,2006-01-06,14:25:00,Nebraska Public Power District,"Gosper, Harlan, Franklin, Webster, Clay, Adams...",MRO,300-500,15000,severe weather,2006-12-30 22:25:00,2006-01-06 14:25:00,-359 days +16:00:00,True,Needs Review
1590,Vandalism/Insulators,2002,2002-04-08,15:00:00,2002-03-09,12:00:00,Arizona Public Service Co,Arizona,WECC,0,0,vandalism,2002-04-08 15:00:00,2002-03-09 12:00:00,-31 days +21:00:00,True,Needs Review
1614,Severe Weather,2000,2000-08-09,18:30:00,2000-08-07,23:59:00,Cinergy Corp,Ohio,ECAR,,92000,severe weather,2000-08-09 18:30:00,2000-08-07 23:59:00,-2 days +05:29:00,True,Needs Review


In [8]:
#updates based on logic and Google searchs
outage_df.loc[755, 'Date of Restoration'] = '2011-03-18'
outage_df.loc[797, 'Date of Restoration'] = '2011-01-12'
outage_df.loc[1020, 'Date of Restoration'] = '2009-01-01'
outage_df.loc[1244, 'Date of Restoration'] = '2007-01-06'
outage_df.loc[1590, 'Date of Restoration'] = '2002-04-08'
outage_df.loc[1590, 'Date Event Began'] = '2002-04-08'
outage_df.loc[1614, 'Date of Restoration'] = '2000-08-09'
outage_df.loc[1614, 'Date Event Began'] = '2000-08-07'

In [9]:
outage_df['check']=outage_df['Date Event Began']>outage_df['Date of Restoration']

outage_df.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,check,status
0,Severe Weather - Thunderstorms,2014,2014-06-30,20:00:00,2014-07-02,18:30:00,Exelon Corporation/ComEd,Illinois,RFC,Unknown,420000,"severe weather, thunderstorm",2014-06-30 20:00:00,2014-07-02 18:30:00,1 days 22:30:00,False,
1,Severe Weather - Thunderstorms,2014,2014-06-30,23:20:00,2014-07-01,17:00:00,Northern Indiana Public Service Company,North Central Indiana,RFC,Unknown,127000,"severe weather, thunderstorm",2014-06-30 23:20:00,2014-07-01 17:00:00,0 days 17:40:00,False,
2,Severe Weather - Thunderstorms,2014,2014-06-30,17:55:00,2014-07-01,02:53:00,We Energies,Southeast Wisconsin,MRO,424,120000,"severe weather, thunderstorm",2014-06-30 17:55:00,2014-07-01 02:53:00,0 days 08:58:00,False,
3,Physical Attack - Vandalism,2014,2014-06-24,14:54:00,2014-06-24,14:55:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-24 14:54:00,2014-06-24 14:55:00,0 days 00:01:00,False,
4,Physical Attack - Vandalism,2014,2014-06-19,08:47:00,2014-06-19,08:48:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-19 08:47:00,2014-06-19 08:48:00,0 days 00:01:00,False,


In [10]:
outage_df['check'].count()

1627

In [11]:
rows_to_update2=outage_df[outage_df['check']]

In [12]:
rows_to_update2.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,check,status


In [13]:
outage_df=outage_df.drop(columns=['check', 'status'])

In [14]:
#now updating the Event Start and Event End
print(outage_df.loc[755])

Event Description                    Fuel Supply Deficiency
Year                                                   2011
Date Event Began                        2011-03-08 00:00:00
Time Event Began                                   08:00:00
Date of Restoration                     2011-03-18 00:00:00
Time of Restoration                                09:00:00
Respondent                                 AES Somerset LLC
Geographic Areas                           Western New York
NERC Region                                            NPCC
Demand Loss (MW)                                        676
Number of Customers Affected                            UNK
Tags                            fuel supply emergency, coal
Event Start                             2011-03-08 08:00:00
Event End                               2001-03-18 09:00:00
Duration                               -3642 days +01:00:00
Name: 755, dtype: object


In [15]:
# update the Event Start column and the Event End column
outage_df['Event Start'] = outage_df.apply(lambda row: pd.Timestamp.combine(row['Date Event Began'], row['Time Event Began']) 
                              if pd.notna(row['Date Event Began']) and pd.notna(row['Time Event Began']) else pd.NaT, axis=1)

outage_df['Event End'] = outage_df.apply(lambda row: pd.Timestamp.combine(row['Date of Restoration'], row['Time of Restoration']) 
                            if pd.notna(row['Date of Restoration']) and pd.notna(row['Time of Restoration']) else pd.NaT, axis=1)


In [16]:
print(outage_df.loc[755])

Event Description                    Fuel Supply Deficiency
Year                                                   2011
Date Event Began                        2011-03-08 00:00:00
Time Event Began                                   08:00:00
Date of Restoration                     2011-03-18 00:00:00
Time of Restoration                                09:00:00
Respondent                                 AES Somerset LLC
Geographic Areas                           Western New York
NERC Region                                            NPCC
Demand Loss (MW)                                        676
Number of Customers Affected                            UNK
Tags                            fuel supply emergency, coal
Event Start                             2011-03-08 08:00:00
Event End                               2011-03-18 09:00:00
Duration                               -3642 days +01:00:00
Name: 755, dtype: object


In [17]:
#update duration column
outage_df['Duration'] = outage_df['Event End'] - outage_df['Event Start']

In [18]:
print(outage_df.loc[755])

Event Description                    Fuel Supply Deficiency
Year                                                   2011
Date Event Began                        2011-03-08 00:00:00
Time Event Began                                   08:00:00
Date of Restoration                     2011-03-18 00:00:00
Time of Restoration                                09:00:00
Respondent                                 AES Somerset LLC
Geographic Areas                           Western New York
NERC Region                                            NPCC
Demand Loss (MW)                                        676
Number of Customers Affected                            UNK
Tags                            fuel supply emergency, coal
Event Start                             2011-03-08 08:00:00
Event End                               2011-03-18 09:00:00
Duration                                   10 days 01:00:00
Name: 755, dtype: object


In [19]:
# Check if any duration is negative
outage_df['Is Negative'] = outage_df['Duration'] < pd.Timedelta(0)

In [20]:
outage_df.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,Is Negative
0,Severe Weather - Thunderstorms,2014,2014-06-30,20:00:00,2014-07-02,18:30:00,Exelon Corporation/ComEd,Illinois,RFC,Unknown,420000,"severe weather, thunderstorm",2014-06-30 20:00:00,2014-07-02 18:30:00,1 days 22:30:00,False
1,Severe Weather - Thunderstorms,2014,2014-06-30,23:20:00,2014-07-01,17:00:00,Northern Indiana Public Service Company,North Central Indiana,RFC,Unknown,127000,"severe weather, thunderstorm",2014-06-30 23:20:00,2014-07-01 17:00:00,0 days 17:40:00,False
2,Severe Weather - Thunderstorms,2014,2014-06-30,17:55:00,2014-07-01,02:53:00,We Energies,Southeast Wisconsin,MRO,424,120000,"severe weather, thunderstorm",2014-06-30 17:55:00,2014-07-01 02:53:00,0 days 08:58:00,False
3,Physical Attack - Vandalism,2014,2014-06-24,14:54:00,2014-06-24,14:55:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-24 14:54:00,2014-06-24 14:55:00,0 days 00:01:00,False
4,Physical Attack - Vandalism,2014,2014-06-19,08:47:00,2014-06-19,08:48:00,Tennessee Valley Authority,"Nashville, Tennessee",SERC,Unknown,Unknown,"vandalism, physical",2014-06-19 08:47:00,2014-06-19 08:48:00,0 days 00:01:00,False


In [21]:
follow_up=outage_df[outage_df['Is Negative'] == True]

follow_up.head(18)

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,Is Negative
30,Severe Weather - Thunderstorms,2014,2014-04-29,23:30:00,2014-04-29,12:30:00,Southern Company,"Mississippi, Alabama",SERC,355,106648,"severe weather, thunderstorm",2014-04-29 23:30:00,2014-04-29 12:30:00,-1 days +13:00:00,True
833,Severe Weather,2010,2010-08-11,15:21:00,2010-08-11,12:12:00,American Electric Power (AEP),Ohio,RFC,,57000,severe weather,2010-08-11 15:21:00,2010-08-11 12:12:00,-1 days +20:51:00,True
935,Made Public Appeals,2009,2009-08-31,10:31:00,2009-08-31,00:00:00,Los Angeles Department of Water and Power,"City of Los Angeles, California",WECC,,,public appeal,2009-08-31 10:31:00,2009-08-31 00:00:00,-1 days +13:29:00,True
939,Thunderstorms,2009,2009-08-12,18:25:00,2009-08-12,10:00:00,CenterPoint Energy,South Houston Service Area,TRE,491,73000,"severe weather, thunderstorm",2009-08-12 18:25:00,2009-08-12 10:00:00,-1 days +15:35:00,True
1036,Shed Firm Load,2008,2008-11-11,08:30:00,2008-11-11,00:19:00,Puerto Rico Electric Power Authority,Island of Puerto Rico,PR,250,261000,load shedding,2008-11-11 08:30:00,2008-11-11 00:19:00,-1 days +15:49:00,True
1040,Load Shedding,2008,2008-10-02,14:50:00,2008-10-02,09:50:00,Dow Chemical Co,Louisiana,SERC,200,0,load shedding,2008-10-02 14:50:00,2008-10-02 09:50:00,-1 days +19:00:00,True
1042,Shed Firm Load,2008,2008-09-22,17:49:00,2008-09-22,06:39:00,Puerto Rico Electric Power Authority,Island of Puerto Rico,PR,125,43600,load shedding,2008-09-22 17:49:00,2008-09-22 06:39:00,-1 days +12:50:00,True
1131,Load Shedding,2008,2008-05-08,10:21:00,2008-05-08,00:56:00,California ISO,California,WECC,483,0,load shedding,2008-05-08 10:21:00,2008-05-08 00:56:00,-1 days +14:35:00,True
1183,Electrical System Separation/Severe Storms,2007,2007-09-18,05:15:00,2007-09-18,00:00:00,Midwest ISO,"Manitoba, Minnesota, North Dakota, Portions of...",RFC,"8,000-10,000",11175,"severe weather, storm, islanding, load shedding",2007-09-18 05:15:00,2007-09-18 00:00:00,-1 days +18:45:00,True
1535,Hurricane Isabel,2003,2003-09-18,11:45:00,2003-09-18,00:00:00,Carolina Power & Light,Eastern North Carolina,SERC,peak 1655,"peak 320,00 9/18/03 7:00 p.m.","severe weather, hurricane/tropical storm",2003-09-18 11:45:00,2003-09-18 00:00:00,-1 days +12:15:00,True


In [23]:
#dropping negative rows as we did not get good data for those events. It is a very low amount of the data
indices_to_drop= [30,833,935,939,1036,1040,1042,1131,1183,1535,1562,1583,1586,1587,1588,1589,1590,1611]
outage_df=outage_df.drop(indices_to_drop)

In [24]:
follow_up2=outage_df[outage_df['Is Negative'] == True]

follow_up2.head()

Unnamed: 0,Event Description,Year,Date Event Began,Time Event Began,Date of Restoration,Time of Restoration,Respondent,Geographic Areas,NERC Region,Demand Loss (MW),Number of Customers Affected,Tags,Event Start,Event End,Duration,Is Negative


In [25]:
# Save the updated dataframe to a new CSV file
outage_df.to_csv('NoNeg_outagedata.csv', index=False)