# Analysis of poweroutages

**Name(s)**: (your name(s) here)

**Website Link**: (your website link)

In [58]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

# from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

## Step 2: Data Cleaning and Exploratory Data Analysis

### Data cleaning

In [59]:
data = pd.read_excel('data/outage.xlsx', skiprows=5)

In [60]:
data = data.drop(columns = 'variables')
data = data[1:]

In [61]:
data.set_index('OBS', inplace=True)

In [62]:
columns_to_keep = [
    'YEAR',
    'MONTH',
    'U.S._STATE',
    'NERC.REGION',
    'ANOMALY.LEVEL',
    'CLIMATE.CATEGORY',
    'OUTAGE.START.DATE',
    'OUTAGE.START.TIME',
    'OUTAGE.RESTORATION.DATE',
    'OUTAGE.RESTORATION.TIME',
    'CAUSE.CATEGORY',
    'CAUSE.CATEGORY.DETAIL',
    'OUTAGE.DURATION',
    'DEMAND.LOSS.MW',
    'CUSTOMERS.AFFECTED',
    'RES.CUSTOMERS',
    'COM.CUSTOMERS',
    'IND.CUSTOMERS',
    'TOTAL.CUSTOMERS'
]

Dropping irrelevant rows

In [63]:
data = data[columns_to_keep]

Combining Outage.start.date and outage.start.time, same thing with outage.restoration

In [64]:
# Combine 'OUTAGE.START.DATE' and 'OUTAGE.START.TIME' into a single 'OUTAGE.START' column
data['OUTAGE.START'] = pd.to_datetime(data['OUTAGE.START.DATE']) + pd.to_timedelta(data['OUTAGE.START.TIME'].astype(str))

# Combine 'OUTAGE.RESTORATION.DATE' and 'OUTAGE.RESTORATION.TIME' into a single 'OUTAGE.RESTORATION' column
data['OUTAGE.RESTORATION'] = pd.to_datetime(data['OUTAGE.RESTORATION.DATE']) + pd.to_timedelta(data['OUTAGE.RESTORATION.TIME'].astype(str))

# Display the first few rows of the relevant columns to verify the results
data[['OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.START', 'OUTAGE.RESTORATION.DATE', 'OUTAGE.RESTORATION.TIME', 'OUTAGE.RESTORATION']].head()

Unnamed: 0_level_0,OUTAGE.START.DATE,OUTAGE.START.TIME,OUTAGE.START,OUTAGE.RESTORATION.DATE,OUTAGE.RESTORATION.TIME,OUTAGE.RESTORATION
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,2011-07-01 00:00:00,17:00:00,2011-07-01 17:00:00,2011-07-03 00:00:00,20:00:00,2011-07-03 20:00:00
2.0,2014-05-11 00:00:00,18:38:00,2014-05-11 18:38:00,2014-05-11 00:00:00,18:39:00,2014-05-11 18:39:00
3.0,2010-10-26 00:00:00,20:00:00,2010-10-26 20:00:00,2010-10-28 00:00:00,22:00:00,2010-10-28 22:00:00
4.0,2012-06-19 00:00:00,04:30:00,2012-06-19 04:30:00,2012-06-20 00:00:00,23:00:00,2012-06-20 23:00:00
5.0,2015-07-18 00:00:00,02:00:00,2015-07-18 02:00:00,2015-07-19 00:00:00,07:00:00,2015-07-19 07:00:00


In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1534 entries, 1.0 to 1534.0
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   YEAR                     1534 non-null   float64       
 1   MONTH                    1525 non-null   float64       
 2   U.S._STATE               1534 non-null   object        
 3   NERC.REGION              1534 non-null   object        
 4   ANOMALY.LEVEL            1525 non-null   object        
 5   CLIMATE.CATEGORY         1525 non-null   object        
 6   OUTAGE.START.DATE        1525 non-null   object        
 7   OUTAGE.START.TIME        1525 non-null   object        
 8   OUTAGE.RESTORATION.DATE  1476 non-null   object        
 9   OUTAGE.RESTORATION.TIME  1476 non-null   object        
 10  CAUSE.CATEGORY           1534 non-null   object        
 11  CAUSE.CATEGORY.DETAIL    1063 non-null   object        
 12  OUTAGE.DURATION          1476 non-n

## Exploratory Data Analysis

### Univariate Analysis

In [83]:
cause_counts = data['CAUSE.CATEGORY'].value_counts().reset_index()
cause_counts.columns = ['CAUSE.CATEGORY', 'count']


In [84]:
fig = px.bar(
    cause_counts,
    x='CAUSE.CATEGORY',
    y='count',
    title='Count of Outages by Cause Category',
    labels={'CAUSE.CATEGORY': 'Cause Category', 'count': 'Number of Outages'}
)

# Show the plot
fig.show()

In [86]:
# Count the occurrences of each U.S. state in 'U.S._STATE'
state_counts = data['U.S._STATE'].value_counts().reset_index()

# Rename the columns for clarity
state_counts.columns = ['U.S._STATE', 'count']
# Create a bar plot
fig = px.bar(
    state_counts,
    x='U.S._STATE',
    y='count',
    title='Count of Outages by U.S. State',
    labels={'U.S._STATE': 'U.S. State', 'count': 'Number of Outages'}
)

# Show the plot
fig.show()

## Bivariate analysis

Purpose: Understand how outage duration varies by the cause of the outage. This helps identify if certain causes (like severe weather) are linked to longer outages.

In [88]:
fig = px.box(
    data,
    x='CAUSE.CATEGORY',
    y='CUSTOMERS.AFFECTED',
    title='Customers Affected by Cause Category',
    labels={'CAUSE.CATEGORY': 'Cause Category', 'CUSTOMERS.AFFECTED': 'Customers Affected'}
)

fig.show()

In [113]:
import folium
import geopandas as gpd
from folium.plugins import HeatMap
from geopy.geocoders import Nominatim  # Import Nominatim from geopy
from folium.plugins import TimestampedGeoJson
import time  # To add delays





Customer's affected by state

In [119]:
# URL to a GeoJSON file for U.S. state boundaries
url = 'https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json'

# Create a map centered on the U.S.
m = folium.Map(location=[37.8, -96], zoom_start=4)

# Aggregate total customers affected by state
state_customer_impact = data.groupby('U.S._STATE')['CUSTOMERS.AFFECTED'].sum().reset_index()

# Add a Choropleth layer to visualize the customers affected by state
folium.Choropleth(
    geo_data=url,
    data=state_customer_impact,
    columns=['U.S._STATE', 'CUSTOMERS.AFFECTED'],
    key_on='feature.properties.name',  # Matches GeoJSON 'name' property with 'U.S._STATE'
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Customers Affected by State'
).add_to(m)

# Display the map
m

Choropleth Map of Outage Duration by State
Purpose: Show which states experience the longest outages on average.



In [123]:
# URL to GeoJSON file for U.S. states
url = 'https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json'

# Calculate the average outage duration for each state
state_duration = data.groupby('U.S._STATE')['OUTAGE.DURATION'].mean().reset_index()

# Create the map centered on the U.S.
m = folium.Map(location=[37.8, -96], zoom_start=4)

# Add a Choropleth layer to visualize average outage duration
folium.Choropleth(
    geo_data=url,
    data=state_duration,
    columns=['U.S._STATE', 'OUTAGE.DURATION'],
    key_on='feature.properties.name',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Average Outage Duration by State (hours)'
).add_to(m)

# Display the map
m

## Interesting Aggregates

Purpose: See which outage causes affect the most customers in each U.S. state.

In [131]:
pivot_customers = data.pivot_table(
    index='U.S._STATE',
    columns='CAUSE.CATEGORY',
    values='CUSTOMERS.AFFECTED',
    aggfunc='sum'
)

# View the table
display(pivot_customers)

CAUSE.CATEGORY,equipment failure,fuel supply emergency,intentional attack,islanding,public appeal,severe weather,system operability disruption
U.S._STATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama,,,0.0,,,471644.0,
Alaska,14273.0,,,,,,
Arizona,167000.0,,2713.0,,,180911.0,229000.0
Arkansas,0.0,,9200.0,0.0,54094.0,556466.0,
California,1390257.0,0.0,127920.0,131019.0,0.0,20579360.0,3344890.0
Colorado,,0.0,0.0,35230.0,,355058.0,61379.0
Connecticut,,,0.0,,,784410.0,
Delaware,18400.0,,0.0,,,65000.0,0.0
District of Columbia,52000.0,,,,,1700383.0,
Florida,690101.0,,0.0,,0.0,11567578.0,474561.0


## Step 3: Assessment of Missingness

In [133]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1534 entries, 1.0 to 1534.0
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   YEAR                     1534 non-null   float64       
 1   MONTH                    1525 non-null   float64       
 2   U.S._STATE               1534 non-null   object        
 3   NERC.REGION              1534 non-null   object        
 4   ANOMALY.LEVEL            1525 non-null   object        
 5   CLIMATE.CATEGORY         1525 non-null   object        
 6   OUTAGE.START.DATE        1525 non-null   object        
 7   OUTAGE.START.TIME        1525 non-null   object        
 8   OUTAGE.RESTORATION.DATE  1476 non-null   object        
 9   OUTAGE.RESTORATION.TIME  1476 non-null   object        
 10  CAUSE.CATEGORY           1534 non-null   object        
 11  CAUSE.CATEGORY.DETAIL    1063 non-null   object        
 12  OUTAGE.DURATION          1476 non-n

## NMAR

NMAR: DEMAND.LOSS.MW: This column indicates the megawatt demand loss during an outage. The missingness could be NMAR if the probability of missing data depends on the magnitude of the demand loss itself. For example, minor outages might not have detailed demand loss recorded, leading to missing values. Alternatively, in significant outages, the complexity of measuring demand loss might result in missing data.

## Missingness Dependancy

In [137]:
# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    fig = ff.create_distplot(
        hist_data=[df.loc[df[group_col] == group1, vals_col], df.loc[df[group_col] == group2, vals_col]],
        group_labels=[group1, group2],
        show_rug=False, show_hist=False,
        colors=['#ef553b', '#636efb'],
    )
    return fig.update_layout(title=title)

In [138]:
create_kde_plotly(data, 'CAUSE.CATEGORY', True, False, 'NERC.REGION', title='KDE Plot of Customers Affected by NERC Region')

NameError: name 'ff' is not defined

## Step 4: Hypothesis Testing

In [69]:
# TODO

## Step 5: Framing a Prediction Problem

In [70]:
# TODO

## Step 6: Baseline Model

In [71]:
# TODO

## Step 7: Final Model

In [72]:
# TODO

## Step 8: Fairness Analysis

In [73]:
# TODO