# Your Title Here

**Name(s)**: (your name(s) here)

**Website Link**: (your website link)

## Code

In [2]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
# Route pandas' DataFrame/Series .plot() calls through plotly instead of the default backend.
pd.options.plotting.backend = 'plotly'

### Cleaning and EDA

In [3]:
# Load the raw outage table: the first five lines of the file are skipped and
# the 'OBS' column becomes the index.
rows_to_skip = list(range(5))
df = pd.read_csv('outage.csv', skiprows=rows_to_skip, index_col='OBS')

# The first data row holds each column's unit. Fold it into the header as
# "NAME(unit)"; a column whose unit cell is empty keeps its bare name.
# (Plain string formatting replaces np.core.defchararray.add, which lives in a
# private NumPy namespace.)
units = df.iloc[0].fillna('').astype(str)
combined_column = [
    f"{name}({unit})" if unit else str(name)
    for name, unit in zip(df.columns, units)
]
df.columns = combined_column

# Drop the units row (label 0 after reset_index) and the 'variables(Units)'
# column, then renumber the remaining rows from 0.
df = (
    df.reset_index()
    .drop(index=0)
    .drop(columns='variables(Units)')
    .reset_index(drop=True)
)

In [4]:
def _to_timestamp(date_text, time_text):
    """Join a date-string Series and a time-string Series with a space and parse the result with pd.to_datetime."""
    return pd.to_datetime(date_text + " " + time_text)


df['YEAR'] = df['YEAR'].astype('int')

# 'OUTAGE.START': start date and start time merged into one datetime column.
start_date = df['OUTAGE.START.DATE(Day of the week, Month Day, Year)']
start_time = df['OUTAGE.START.TIME(Hour:Minute:Second (AM / PM))']
df["OUTAGE.START"] = _to_timestamp(start_date, start_time)

# 'OUTAGE.RESTORATION': restoration date and time merged the same way.
end_date = df['OUTAGE.RESTORATION.DATE(Day of the week, Month Day, Year)']
end_time = df['OUTAGE.RESTORATION.TIME(Hour:Minute:Second (AM / PM))']
df["OUTAGE.RESTORATION"] = _to_timestamp(end_date, end_time)

# Show the two new columns side by side as a sanity check.
df[['OUTAGE.RESTORATION', 'OUTAGE.START']]

Unnamed: 0,OUTAGE.RESTORATION,OUTAGE.START
0,2011-07-03 20:00:00,2011-07-01 17:00:00
1,2014-05-11 18:39:00,2014-05-11 18:38:00
2,2010-10-28 22:00:00,2010-10-26 20:00:00
3,2012-06-20 23:00:00,2012-06-19 04:30:00
4,2015-07-19 07:00:00,2015-07-18 02:00:00
...,...,...
1529,2011-12-06 20:00:00,2011-12-06 08:00:00
1530,NaT,NaT
1531,2009-08-29 23:53:00,2009-08-29 22:54:00
1532,2009-08-29 14:01:00,2009-08-29 11:00:00


In [8]:
# Display the HURRICANE.NAMES column to see how its values (and NaNs) look.
df['HURRICANE.NAMES']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1529    NaN
1530    NaN
1531    NaN
1532    NaN
1533    NaN
Name: HURRICANE.NAMES, Length: 1534, dtype: object

### Assessment of Missingness

## NMAR Analysis
Not Missing At Random (NMAR) means that the probability of a value being missing depends on the unobserved value itself. In this dataset, the `HURRICANE.NAMES` column records the name of the hurricane responsible for a given power outage. However, not every power outage can be attributed to a hurricane; some outages result from facility maintenance or from other natural disasters.
Consequently, if a particular power outage was not caused by a hurricane, it is reasonable for the corresponding `HURRICANE.NAMES` cell to be missing. Incorporating an additional column, such as the amount of rainfall recorded during the corresponding time period, could provide further context and help explain the missingness. For instance, if the rainfall amount is significantly higher than usual, it is more likely that a hurricane is responsible for the power outage, and thus more likely that the hurricane name is populated.

In [67]:
# Does missingness of customers affected depend on the cause category?

# Split the outages by whether CUSTOMERS.AFFECTED was recorded.
ca_is_missing = df['CUSTOMERS.AFFECTED'].isna()
df_ca_missing = df[ca_is_missing]
df_ca_not_missing = df[~ca_is_missing]

# Outages per cause within each group (counts the non-null YEAR entries).
missing_count = df_ca_missing.groupby('CAUSE.CATEGORY')['YEAR'].count()
not_missing_count = df_ca_not_missing.groupby('CAUSE.CATEGORY')['YEAR'].count()

# Normalise the counts so each group becomes a distribution over causes.
missing_proportion = pd.Series(
    missing_count.to_numpy() / missing_count.to_numpy().sum(),
    index=missing_count.index,
)
not_missing_proportion = pd.Series(
    not_missing_count.to_numpy() / not_missing_count.to_numpy().sum(),
    index=not_missing_count.index,
)

# Observed statistic: the sum of absolute differences between the two
# distributions (i.e. twice the total variation distance).
# fill_value=0 matters: a cause that appears in only one group would otherwise
# align to NaN and be silently skipped by .sum().
observed_tvd_statistic = (
    not_missing_proportion.sub(missing_proportion, fill_value=0).abs().sum()
)

observed_dataframe = pd.concat(
    [not_missing_proportion, missing_proportion],
    axis=1,
    keys=['not_missing_proportion', 'missing_proportion'],
)
observed_dataframe.plot(kind='barh', title='Causes by missingess of user affected', barmode='group')



In [68]:
def cause_one_permutation(df):
    """Return the test statistic for one random shuffle of CUSTOMERS.AFFECTED.

    The values of 'CUSTOMERS.AFFECTED' are randomly permuted across rows, the
    rows are split by whether the shuffled value is missing, and the
    distribution of 'CAUSE.CATEGORY' is computed within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CUSTOMERS.AFFECTED' and 'CAUSE.CATEGORY'. It is not
        modified.

    Returns
    -------
    float
        Sum over causes of |P(cause | not missing) - P(cause | missing)|,
        i.e. twice the total variation distance between the two distributions.

    Notes
    -----
    Uses NumPy's global random state (np.random.permutation).
    """
    # Only the missing/not-missing pattern matters, so shuffle the column and
    # reduce it to a boolean mask instead of copying the whole frame.
    shuffled_is_missing = pd.isna(np.random.permutation(df['CUSTOMERS.AFFECTED']))
    causes = df['CAUSE.CATEGORY']

    missing_proportion = causes[shuffled_is_missing].value_counts(normalize=True)
    not_missing_proportion = causes[~shuffled_is_missing].value_counts(normalize=True)

    # fill_value=0: a cause that lands in only one group must contribute its
    # full proportion; plain subtraction would align it to NaN and .sum()
    # would silently drop it.
    return not_missing_proportion.sub(missing_proportion, fill_value=0).abs().sum()
# Permutation test: repeat the shuffle 1000 times to build the null
# distribution of the statistic.
N_PERMUTATIONS = 1000
np.random.seed(42)  # fixed seed so a fresh Restart & Run All reproduces the same figure
result = [cause_one_permutation(df) for _ in range(N_PERMUTATIONS)]

# Empirical p-value: share of shuffles at least as extreme as the observed statistic.
p_value = np.mean(np.array(result) >= observed_tvd_statistic)

# Histogram of the simulated statistics with the observed value marked in red.
fig = px.histogram(pd.DataFrame(result), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd_statistic, line_color='red')
fig.update_layout(xaxis_range=[0, 1.5])


Unnamed: 0,OBS,YEAR,MONTH,U.S._STATE,POSTAL.CODE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL(numeric),CLIMATE.CATEGORY,"OUTAGE.START.DATE(Day of the week, Month Day, Year)",...,POPDEN_URBAN(persons per square mile),POPDEN_UC(persons per square mile),POPDEN_RURAL(persons per square mile),AREAPCT_URBAN(%),AREAPCT_UC(%),PCT_LAND(%),PCT_WATER_TOT(%),PCT_WATER_INLAND(%),OUTAGE.START,OUTAGE.RESTORATION


### Hypothesis Testing

In [7]:
# TODO