## Data Source
The data here is too large to manage via Git.
It was downloaded from https://www.ncei.noaa.gov/pub/data/swdi/stormevents/

In [4]:
import os.path as path
import pandas as pd
import datetime as dt
import numpy as np
from pprint import pp


In [5]:
input_path = path.join("Resources_Output", "storm_data.csv")

df = pd.read_csv(input_path)

df.head()

Unnamed: 0,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,DAMAGE_PROPERTY,STATE,EVENT_TYPE,CZ_NAME,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON
0,1994-03-27 11:32:00,CST,1994-03-27 11:32:00,5000000.0,ALABAMA,Tornado,DEKALB,34.43,-85.98,34.47,-85.78
1,1994-05-15 19:30:00,CST,1994-05-15 19:30:00,0.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,
2,1994-06-26 22:20:00,CST,1994-06-26 22:20:00,500000.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,
3,1994-05-15 13:47:00,CST,1994-05-15 13:47:00,0.0,ALABAMA,Hail,TALLAPOOSA,,,,
4,1994-03-27 15:50:00,CST,1994-03-27 15:50:00,0.0,ALABAMA,Hail,TUSCALOOSA,,,,


# State Filtering/Cleanup
***

In [6]:
df['STATE'].unique()

array(['ALABAMA', 'ARKANSAS', 'LOUISIANA', 'MISSOURI', 'MINNESOTA',
       'MICHIGAN', 'NEBRASKA', 'MONTANA', 'NORTH CAROLINA', 'NEW YORK',
       'NORTH DAKOTA', 'OKLAHOMA', 'PENNSYLVANIA', 'MAINE', 'FLORIDA',
       'GEORGIA', 'TEXAS', 'ILLINOIS', 'MISSISSIPPI', 'MASSACHUSETTS',
       'MARYLAND', 'IOWA', 'INDIANA', 'SOUTH CAROLINA', 'NEW JERSEY',
       'KANSAS', 'OHIO', 'CONNECTICUT', 'TENNESSEE', 'SOUTH DAKOTA',
       'NEVADA', 'NEW HAMPSHIRE', 'COLORADO', 'KENTUCKY', 'ARIZONA',
       'CALIFORNIA', 'VERMONT', 'NEW MEXICO', 'IDAHO', 'OREGON',
       'RHODE ISLAND', 'DELAWARE', 'VIRGINIA', 'WYOMING', 'WISCONSIN',
       'WEST VIRGINIA', 'UTAH', 'PUERTO RICO', 'WASHINGTON', 'HAWAII',
       'Kentucky', 'DISTRICT OF COLUMBIA', 'ALASKA', 'GUAM',
       'VIRGIN ISLANDS', 'AMERICAN SAMOA', 'GULF OF MEXICO',
       'ATLANTIC SOUTH', 'LAKE HURON', 'LAKE MICHIGAN', 'LAKE ERIE',
       'E PACIFIC', 'ATLANTIC NORTH', 'LAKE ST CLAIR', 'LAKE ONTARIO',
       'HAWAII WATERS', 'ST LAWRENCE R', 

In [7]:
df['STATE'] = df['STATE'].str.upper()

df.head()

Unnamed: 0,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,DAMAGE_PROPERTY,STATE,EVENT_TYPE,CZ_NAME,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON
0,1994-03-27 11:32:00,CST,1994-03-27 11:32:00,5000000.0,ALABAMA,Tornado,DEKALB,34.43,-85.98,34.47,-85.78
1,1994-05-15 19:30:00,CST,1994-05-15 19:30:00,0.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,
2,1994-06-26 22:20:00,CST,1994-06-26 22:20:00,500000.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,
3,1994-05-15 13:47:00,CST,1994-05-15 13:47:00,0.0,ALABAMA,Hail,TALLAPOOSA,,,,
4,1994-03-27 15:50:00,CST,1994-03-27 15:50:00,0.0,ALABAMA,Hail,TUSCALOOSA,,,,


In [8]:
df['STATE'].unique()

array(['ALABAMA', 'ARKANSAS', 'LOUISIANA', 'MISSOURI', 'MINNESOTA',
       'MICHIGAN', 'NEBRASKA', 'MONTANA', 'NORTH CAROLINA', 'NEW YORK',
       'NORTH DAKOTA', 'OKLAHOMA', 'PENNSYLVANIA', 'MAINE', 'FLORIDA',
       'GEORGIA', 'TEXAS', 'ILLINOIS', 'MISSISSIPPI', 'MASSACHUSETTS',
       'MARYLAND', 'IOWA', 'INDIANA', 'SOUTH CAROLINA', 'NEW JERSEY',
       'KANSAS', 'OHIO', 'CONNECTICUT', 'TENNESSEE', 'SOUTH DAKOTA',
       'NEVADA', 'NEW HAMPSHIRE', 'COLORADO', 'KENTUCKY', 'ARIZONA',
       'CALIFORNIA', 'VERMONT', 'NEW MEXICO', 'IDAHO', 'OREGON',
       'RHODE ISLAND', 'DELAWARE', 'VIRGINIA', 'WYOMING', 'WISCONSIN',
       'WEST VIRGINIA', 'UTAH', 'PUERTO RICO', 'WASHINGTON', 'HAWAII',
       'DISTRICT OF COLUMBIA', 'ALASKA', 'GUAM', 'VIRGIN ISLANDS',
       'AMERICAN SAMOA', 'GULF OF MEXICO', 'ATLANTIC SOUTH', 'LAKE HURON',
       'LAKE MICHIGAN', 'LAKE ERIE', 'E PACIFIC', 'ATLANTIC NORTH',
       'LAKE ST CLAIR', 'LAKE ONTARIO', 'HAWAII WATERS', 'ST LAWRENCE R',
       'LAKE SUPERI

## Adding climate region data, dropping state data

In [9]:
# list of US states for filtering
# hawaii and alaska dropped
# we only want the lower 48 mainland US states for data consistency when training
us_states = [ "ALABAMA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", "DELAWARE", "FLORIDA", "GEORGIA", 
             "IDAHO", "ILLINOIS", "INDIANA", "IOWA", "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", 
             "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", 
             "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", 
             "SOUTH CAROLINA", "SOUTH DAKOTA", "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING"]

# dictionary of NCEI NOAA climate regions to add to our data
climate_region_dict = {
    "NORTHEAST":    ["CONNECTICUT", "DELAWARE", "MAINE", "MARYLAND", "MASSACHUSETTS", "NEW HAMPSHIRE", "NEW JERSEY", "NEW YORK", "PENNSYLVANIA", "RHODE ISLAND", "VERMONT"],
    "UPPER MIDWEST":["IOWA", "MICHIGAN", "MINNESOTA", "WISCONSIN"],
    "OHIO VALLEY":  ["ILLINOIS", "INDIANA", "KENTUCKY", "MISSOURI", "OHIO", "TENNESSEE", "WEST VIRGINIA"],
    "SOUTHEAST":    ["ALABAMA", "FLORIDA", "GEORGIA", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA"],
    "NORTHERN ROCKIES AND PLAINS": ["MONTANA", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA", "WYOMING"],
    "SOUTH":        ["ARKANSAS", "KANSAS", "LOUISIANA", "MISSISSIPPI", "OKLAHOMA", "TEXAS"],
    "SOUTHWEST":    ["ARIZONA", "COLORADO", "NEW MEXICO", "UTAH"],
    "NORTHWEST":    ["IDAHO", "OREGON", "WASHINGTON"],
    "WEST":         ["CALIFORNIA", "NEVADA"]
}

In [10]:
# dropping non-states
df = df[df['STATE'].isin(us_states)]

df['STATE'].unique()

array(['ALABAMA', 'ARKANSAS', 'LOUISIANA', 'MISSOURI', 'MINNESOTA',
       'MICHIGAN', 'NEBRASKA', 'MONTANA', 'NORTH CAROLINA', 'NEW YORK',
       'NORTH DAKOTA', 'OKLAHOMA', 'PENNSYLVANIA', 'MAINE', 'FLORIDA',
       'GEORGIA', 'TEXAS', 'ILLINOIS', 'MISSISSIPPI', 'MASSACHUSETTS',
       'MARYLAND', 'IOWA', 'INDIANA', 'SOUTH CAROLINA', 'NEW JERSEY',
       'KANSAS', 'OHIO', 'CONNECTICUT', 'TENNESSEE', 'SOUTH DAKOTA',
       'NEVADA', 'NEW HAMPSHIRE', 'COLORADO', 'KENTUCKY', 'ARIZONA',
       'CALIFORNIA', 'VERMONT', 'NEW MEXICO', 'IDAHO', 'OREGON',
       'RHODE ISLAND', 'DELAWARE', 'VIRGINIA', 'WYOMING', 'WISCONSIN',
       'WEST VIRGINIA', 'UTAH', 'WASHINGTON'], dtype=object)

In [11]:
df['STATE'].value_counts()

STATE
TEXAS             111252
KANSAS             66029
IOWA               51213
MISSOURI           50374
OKLAHOMA           49685
ILLINOIS           45492
NEBRASKA           44685
KENTUCKY           41009
NEW YORK           38450
VIRGINIA           38290
SOUTH DAKOTA       37361
MINNESOTA          37336
NORTH CAROLINA     36784
PENNSYLVANIA       36765
GEORGIA            36618
OHIO               35360
ARKANSAS           35259
COLORADO           33540
TENNESSEE          33054
ALABAMA            32509
MISSISSIPPI        31401
WISCONSIN          29860
INDIANA            28640
FLORIDA            27959
CALIFORNIA         27672
MONTANA            25673
MICHIGAN           24630
SOUTH CAROLINA     23816
LOUISIANA          22109
NORTH DAKOTA       21497
WYOMING            18235
NEW JERSEY         17427
WEST VIRGINIA      16441
NEW MEXICO         15188
MARYLAND           15059
ARIZONA            13559
MASSACHUSETTS      11914
UTAH                8446
VERMONT             8358
IDAHO              

In [12]:
formatted_region_dict = {}
for key in climate_region_dict:
    for item in climate_region_dict[key]:
        formatted_region_dict[item] = key

formatted_region_dict

{'CONNECTICUT': 'NORTHEAST',
 'DELAWARE': 'NORTHEAST',
 'MAINE': 'NORTHEAST',
 'MARYLAND': 'NORTHEAST',
 'MASSACHUSETTS': 'NORTHEAST',
 'NEW HAMPSHIRE': 'NORTHEAST',
 'NEW JERSEY': 'NORTHEAST',
 'NEW YORK': 'NORTHEAST',
 'PENNSYLVANIA': 'NORTHEAST',
 'RHODE ISLAND': 'NORTHEAST',
 'VERMONT': 'NORTHEAST',
 'IOWA': 'UPPER MIDWEST',
 'MICHIGAN': 'UPPER MIDWEST',
 'MINNESOTA': 'UPPER MIDWEST',
 'WISCONSIN': 'UPPER MIDWEST',
 'ILLINOIS': 'OHIO VALLEY',
 'INDIANA': 'OHIO VALLEY',
 'KENTUCKY': 'OHIO VALLEY',
 'MISSOURI': 'OHIO VALLEY',
 'OHIO': 'OHIO VALLEY',
 'TENNESSEE': 'OHIO VALLEY',
 'WEST VIRGINIA': 'OHIO VALLEY',
 'ALABAMA': 'SOUTHEAST',
 'FLORIDA': 'SOUTHEAST',
 'GEORGIA': 'SOUTHEAST',
 'NORTH CAROLINA': 'SOUTHEAST',
 'SOUTH CAROLINA': 'SOUTHEAST',
 'VIRGINIA': 'SOUTHEAST',
 'MONTANA': 'NORTHERN ROCKIES AND PLAINS',
 'NEBRASKA': 'NORTHERN ROCKIES AND PLAINS',
 'NORTH DAKOTA': 'NORTHERN ROCKIES AND PLAINS',
 'SOUTH DAKOTA': 'NORTHERN ROCKIES AND PLAINS',
 'WYOMING': 'NORTHERN ROCKIES AN

In [13]:
# key = region classification
# value = list of states
# this is probably not a very efficient way to do this
df['REGION'] = df['STATE'].map(formatted_region_dict)

df.head()

Unnamed: 0,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,DAMAGE_PROPERTY,STATE,EVENT_TYPE,CZ_NAME,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,REGION
0,1994-03-27 11:32:00,CST,1994-03-27 11:32:00,5000000.0,ALABAMA,Tornado,DEKALB,34.43,-85.98,34.47,-85.78,SOUTHEAST
1,1994-05-15 19:30:00,CST,1994-05-15 19:30:00,0.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,,SOUTHEAST
2,1994-06-26 22:20:00,CST,1994-06-26 22:20:00,500000.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,,SOUTHEAST
3,1994-05-15 13:47:00,CST,1994-05-15 13:47:00,0.0,ALABAMA,Hail,TALLAPOOSA,,,,,SOUTHEAST
4,1994-03-27 15:50:00,CST,1994-03-27 15:50:00,0.0,ALABAMA,Hail,TUSCALOOSA,,,,,SOUTHEAST


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1327292 entries, 0 to 1389617
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   BEGIN_DATE_TIME  1327292 non-null  object 
 1   CZ_TIMEZONE      1327292 non-null  object 
 2   END_DATE_TIME    1327292 non-null  object 
 3   DAMAGE_PROPERTY  1327292 non-null  float64
 4   STATE            1327292 non-null  object 
 5   EVENT_TYPE       1327292 non-null  object 
 6   CZ_NAME          1327292 non-null  object 
 7   BEGIN_LAT        911935 non-null   float64
 8   BEGIN_LON        911928 non-null   float64
 9   END_LAT          910848 non-null   float64
 10  END_LON          910841 non-null   float64
 11  REGION           1327292 non-null  object 
dtypes: float64(5), object(7)
memory usage: 131.6+ MB


## Event Type Double Checking

In [15]:
df["EVENT_TYPE"].value_counts()

EVENT_TYPE
Thunderstorm Wind                 425957
Hail                              333857
Flash Flood                        79398
High Wind                          56836
Winter Weather                     50388
Flood                              50013
Winter Storm                       48055
Tornado                            41026
Drought                            40788
Heavy Snow                         36710
Strong Wind                        20529
Heavy Rain                         20381
Heat                               14805
Lightning                          14143
Excessive Heat                     13796
Frost/Freeze                       10824
Extreme Cold/Wind Chill             9659
Dense Fog                           9236
Blizzard                            7908
Cold/Wind Chill                     6668
Ice Storm                           6318
Funnel Cloud                        6194
Wildfire                            5378
Tropical Storm                      4016
Coast

In [16]:
grouped = df.groupby(["EVENT_TYPE"])
grouped.sum(["DAMAGE_PROPERTY"]).drop(columns=["BEGIN_LAT", "BEGIN_LON", "END_LAT", "END_LON"]).sort_values("DAMAGE_PROPERTY", ascending=False)

Unnamed: 0_level_0,DAMAGE_PROPERTY
EVENT_TYPE,Unnamed: 1_level_1
Hurricane (Typhoon),83600190000.0
Flash Flood,77106050000.0
Storm Surge/Tide,60071630000.0
Hurricane,52183500000.0
Flood,40681320000.0
Tornado,40445130000.0
Wildfire,33094470000.0
Hail,32067950000.0
Coastal Flood,21738870000.0
Tropical Storm,17473010000.0


In [17]:
df["EVENT_TYPE"].unique()

array(['Tornado', 'Thunderstorm Wind', 'Hail',
       'THUNDERSTORM WINDS/FLOODING', 'HAIL/ICY ROADS', 'HAIL FLOODING',
       'THUNDERSTORM WINDS/FLASH FLOOD', 'THUNDERSTORM WINDS LIGHTNING',
       'THUNDERSTORM WIND/ TREES', 'THUNDERSTORM WIND/ TREE',
       'THUNDERSTORM WINDS FUNNEL CLOU', 'TORNADO/WATERSPOUT',
       'THUNDERSTORM WINDS/HEAVY RAIN', 'THUNDERSTORM WINDS HEAVY RAIN',
       'THUNDERSTORM WINDS/ FLOOD', 'Winter Storm', 'Cold/Wind Chill',
       'Heavy Snow', 'Flood', 'High Wind', 'Flash Flood', 'Blizzard',
       'Ice Storm', 'Lightning', 'Frost/Freeze', 'Heavy Rain',
       'Strong Wind', 'Coastal Flood', 'Wildfire', 'Funnel Cloud',
       'Winter Weather', 'Waterspout', 'Drought', 'Debris Flow', 'Heat',
       'High Surf', 'Tropical Storm', 'Dust Devil', 'Dense Fog',
       'Hurricane (Typhoon)', 'Marine High Wind', 'Dust Storm',
       'Storm Surge/Tide', 'Lake-Effect Snow', 'Rip Current', 'Avalanche',
       'Seiche', 'Extreme Cold/Wind Chill', 'Excessive Heat',

In [18]:
category_trans_dict = {
    'THUNDERSTORM WINDS/FLOODING': 'Flood',
    'HAIL/ICY ROADS': 'Hail',
    'HAIL FLOODING': 'Flood',
    'THUNDERSTORM WINDS/FLASH FLOOD': 'Flash Flood',
    'THUNDERSTORM WINDS LIGHTNING': 'Lightning',
    'THUNDERSTORM WIND/ TREES': 'Thunderstorm Wind',
    'THUNDERSTORM WIND/ TREE': 'Thunderstorm Wind',
    'THUNDERSTORM WINDS FUNNEL CLOU': 'Funnel Cloud',
    'TORNADO/WATERSPOUT': 'Waterspout',
    'THUNDERSTORM WINDS/HEAVY RAIN': 'Heavy Rain',
    'THUNDERSTORM WINDS HEAVY RAIN': 'Heavy Rain',
    'THUNDERSTORM WINDS/ FLOOD': 'Flood',
}


rest_of_cats = ['Tornado', 'Thunderstorm Wind', 'Hail', 'Winter Storm', 'Cold/Wind Chill',
       'Heavy Snow', 'Flood', 'High Wind', 'Flash Flood', 'Blizzard',
       'Ice Storm', 'Lightning', 'Frost/Freeze', 'Heavy Rain',
       'Strong Wind', 'Coastal Flood', 'Wildfire', 'Funnel Cloud',
       'Winter Weather', 'Waterspout', 'Drought', 'Debris Flow', 'Heat',
       'High Surf', 'Tropical Storm', 'Dust Devil', 'Dense Fog',
       'Hurricane (Typhoon)', 'Marine High Wind', 'Dust Storm',
       'Storm Surge/Tide', 'Lake-Effect Snow', 'Rip Current', 'Avalanche',
       'Seiche', 'Extreme Cold/Wind Chill', 'Excessive Heat', 'Tsunami',
       'Sleet', 'Freezing Fog', 'Lakeshore Flood',
       'Astronomical Low Tide', 'Tropical Depression', 'Dense Smoke',
       'Sneakerwave', 'Hurricane']

all_event_transforms = {item: item for item in rest_of_cats}
all_event_transforms.update(category_trans_dict)

pp(all_event_transforms.keys())

dict_keys(['Tornado', 'Thunderstorm Wind', 'Hail', 'Winter Storm', 'Cold/Wind Chill', 'Heavy Snow', 'Flood', 'High Wind', 'Flash Flood', 'Blizzard', 'Ice Storm', 'Lightning', 'Frost/Freeze', 'Heavy Rain', 'Strong Wind', 'Coastal Flood', 'Wildfire', 'Funnel Cloud', 'Winter Weather', 'Waterspout', 'Drought', 'Debris Flow', 'Heat', 'High Surf', 'Tropical Storm', 'Dust Devil', 'Dense Fog', 'Hurricane (Typhoon)', 'Marine High Wind', 'Dust Storm', 'Storm Surge/Tide', 'Lake-Effect Snow', 'Rip Current', 'Avalanche', 'Seiche', 'Extreme Cold/Wind Chill', 'Excessive Heat', 'Tsunami', 'Sleet', 'Freezing Fog', 'Lakeshore Flood', 'Astronomical Low Tide', 'Tropical Depression', 'Dense Smoke', 'Sneakerwave', 'Hurricane', 'THUNDERSTORM WINDS/FLOODING', 'HAIL/ICY ROADS', 'HAIL FLOODING', 'THUNDERSTORM WINDS/FLASH FLOOD', 'THUNDERSTORM WINDS LIGHTNING', 'THUNDERSTORM WIND/ TREES', 'THUNDERSTORM WIND/ TREE', 'THUNDERSTORM WINDS FUNNEL CLOU', 'TORNADO/WATERSPOUT', 'THUNDERSTORM WINDS/HEAVY RAIN', 'THUNDERS

In [19]:
df["EVENT_TYPE"] = df["EVENT_TYPE"].map(all_event_transforms)

df.head()

Unnamed: 0,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,DAMAGE_PROPERTY,STATE,EVENT_TYPE,CZ_NAME,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,REGION
0,1994-03-27 11:32:00,CST,1994-03-27 11:32:00,5000000.0,ALABAMA,Tornado,DEKALB,34.43,-85.98,34.47,-85.78,SOUTHEAST
1,1994-05-15 19:30:00,CST,1994-05-15 19:30:00,0.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,,SOUTHEAST
2,1994-06-26 22:20:00,CST,1994-06-26 22:20:00,500000.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,,SOUTHEAST
3,1994-05-15 13:47:00,CST,1994-05-15 13:47:00,0.0,ALABAMA,Hail,TALLAPOOSA,,,,,SOUTHEAST
4,1994-03-27 15:50:00,CST,1994-03-27 15:50:00,0.0,ALABAMA,Hail,TUSCALOOSA,,,,,SOUTHEAST


In [20]:
# verify remaining categories
df["EVENT_TYPE"].unique()

array(['Tornado', 'Thunderstorm Wind', 'Hail', 'Flood', 'Flash Flood',
       'Lightning', 'Funnel Cloud', 'Waterspout', 'Heavy Rain',
       'Winter Storm', 'Cold/Wind Chill', 'Heavy Snow', 'High Wind',
       'Blizzard', 'Ice Storm', 'Frost/Freeze', 'Strong Wind',
       'Coastal Flood', 'Wildfire', 'Winter Weather', 'Drought',
       'Debris Flow', 'Heat', 'High Surf', 'Tropical Storm', 'Dust Devil',
       'Dense Fog', 'Hurricane (Typhoon)', 'Marine High Wind',
       'Dust Storm', 'Storm Surge/Tide', 'Lake-Effect Snow',
       'Rip Current', 'Avalanche', 'Seiche', 'Extreme Cold/Wind Chill',
       'Excessive Heat', 'Tsunami', 'Sleet', 'Freezing Fog',
       'Lakeshore Flood', 'Astronomical Low Tide', 'Tropical Depression',
       'Dense Smoke', 'Sneakerwave', 'Hurricane'], dtype=object)

### Some Thoughts on Event Type Consolidation
***

We may want to consider more merges including:

Hurricane (Typhoon) -> 'Hurricane'

All Floods (except maybe flash floods) -> Flood

(storm floods already filtered are a bit ambiguous in this case)

**Ideally, we either want to drop unneeded categories or run them through an unsupervised clustering algorithm to bring the total category count down to <10 for easier neural network processing**

# Preliminary Time Preprocessing and Analysis
***

In [21]:
#since begin and end datetime are objects, we need to convert them
time_cols = ["BEGIN_DATE_TIME", "END_DATE_TIME"]
for col in time_cols:
    df[col] = pd.to_datetime(df[col])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1327292 entries, 0 to 1389617
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   BEGIN_DATE_TIME  1327292 non-null  datetime64[ns]
 1   CZ_TIMEZONE      1327292 non-null  object        
 2   END_DATE_TIME    1327292 non-null  datetime64[ns]
 3   DAMAGE_PROPERTY  1327292 non-null  float64       
 4   STATE            1327292 non-null  object        
 5   EVENT_TYPE       1327292 non-null  object        
 6   CZ_NAME          1327292 non-null  object        
 7   BEGIN_LAT        911935 non-null   float64       
 8   BEGIN_LON        911928 non-null   float64       
 9   END_LAT          910848 non-null   float64       
 10  END_LON          910841 non-null   float64       
 11  REGION           1327292 non-null  object        
dtypes: datetime64[ns](2), float64(5), object(5)
memory usage: 131.6+ MB


In [22]:
df['DURATION_SEC'] = (df["END_DATE_TIME"] - df["BEGIN_DATE_TIME"]).dt.total_seconds().astype(int)

df.head()

Unnamed: 0,BEGIN_DATE_TIME,CZ_TIMEZONE,END_DATE_TIME,DAMAGE_PROPERTY,STATE,EVENT_TYPE,CZ_NAME,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,REGION,DURATION_SEC
0,1994-03-27 11:32:00,CST,1994-03-27 11:32:00,5000000.0,ALABAMA,Tornado,DEKALB,34.43,-85.98,34.47,-85.78,SOUTHEAST,0
1,1994-05-15 19:30:00,CST,1994-05-15 19:30:00,0.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,,SOUTHEAST,0
2,1994-06-26 22:20:00,CST,1994-06-26 22:20:00,500000.0,ALABAMA,Thunderstorm Wind,DEKALB,,,,,SOUTHEAST,0
3,1994-05-15 13:47:00,CST,1994-05-15 13:47:00,0.0,ALABAMA,Hail,TALLAPOOSA,,,,,SOUTHEAST,0
4,1994-03-27 15:50:00,CST,1994-03-27 15:50:00,0.0,ALABAMA,Hail,TUSCALOOSA,,,,,SOUTHEAST,0


In [23]:
df["DURATION_SEC"].value_counts()

DURATION_SEC
0          646538
300         44154
60          36142
120         29413
600         19355
            ...  
942900          1
254340          1
106560          1
287640          1
2568600         1
Name: count, Length: 7117, dtype: int64

In [24]:
# calculating the Pearson's Correlation Coefficient, r
print("Duration to Property Damage: Pearson's Coeffieient (r): ", df["DURATION_SEC"].corr(df["DAMAGE_PROPERTY"]))

Duration to Property Damage: Pearson's Coeffieient (r):  0.0028503366068963272


# Dropping unneeded columns
***

In [25]:
df = df.drop(columns=["CZ_TIMEZONE", "STATE", "CZ_NAME", "END_DATE_TIME"])

#  "BEGIN_LAT", "BEGIN_LON", "END_LAT", "END_LON" kept for now to potentially run clustering while tweaking features for the predictive model

df.head()

Unnamed: 0,BEGIN_DATE_TIME,DAMAGE_PROPERTY,EVENT_TYPE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,REGION,DURATION_SEC
0,1994-03-27 11:32:00,5000000.0,Tornado,34.43,-85.98,34.47,-85.78,SOUTHEAST,0
1,1994-05-15 19:30:00,0.0,Thunderstorm Wind,,,,,SOUTHEAST,0
2,1994-06-26 22:20:00,500000.0,Thunderstorm Wind,,,,,SOUTHEAST,0
3,1994-05-15 13:47:00,0.0,Hail,,,,,SOUTHEAST,0
4,1994-03-27 15:50:00,0.0,Hail,,,,,SOUTHEAST,0


In [26]:
df_no_0_no_outlier = df.loc[(df["DURATION_SEC"] != 0) , :]

df_no_0_no_outlier["DURATION_SEC"].value_counts()

DURATION_SEC
300        44154
60         36142
120        29413
600        19355
7200       18666
           ...  
942900         1
254340         1
106560         1
287640         1
2568600        1
Name: count, Length: 7116, dtype: int64

In [27]:
# calculating the Pearson's Correlation Coefficient, r on the truncated data
print("Pearson's Coeffieient (r): ", df_no_0_no_outlier["DURATION_SEC"].corr(df_no_0_no_outlier["DAMAGE_PROPERTY"]))

Pearson's Coeffieient (r):  0.0009954449594414927


In [28]:
min_time = df_no_0_no_outlier["BEGIN_DATE_TIME"].min()
min_time

Timestamp('1994-01-27 17:30:00')

In [29]:
# add a "TIME SINCE BEGIN" column to run a pearson correlation between storm duration and time
df_no_0_no_outlier["TIME_SINCE_BEGIN"] = (df_no_0_no_outlier["BEGIN_DATE_TIME"] - min_time).dt.total_seconds().astype(int)
df_no_0_no_outlier.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_0_no_outlier["TIME_SINCE_BEGIN"] = (df_no_0_no_outlier["BEGIN_DATE_TIME"] - min_time).dt.total_seconds().astype(int)


Unnamed: 0,BEGIN_DATE_TIME,DAMAGE_PROPERTY,EVENT_TYPE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,REGION,DURATION_SEC,TIME_SINCE_BEGIN
12,1994-06-29 06:06:00,500000.0,Thunderstorm Wind,,,,,SOUTHEAST,2940,13178160
21,1994-06-25 14:00:00,50000.0,Thunderstorm Wind,,,,,SOUTHEAST,3600,12861000
53,1994-03-27 11:02:00,5000000.0,Tornado,,,,,SOUTHEAST,360,5074320
58,1994-05-15 18:15:00,50000.0,Thunderstorm Wind,,,,,SOUTHEAST,1380,9333900
105,1994-05-23 19:25:00,0.0,Thunderstorm Wind,,,,,UPPER MIDWEST,600,10029300


In [30]:
print("Truncated data:")
print("Time to Storm Duration  - Pearson's Coeffieient (r): ", df_no_0_no_outlier["TIME_SINCE_BEGIN"].corr(df_no_0_no_outlier["DURATION_SEC"]))
print("Time to Property Damage - Pearson's Coeffieient (r): ", df_no_0_no_outlier["TIME_SINCE_BEGIN"].corr(df_no_0_no_outlier["DAMAGE_PROPERTY"]))

Truncated data:
Time to Storm Duration  - Pearson's Coeffieient (r):  0.0675452059564068
Time to Property Damage - Pearson's Coeffieient (r):  -0.004319071250220689


This is weakly correlated on a pure 'over time" basis, meaning most insights to glean are likely from a time-series based analysis

# One-Hot Encoding, final preprocessing DF
***

In [31]:
# double check the columns to see what we need encoded
df.head()

Unnamed: 0,BEGIN_DATE_TIME,DAMAGE_PROPERTY,EVENT_TYPE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,REGION,DURATION_SEC
0,1994-03-27 11:32:00,5000000.0,Tornado,34.43,-85.98,34.47,-85.78,SOUTHEAST,0
1,1994-05-15 19:30:00,0.0,Thunderstorm Wind,,,,,SOUTHEAST,0
2,1994-06-26 22:20:00,500000.0,Thunderstorm Wind,,,,,SOUTHEAST,0
3,1994-05-15 13:47:00,0.0,Hail,,,,,SOUTHEAST,0
4,1994-03-27 15:50:00,0.0,Hail,,,,,SOUTHEAST,0


Columns to encode: "REGION", "EVENT_TYPE"

Could also be worthwhile to target data with only lat-long info and run clustering there, but that is its own endeavor

In [32]:
encoded_cols_df = pd.get_dummies(df[["REGION", "EVENT_TYPE"]]).astype(int)

encoded_cols_df.head()

Unnamed: 0,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,REGION_OHIO VALLEY,REGION_SOUTH,REGION_SOUTHEAST,REGION_SOUTHWEST,REGION_UPPER MIDWEST,REGION_WEST,EVENT_TYPE_Astronomical Low Tide,...,EVENT_TYPE_Strong Wind,EVENT_TYPE_Thunderstorm Wind,EVENT_TYPE_Tornado,EVENT_TYPE_Tropical Depression,EVENT_TYPE_Tropical Storm,EVENT_TYPE_Tsunami,EVENT_TYPE_Waterspout,EVENT_TYPE_Wildfire,EVENT_TYPE_Winter Storm,EVENT_TYPE_Winter Weather
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
preproc_df = pd.merge(df, encoded_cols_df, on=df.index).drop(columns=["REGION", "EVENT_TYPE", "key_0"])

preproc_df.head()

Unnamed: 0,BEGIN_DATE_TIME,DAMAGE_PROPERTY,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,DURATION_SEC,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,...,EVENT_TYPE_Strong Wind,EVENT_TYPE_Thunderstorm Wind,EVENT_TYPE_Tornado,EVENT_TYPE_Tropical Depression,EVENT_TYPE_Tropical Storm,EVENT_TYPE_Tsunami,EVENT_TYPE_Waterspout,EVENT_TYPE_Wildfire,EVENT_TYPE_Winter Storm,EVENT_TYPE_Winter Weather
0,1994-03-27 11:32:00,5000000.0,34.43,-85.98,34.47,-85.78,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1994-05-15 19:30:00,0.0,,,,,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1994-06-26 22:20:00,500000.0,,,,,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1994-05-15 13:47:00,0.0,,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1994-03-27 15:50:00,0.0,,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


For now, if you want to work with this I'd recommend limiting to top 10 event types by DAMAGE_PROPERTY and filtering out anything else:
Hurricane (Typhoon)
Flash Flood
Storm Surge/Tide
Hurricane
Flood
Tornado
Wildfire
Hail
Coastal Flood
Tropical Storm

# Output to CSV
***

In [34]:
# I'm writing the values without 0 trimmed to csv, feel free to change the input df if you want the other data instead.

out_path = path.join("Resources", "data_preprocessed.csv")

preproc_df.to_csv(out_path)

OSError: Cannot save file into a non-existent directory: 'Resources'

## Truncated Version
reusing same variable to preserve RAM

In [None]:
top_10_by_prop_dam = [
    "Hurricane (Typhoon)", "Flash Flood", "Storm Surge/Tide", "Hurricane", "Flood", "Tornado", "Wildfire", "Hail", "Coastal Flood", "Tropical Storm"
]

preproc_df = df[df["EVENT_TYPE"].isin(top_10_by_prop_dam)]

preproc_df["EVENT_TYPE"].value_counts()

EVENT_TYPE
Hail                   333858
Flash Flood             79399
Flood                   50017
Tornado                 41026
Wildfire                 5378
Tropical Storm           4016
Coastal Flood            2428
Hurricane (Typhoon)       968
Storm Surge/Tide          917
Hurricane                 207
Name: count, dtype: int64

In [None]:
encoded_cols_df = pd.get_dummies(preproc_df[["REGION", "EVENT_TYPE"]]).astype(int)
encoded_cols_df.head()

Unnamed: 0,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,REGION_OHIO VALLEY,REGION_SOUTH,REGION_SOUTHEAST,REGION_SOUTHWEST,REGION_UPPER MIDWEST,REGION_WEST,EVENT_TYPE_Coastal Flood,EVENT_TYPE_Flash Flood,EVENT_TYPE_Flood,EVENT_TYPE_Hail,EVENT_TYPE_Hurricane,EVENT_TYPE_Hurricane (Typhoon),EVENT_TYPE_Storm Surge/Tide,EVENT_TYPE_Tornado,EVENT_TYPE_Tropical Storm,EVENT_TYPE_Wildfire
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [None]:
preproc_df = pd.merge(preproc_df, encoded_cols_df, on=preproc_df.index).drop(columns=["REGION", "EVENT_TYPE", "key_0"])
preproc_df.head()

Unnamed: 0,BEGIN_DATE_TIME,DAMAGE_PROPERTY,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,DURATION_SEC,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,...,EVENT_TYPE_Coastal Flood,EVENT_TYPE_Flash Flood,EVENT_TYPE_Flood,EVENT_TYPE_Hail,EVENT_TYPE_Hurricane,EVENT_TYPE_Hurricane (Typhoon),EVENT_TYPE_Storm Surge/Tide,EVENT_TYPE_Tornado,EVENT_TYPE_Tropical Storm,EVENT_TYPE_Wildfire
0,1994-03-27 11:32:00,5000000.0,34.43,-85.98,34.47,-85.78,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1994-05-15 13:47:00,0.0,,,,,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1994-03-27 15:50:00,0.0,,,,,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1994-03-27 15:50:00,0.0,,,,,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1994-03-27 15:55:00,0.0,,,,,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
out_path_trunc = path.join("Resources", "data_preprocessed_trunc.csv")

preproc_df.to_csv(out_path_trunc)