In [1]:
# Import dependencies #

import altair as alt
import pandas as pd
import os
import numpy as np

### import data ###
# empty list to store data in 
storm_data = []

# path to folder
recent_storm_data = '..\Tornado Analysis Project\Past_5yr_Storm_data'

# recent files
for filename in os.listdir(recent_storm_data): # loop through files in folder
    filepath = os.path.join(recent_storm_data, filename)
    data = pd.read_csv(filepath, low_memory=False) # read in as CSV
    storm_data.append(data) # add data to the list


# combine the data frames
all_storm_data = pd.concat(storm_data, ignore_index=True)

# sample the data
all_storm_data.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,201905,9,1554,201905,9,1830,137295,824116,TEXAS,48,...,7.0,NNE,SAN GERONIMO,29.7898,-98.6406,29.7158,-98.7744,Thunderstorms developed along a cold front as ...,Thunderstorms produced heavy rain that led to ...,CSV
1,201908,1,0,201908,7,1400,141502,849617,SOUTH DAKOTA,46,...,3.0,W,BRUCE,44.54,-96.96,44.43,-96.94,Minor flooding slowly dwindled during early Au...,"A continuation of flooding from July, the Big ...",CSV
2,201909,25,1823,201909,25,1825,141998,852808,ARIZONA,4,...,24.0,S,OCOTILLO,32.87,-111.88,32.8788,-111.875,Scattered thunderstorms developed over the cen...,Scattered thunderstorms developed across the c...,CSV
3,201902,19,2226,201902,19,2350,134941,808922,ARKANSAS,5,...,,,,,,,,"Rain was heavy at times on the 19th, and there...",One-quarter inch of freezing rain was measured...,CSV
4,201902,19,2255,201902,19,2355,134941,808923,ARKANSAS,5,...,,,,,,,,"Rain was heavy at times on the 19th, and there...",One-quarter inch of freezing rain was measured...,CSV


In [2]:
all_storm_data.columns

Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE',
       'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
       'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'DATA_SOURCE'],
      dtype='object')

In [3]:
# filter for the rows with tornado data
tornado_df = all_storm_data[all_storm_data['EVENT_TYPE'] == 'Tornado']
tornado_df.count()

BEGIN_YEARMONTH       7426
BEGIN_DAY             7426
BEGIN_TIME            7426
END_YEARMONTH         7426
END_DAY               7426
END_TIME              7426
EPISODE_ID            7426
EVENT_ID              7426
STATE                 7426
STATE_FIPS            7426
YEAR                  7426
MONTH_NAME            7426
EVENT_TYPE            7426
CZ_TYPE               7426
CZ_FIPS               7426
CZ_NAME               7426
WFO                   7426
BEGIN_DATE_TIME       7426
CZ_TIMEZONE           7426
END_DATE_TIME         7426
INJURIES_DIRECT       7426
INJURIES_INDIRECT     7426
DEATHS_DIRECT         7426
DEATHS_INDIRECT       7426
DAMAGE_PROPERTY       6003
DAMAGE_CROPS          6018
SOURCE                7426
MAGNITUDE                0
MAGNITUDE_TYPE           0
FLOOD_CAUSE              0
CATEGORY                 0
TOR_F_SCALE           7426
TOR_LENGTH            7426
TOR_WIDTH             7426
TOR_OTHER_WFO         1023
TOR_OTHER_CZ_STATE    1023
TOR_OTHER_CZ_FIPS     1023
T

In [4]:
# create new data frame without unneccesary columns
columns_to_keep = ['YEAR', 'MONTH_NAME', 'BEGIN_DAY', 'STATE', 'BEGIN_DATE_TIME',
                    'END_DATE_TIME', 'INJURIES_DIRECT', 'DEATHS_DIRECT', 'DAMAGE_PROPERTY',  'DAMAGE_CROPS', 'TOR_F_SCALE',
                    'TOR_LENGTH', 'TOR_WIDTH', 'BEGIN_RANGE', 'BEGIN_LAT', 'BEGIN_LON']

tornado_df = tornado_df[columns_to_keep]

tornado_df.columns

Index(['YEAR', 'MONTH_NAME', 'BEGIN_DAY', 'STATE', 'BEGIN_DATE_TIME',
       'END_DATE_TIME', 'INJURIES_DIRECT', 'DEATHS_DIRECT', 'DAMAGE_PROPERTY',
       'DAMAGE_CROPS', 'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH', 'BEGIN_RANGE',
       'BEGIN_LAT', 'BEGIN_LON'],
      dtype='object')

In [5]:
# view data types to see if any need to be changed
tornado_df.dtypes

YEAR                 int64
MONTH_NAME          object
BEGIN_DAY            int64
STATE               object
BEGIN_DATE_TIME     object
END_DATE_TIME       object
INJURIES_DIRECT      int64
DEATHS_DIRECT        int64
DAMAGE_PROPERTY     object
DAMAGE_CROPS        object
TOR_F_SCALE         object
TOR_LENGTH         float64
TOR_WIDTH          float64
BEGIN_RANGE        float64
BEGIN_LAT          float64
BEGIN_LON          float64
dtype: object

In [6]:
len(tornado_df)

7426

In [7]:
### Preprocessing ###

In [8]:
tornado_df['BEGIN_DATE_TIME'] = tornado_df['BEGIN_DATE_TIME'].astype('datetime64[ns]')
tornado_df['END_DATE_TIME'] = tornado_df['END_DATE_TIME'].astype('datetime64[ns]')

In [9]:
tornado_df.dtypes

YEAR                        int64
MONTH_NAME                 object
BEGIN_DAY                   int64
STATE                      object
BEGIN_DATE_TIME    datetime64[ns]
END_DATE_TIME      datetime64[ns]
INJURIES_DIRECT             int64
DEATHS_DIRECT               int64
DAMAGE_PROPERTY            object
DAMAGE_CROPS               object
TOR_F_SCALE                object
TOR_LENGTH                float64
TOR_WIDTH                 float64
BEGIN_RANGE               float64
BEGIN_LAT                 float64
BEGIN_LON                 float64
dtype: object

In [10]:
# check NA count
tornado_df.isna().sum()

YEAR                  0
MONTH_NAME            0
BEGIN_DAY             0
STATE                 0
BEGIN_DATE_TIME       0
END_DATE_TIME         0
INJURIES_DIRECT       0
DEATHS_DIRECT         0
DAMAGE_PROPERTY    1423
DAMAGE_CROPS       1408
TOR_F_SCALE           0
TOR_LENGTH            0
TOR_WIDTH             0
BEGIN_RANGE           9
BEGIN_LAT             9
BEGIN_LON             9
dtype: int64

In [11]:
# only 9 entries null for lat and lon, so they can be dropped
tornado_df = tornado_df.dropna(subset = ['BEGIN_LAT', 'BEGIN_LON'])

# only null values left are in the damage columns, they can be filled with 0 because possible that they simply did no damage
tornado_df = tornado_df.fillna(0)

EDA

In [12]:
### calculate and identify total occurrences for each state or broader regions ###
### proportions for sampling purposes ###

### new column for broader region ###
REGIONS = {
    "NORTHEAST": ["CONNECTICUT", "DELAWARE", "MAINE", "MARYLAND", "MASSACHUSETTS", "NEW HAMPSHIRE", "NEW JERSEY", "NEW YORK", "PENNSYLVANIA", "RHODE ISLAND", "VERMONT"],
    "UPPER MIDWEST": ["IOWA", "MICHIGAN", "MINNESOTA", "WISCONSIN"],
    "OHIO VALLEY": ["ILLINOIS", "INDIANA", "KENTUCKY", "MISSOURI", "OHIO", "TENNESSEE", "WEST VIRGINIA"],
    "SOUTHEAST": ["ALABAMA", "FLORIDA", "GEORGIA", "NORTH CAROLINA", "SOUTH CAROLINA", "VIRGINIA"],
    "NORTHERN ROCKIES AND PLAINS": ["MONTANA", "NEBRASKA", "NORTH DAKOTA", "SOUTH DAKOTA", "WYOMING"],
    "SOUTH": ["ARKANSAS", "KANSAS", "LOUISIANA", "MISSISSIPPI", "OKLAHOMA", "TEXAS"],
    "SOUTHWEST": ["ARIZONA", "COLORADO", "NEW MEXICO", "UTAH"],
    "NORTHWEST": ["IDAHO", "OREGON", "WASHINGTON"],
    "WEST": ["CALIFORNIA", "NEVADA"]
}

state_to_region = {state: region for region, states in REGIONS.items() for state in states}

tornado_df['Region'] = tornado_df['STATE'].map(state_to_region)

tornado_df.head()

Unnamed: 0,YEAR,MONTH_NAME,BEGIN_DAY,STATE,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,DEATHS_DIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH,BEGIN_RANGE,BEGIN_LAT,BEGIN_LON,Region
2,2019,September,25,ARIZONA,2019-09-25 18:23:00,2019-09-25 18:25:00,0,0,0.00K,0.00K,EF0,0.67,50.0,24.0,32.87,-111.88,SOUTHWEST
5,2019,May,17,OKLAHOMA,2019-05-17 18:32:00,2019-05-17 18:39:00,0,0,0.00K,0.00K,EFU,2.88,50.0,5.0,36.9622,-100.5635,SOUTH
25,2019,August,2,IDAHO,2019-08-02 12:55:00,2019-08-02 13:05:00,0,0,0.00K,0.00K,EF0,1.0,10.0,12.0,42.6165,-113.5521,NORTHWEST
206,2019,April,24,LOUISIANA,2019-04-24 23:11:00,2019-04-24 23:28:00,1,0,30.00K,0.00K,EF1,10.0,900.0,3.0,31.8455,-93.6039,SOUTH
243,2019,April,25,LOUISIANA,2019-04-25 00:25:00,2019-04-25 00:26:00,0,0,25.00K,0.00K,EF1,1.03,200.0,1.0,32.355,-92.9253,SOUTH


In [13]:
tornado_df = tornado_df.groupby(['YEAR', 'MONTH_NAME', 'BEGIN_DAY', 'STATE'], as_index = False).first()

tornado_df.head()

Unnamed: 0,YEAR,MONTH_NAME,BEGIN_DAY,STATE,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,DEATHS_DIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,TOR_F_SCALE,TOR_LENGTH,TOR_WIDTH,BEGIN_RANGE,BEGIN_LAT,BEGIN_LON,Region
0,2019,April,6,TEXAS,2019-04-06 13:10:00,2019-04-06 13:12:00,0,0,10.00K,0.00K,EF0,0.31,150.0,1.0,30.6936,-97.1968,SOUTH
1,2019,April,7,LOUISIANA,2019-04-07 19:19:00,2019-04-07 19:22:00,0,0,0,0.00K,EF0,2.58,50.0,1.0,29.9528,-90.2369,SOUTH
2,2019,April,7,MISSISSIPPI,2019-04-07 17:02:00,2019-04-07 17:04:00,0,0,0,0.00K,EF1,1.72,125.0,6.0,31.0184,-91.0712,SOUTH
3,2019,April,7,TEXAS,2019-04-07 06:24:00,2019-04-07 06:25:00,0,0,1.25M,0.00K,EF0,0.58,30.0,0.0,28.3691,-97.6643,SOUTH
4,2019,April,8,ALABAMA,2019-04-08 06:10:00,2019-04-08 06:29:00,1,0,0,0,EF1,6.1,219.0,1.0,34.2274,-86.3294,SOUTHEAST


Climate Regions as dictated by NCEI (https://www.ncei.noaa.gov/access/monitoring/reference-maps/us-climate-regions)

In [14]:
import plotly.graph_objects as go

region_counts = tornado_df['Region'].value_counts()

fig = go.Figure(data=[go.Bar(
    x=region_counts.index,
    y=region_counts.values,  
    marker_color='skyblue'   
)])

# Update layout with title and axis labels
fig.update_layout(
    title={
        'text': "Number of Events Per Region",
        'x': 0.5,         
        'xanchor': 'center'
    },
    xaxis_title='Region',
    yaxis_title='Number of Events',
    xaxis_tickangle=-45 
)

# Show the plot
fig.show()

In [15]:
### Group tornado events by month  and calculate count totals per season ###
### proportion as well for sampling ###
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
tornado_df['MONTH_NAME'] = pd.Categorical(tornado_df['MONTH_NAME'], categories=month_order, ordered=True)

monthly_occurrences = tornado_df['MONTH_NAME'].value_counts()

sorted_value_counts = monthly_occurrences.sort_index()

fig = go.Figure(data = [go.Line(
    x = sorted_value_counts.index,
    y = sorted_value_counts.values,
    mode = 'lines+markers',
    line = dict(color = 'thistle'), name = 'RGBA Color'
)]    
)

fig.update_layout(
    title={
        'text': "Monthly Occurrences",
        'x': 0.5,         
        'xanchor': 'center'
    },
    xaxis_title = 'Month',
    yaxis_title = 'Count'

)

fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [16]:
import plotly.express as px

region_counts = tornado_df['Region'].value_counts()
regional_pie = px.pie(
    region_counts,
    values = region_counts.values,
    names = region_counts.index,
    color_discrete_sequence=px.colors.sequential.Greens[::-1]
)

regional_pie.update_traces(
    hovertemplate='%{label}: %{value}<extra></extra>',
    textposition = 'inside'
)

regional_pie.update_layout(
    uniformtext_minsize=12,
    uniformtext_mode='hide',
    width=750, 
    height=500, 
    margin=dict(
        l=75,  
        r=100,  
        t=100,  
        b=100    
    ),
    title={
        'text': "Events Per Region",
        'x': 0.5,           
        'xanchor': 'center'
        }
)

regional_pie.show()

monthly_pie = px.pie(
    monthly_occurrences,
    values = monthly_occurrences.values,
    names = monthly_occurrences.index,
    color_discrete_sequence=px.colors.sequential.Blues[::-1]
)

monthly_pie.update_traces(
    hovertemplate='%{label}: %{value}<extra></extra>',
    textposition= 'inside'
)


monthly_pie.update_layout(
    uniformtext_minsize=12,
    uniformtext_mode='hide',
    width=750, 
    height=500, 
    margin=dict(
        l=75,  
        r=255,  
        t=100,  
        b=100    
    ),
    title={
        'text': "Events Per Month",
        'x': 0.5,           
        'xanchor': 'center'
        }
)

monthly_pie.show()

In [17]:
### Create non_tornado_df ###

from itertools import product 
states = [
    'ALABAMA', 'ALASKA', 'ARIZONA', 'ARKANSAS', 'CALIFORNIA', 'COLORADO', 'CONNECTICUT', 'DELAWARE', 'FLORIDA', 'GEORGIA',
    'HAWAII', 'IDAHO', 'ILLINOIS', 'INDIANA', 'IOWA', 'KANSAS', 'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND',
    'MASSACHUSETTS', 'MICHIGAN', 'MINNESOTA', 'MISSISSIPPI', 'MISSOURI', 'MONTANA', 'NEBRASKA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY',
    'NEW MEXICO', 'NEW YORK', 'NORTH CAROLINA', 'NORTH DAKOTA', 'OHIO', 'OKLAHOMA', 'OREGON', 'PENNSYLVANIA', 'RHODE ISLAND', 'SOUTH CAROLINA',
    'SOUTH DAKOTA', 'TENNESSEE', 'TEXAS', 'UTAH', 'VERMONT', 'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WISCONSIN', 'WYOMING'
]

date_range = pd.date_range(start='2019-01-01', end='2023-12-31')

combinations = list(product(states, date_range))

df = pd.DataFrame(combinations, columns=['state', 'date'])

df = df.sort_values(['state', 'date']).reset_index(drop=True)

df.head()
len(df)

91300

In [18]:
# Map the region to the states for the non_tornado_df #
df['Region'] = df['state'].map(state_to_region)

In [19]:
# Drop all day rows that exist in the tornado_df so that we know there is no overlap #

tornado_df['BEGIN_DATE'] = pd.to_datetime(tornado_df['BEGIN_DATE_TIME']).dt.date
tornado_dates = tornado_df['BEGIN_DATE'].unique()
df_no_tornados = df[~df['date'].isin(tornado_dates)]
df_no_tornados = df_no_tornados.reset_index(drop=True)

len(df_no_tornados)

47900

In [20]:
# Add month name column #

df_no_tornados['MONTH_NAME'] = df_no_tornados['date'].dt.strftime('%B')
df_no_tornados.head()

Unnamed: 0,state,date,Region,MONTH_NAME
0,ALABAMA,2019-01-01,SOUTHEAST,January
1,ALABAMA,2019-01-02,SOUTHEAST,January
2,ALABAMA,2019-01-03,SOUTHEAST,January
3,ALABAMA,2019-01-05,SOUTHEAST,January
4,ALABAMA,2019-01-07,SOUTHEAST,January


In [21]:
df_no_tornados['MONTH_NAME'] = pd.Categorical(df_no_tornados['MONTH_NAME'], categories=month_order, ordered=True)

In [22]:
def create_matrix(df):
    matrix = pd.pivot_table(
        df,
        index='Region',        # Rows of the matrix (months)
        columns='MONTH_NAME',      # Columns of the matrix (states)
        aggfunc='size',       # Count entries
        fill_value=0          # Fill missing values with 0
    )

    return matrix

region_matrix = create_matrix(tornado_df)
region_matrix

MONTH_NAME,January,February,March,April,May,June,July,August,September,October,November,December
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NORTHEAST,1,2,2,21,17,15,27,37,17,7,6,0
NORTHERN ROCKIES AND PLAINS,0,0,1,7,38,54,44,32,6,6,0,1
NORTHWEST,0,0,2,0,10,7,2,1,2,6,2,0
OHIO VALLEY,17,13,48,41,73,60,34,39,11,19,2,17
SOUTH,24,14,67,85,150,73,30,33,13,31,22,25
SOUTHEAST,37,24,54,68,49,36,22,38,19,21,14,29
SOUTHWEST,0,1,6,2,34,36,34,18,10,4,2,1
UPPER MIDWEST,1,0,8,10,31,32,47,41,22,9,1,3
WEST,5,4,4,4,5,2,0,3,3,1,2,2


In [23]:
# non_zero_entries = (region_matrix > 0).sum().sum()
# min_samples_per_entry = 5
# tornado_sample_size =  non_zero_entries * min_samples_per_entry
# non_tornado_sample_size = tornado_sample_size * 3
# print(f"Sampling {tornado_sample_size} tornado days and {non_tornado_sample_size} non-tornado days")


In [24]:
total_tornados = region_matrix.sum().sum()
proportions_matrix = region_matrix / total_tornados

proportion_series = proportions_matrix.stack()
proportion_series.index.names = ['Region', 'MONTH_NAME']

# Set batch parameters
batch_size = 1250
tornado_ratio = 0.3
tornado_size = int(batch_size * tornado_ratio)
non_tornado_size = batch_size - tornado_size

# Sample tornado days
tornado_samples = (proportion_series * tornado_size).round().astype(int)
tornado_samples = tornado_samples.where(tornado_samples > 0, 1 * (proportion_series > 0))

sampled_tornado_data = []
for (region, month), num_samples in tornado_samples.items():
    if num_samples > 0:
        subset = tornado_df[(tornado_df['Region'] == region) & (tornado_df['MONTH_NAME'] == month)]
        if len(subset) < num_samples:
            sampled_subset = resample(subset, n_samples=num_samples, replace=True)
        else:
            sampled_subset = subset.sample(n=num_samples, replace=False)
        sampled_tornado_data.append(sampled_subset)

# Sample non-tornado days
sampled_non_tornado_data = []
for region in region_matrix.index:
    for month in month_order:
        num_samples = max(1, int(non_tornado_size * proportion_series.get((region, month), 0)))
        subset = df_no_tornados[(df_no_tornados['Region'] == region) & (df_no_tornados['MONTH_NAME'] == month)]
        if len(subset) < num_samples:
            sampled_subset = resample(subset, n_samples=num_samples, replace=True)
        else:
            sampled_subset = subset.sample(n=num_samples, replace=False)
        sampled_non_tornado_data.append(sampled_subset)

# Combine tornado and non-tornado samples
sampled_tornado_df = pd.concat(sampled_tornado_data, ignore_index=True)
sampled_non_tornado_df = pd.concat(sampled_non_tornado_data, ignore_index=True)
len(sampled_non_tornado_df)


859

In [25]:
# region_matrix = pd.crosstab(tornado_df['Region'], tornado_df['MONTH_NAME'])
# total_tornados = region_matrix.sum().sum()
# proportions_matrix = region_matrix / total_tornados

# proportion_series = proportions_matrix.stack()
# proportion_series.index.names = ['Region', 'MONTH_NAME']

# tornado_sample_size = 200
# non_tornado_sample_size = 800

# tornado_samples = (proportion_series * tornado_sample_size).round().astype(int)
# sampled_tornado_data = []
# sampled_non_tornado_data = []

# for (region, month), num_samples in tornado_samples.items():
#     if num_samples > 0:
#         subset = tornado_df[(tornado_df['Region'] == region) & (tornado_df['MONTH_NAME'] == month)]
#         if len(subset) < num_samples:
#             sampled_subset = resample(subset, n_samples=num_samples, replace=True)
#         else:
#             sampled_subset = subset.sample(n=num_samples, replace=False)
#         sampled_tornado_data.append(sampled_subset)

# sampled_tornado_df = pd.concat(sampled_tornado_data, ignore_index=True)

# for region in tornado_df['Region'].unique():
#     for month in tornado_df['MONTH_NAME'].unique():
#         subset = df_no_tornados[(df_no_tornados['Region'] == region) & (df_no_tornados['MONTH_NAME'] == month)]
#         num_samples = int(non_tornado_sample_size * proportion_series.get((region, month), 0))
#         if num_samples > 0:
#             if len(subset) < num_samples:
#                 sampled_subset = resample(subset, n_samples=num_samples, replace=True)
#             else:
#                 sampled_subset = subset.sample(n=num_samples, replace=False)
#             sampled_non_tornado_data.append(sampled_subset)

# sampled_tornado_df = pd.concat(sampled_tornado_data, ignore_index=True)
# sampled_non_tornado_df = pd.concat(sampled_non_tornado_data, ignore_index=True)

# print("Sampled tornado days:", len(sampled_tornado_df))
# print("Sampled non-tornado days:", len(sampled_non_tornado_df))


In [26]:
### Create matrices and heat maps

def create_matrix(df):
    matrix = pd.pivot_table(
        df,
        index='Region',        # Rows of the matrix (months)
        columns='MONTH_NAME',      # Columns of the matrix (states)
        aggfunc='size',       # Count entries
        fill_value=0          # Fill missing values with 0
    )

    return matrix

region_matrix = create_matrix(tornado_df)
region_matrix

MONTH_NAME,January,February,March,April,May,June,July,August,September,October,November,December
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NORTHEAST,1,2,2,21,17,15,27,37,17,7,6,0
NORTHERN ROCKIES AND PLAINS,0,0,1,7,38,54,44,32,6,6,0,1
NORTHWEST,0,0,2,0,10,7,2,1,2,6,2,0
OHIO VALLEY,17,13,48,41,73,60,34,39,11,19,2,17
SOUTH,24,14,67,85,150,73,30,33,13,31,22,25
SOUTHEAST,37,24,54,68,49,36,22,38,19,21,14,29
SOUTHWEST,0,1,6,2,34,36,34,18,10,4,2,1
UPPER MIDWEST,1,0,8,10,31,32,47,41,22,9,1,3
WEST,5,4,4,4,5,2,0,3,3,1,2,2


In [27]:
tornado_sample_matrix = create_matrix(sampled_tornado_df)
tornado_sample_matrix

MONTH_NAME,January,February,March,April,May,June,July,August,September,October,November,December
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NORTHEAST,1,1,1,4,3,3,5,7,3,1,1,0
NORTHERN ROCKIES AND PLAINS,0,0,1,1,7,10,8,6,1,1,0,1
NORTHWEST,0,0,1,0,2,1,1,1,1,1,1,0
OHIO VALLEY,3,2,9,7,13,11,6,7,2,3,1,3
SOUTH,4,2,12,15,27,13,5,6,2,6,4,4
SOUTHEAST,7,4,10,12,9,6,4,7,3,4,2,5
SOUTHWEST,0,1,1,1,6,6,6,3,2,1,1,1
UPPER MIDWEST,1,0,1,2,6,6,8,7,4,2,1,1
WEST,1,1,1,1,1,1,0,1,1,1,1,1


In [28]:
non_tornado_sample_matrix = create_matrix(sampled_non_tornado_df)
non_tornado_sample_matrix

MONTH_NAME,January,February,March,April,May,June,July,August,September,October,November,December
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NORTHEAST,1,1,1,8,7,6,11,15,7,2,2,1
NORTHERN ROCKIES AND PLAINS,1,1,1,2,15,22,18,13,2,2,1,1
NORTHWEST,1,1,1,1,4,2,1,1,1,2,1,1
OHIO VALLEY,7,5,19,16,30,24,14,16,4,7,1,7
SOUTH,9,5,27,35,62,30,12,13,5,12,9,10
SOUTHEAST,15,9,22,28,20,14,9,15,7,8,5,12
SOUTHWEST,1,1,2,1,14,14,14,7,4,1,1,1
UPPER MIDWEST,1,1,3,4,12,13,19,16,9,3,1,1
WEST,2,1,1,1,2,1,1,1,1,1,1,1


In [29]:
fig1 = px.imshow(
    region_matrix,
    labels={'x': 'State', 'y': 'Month', 'color': 'Number of Entries'},
    title='Tornado Events by Month and State'
)

fig1.update_layout(
    title={
        'text': "Events Per Month",
        'x': 0.45,
        'xanchor': 'center'
        }
) 

fig2 = px.imshow(
    tornado_sample_matrix,
    labels={'x': 'State', 'y': 'Month', 'color': 'Number of Entries'},
    title='Tornado Events by Month and State'
)

fig2.update_layout(
    title={
        'text': "Events Per Month",
        'x': 0.45,           
        'xanchor': 'center'
        }
)

fig3 = px.imshow(
    non_tornado_sample_matrix,
    labels={'x': 'State', 'y': 'Month', 'color': 'Number of Entries'},
    title='Tornado Events by Month and State'
)

fig3.update_layout(
    title={
        'text': "Events Per Month",
        'x': 0.45,           
        'xanchor': 'center'
        }
)

# Show the heatmap
fig1.show()
# Show the heatmap
fig2.show()

fig3.show()

In [30]:
# Save to CSV for fetching data with api #
sampled_tornado_df.to_csv('tornado_sample.csv', index = False, header = True)
sampled_non_tornado_df.to_csv('non_tornado_sample.csv', index = False, header = True)

In [31]:
# import weather data #
tornado_weather_data = pd.read_csv('long_tornado_weather_data.csv')
non_tornado_weather_data = pd.read_csv('long_non_tornado_weather_data.csv')

In [32]:
tornado_weather_data.head()

Unnamed: 0.1,Unnamed: 0,datetime,datetimeEpoch,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,...,sunriseEpoch,sunset,sunsetEpoch,moonphase,conditions,description,icon,stations,source,severerisk
0,0,2019-01-08,1546923600,42.7,33.1,37.8,39.8,26.8,34.0,34.4,...,1546950613,16:58:25,1546984705,0.09,"Snow, Rain, Overcast",Cloudy skies throughout the day with rain or s...,rain,"['KMDT', 'KTHV', 'KMUI', 'AV047', 'E3000', 'MN...",obs,
1,1,2023-02-21,1676955600,50.6,35.0,43.2,50.6,33.0,40.7,36.7,...,1676979867,17:41:29,1677019289,0.05,"Rain, Partially cloudy",Partly cloudy throughout the day with a chance...,rain,"['KTTN', 'KWRI', '72409514792', '72408594732',...",obs,10.0
2,2,2022-03-31,1648699200,74.9,41.6,56.7,74.9,38.4,56.3,46.0,...,1648723971,19:31:07,1648769467,0.98,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,"['KMDT', 'D5425', 'KTHV', 'KMUI', 'AV047', '72...",obs,10.0
3,3,2023-04-22,1682136000,72.0,58.9,65.2,72.0,58.9,65.2,55.9,...,1682158789,19:49:52,1682207392,0.08,"Rain, Partially cloudy",Becoming cloudy in the afternoon with rain.,rain,"['99999913752', 'F7211', '72215899999', '72406...",obs,10.0
4,4,2023-04-22,1682136000,75.5,56.0,62.3,75.5,56.0,62.3,53.3,...,1682158764,19:53:28,1682207608,0.08,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,"['KMDT', 'KTHV', 'AV047', 'MNGP1', '7251181475...",obs,10.0


In [33]:

columns_to_drop = ['datetimeEpoch', 'sunrise', 'sunriseEpoch', 'sunset', 'sunsetEpoch', 'moonphase',
                    'description', 'stations', 'source', 'feelslikemin', 'feelslike', 'feelslikemax', 'snow', 'snowdepth', 'temp', 'severerisk', 'preciptype', 'precipcover', 'precipprob']

tornado_weather_data = tornado_weather_data.drop(columns = columns_to_drop)
non_tornado_weather_data = non_tornado_weather_data.drop(columns = columns_to_drop)
tornado_weather_data.head()

Unnamed: 0,datetime,tempmax,tempmin,dew,humidity,precip,windgust,windspeed,winddir,pressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,conditions,icon
0,2019-01-08,42.7,33.1,34.4,87.7,0.151,23.0,8.6,108.0,1013.3,94.2,5.9,627.6,53.9,1.0,"Snow, Rain, Overcast",rain
1,2023-02-21,50.6,35.0,36.7,78.6,0.171,33.9,13.9,359.3,1003.9,66.1,8.4,68.0,5.8,5.0,"Rain, Partially cloudy",rain
2,2022-03-31,74.9,41.6,46.0,68.0,0.855,40.1,24.8,169.5,1002.6,86.0,8.8,46.5,4.0,4.0,"Rain, Partially cloudy",rain
3,2023-04-22,72.0,58.9,55.9,73.0,0.455,32.2,17.2,157.4,1010.5,34.3,9.0,78.1,6.6,5.0,"Rain, Partially cloudy",rain
4,2023-04-22,75.5,56.0,53.3,74.4,1.123,49.4,18.9,141.3,1009.5,57.5,8.8,108.0,9.1,6.0,"Rain, Partially cloudy",rain


In [34]:
# Map the regions back from the sample dataframe #
tornado_weather_data['Region'] = sampled_tornado_df['Region']
non_tornado_weather_data['Region'] = sampled_non_tornado_df['Region']

# add the tornado column
tornado_weather_data['Tornado'] = 1
non_tornado_weather_data['Tornado'] = 0

# calculate temp difference because this may indicate unstable environment rather than the individual temperatures of the day as they can differ throughout the seasons #
tornado_weather_data['Temp_Difference'] = tornado_weather_data['tempmax'] - tornado_weather_data['tempmin']
non_tornado_weather_data['Temp_Difference'] = non_tornado_weather_data['tempmax'] - non_tornado_weather_data['tempmin']
tornado_weather_data = tornado_weather_data.drop(columns=['tempmin', 'tempmax'])
non_tornado_weather_data = non_tornado_weather_data.drop(columns=['tempmin', 'tempmax'])



In [35]:
tornado_weather_data.isna().sum()

datetime            0
dew                 0
humidity            0
precip              0
windgust           13
windspeed           0
winddir             0
pressure            0
cloudcover          0
visibility          0
solarradiation      0
solarenergy         0
uvindex             0
conditions          0
icon                0
Region              1
Tornado             0
Temp_Difference     0
dtype: int64

In [36]:
rows_with_na = tornado_weather_data[tornado_weather_data['Region'].isna()]
rows_with_na

Unnamed: 0,datetime,dew,humidity,precip,windgust,windspeed,winddir,pressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,conditions,icon,Region,Tornado,Temp_Difference
394,2019-12-25,40.9,79.0,0.064,23.0,14.4,53.9,1011.0,65.5,9.6,62.4,5.3,3.0,"Rain, Partially cloudy",rain,,1,9.1


In [37]:
tornado_weather_data['Region'].value_counts()

SOUTH                          100
SOUTHEAST                       73
OHIO VALLEY                     67
UPPER MIDWEST                   39
NORTHERN ROCKIES AND PLAINS     36
NORTHEAST                       30
SOUTHWEST                       29
WEST                            11
NORTHWEST                        9
Name: Region, dtype: int64

In [38]:
sampled_tornado_df['Region'].value_counts()

SOUTH                          100
SOUTHEAST                       73
OHIO VALLEY                     67
UPPER MIDWEST                   39
NORTHERN ROCKIES AND PLAINS     36
NORTHEAST                       30
SOUTHWEST                       29
WEST                            11
NORTHWEST                        9
Name: Region, dtype: int64

In [39]:
tornado_weather_data = tornado_weather_data.dropna(subset = ['Region'])
tornado_weather_data.isna().sum()

datetime            0
dew                 0
humidity            0
precip              0
windgust           13
windspeed           0
winddir             0
pressure            0
cloudcover          0
visibility          0
solarradiation      0
solarenergy         0
uvindex             0
conditions          0
icon                0
Region              0
Tornado             0
Temp_Difference     0
dtype: int64

In [40]:
### Leave other NA values for right now because this could indicate a very low windgust that is not able to be measured ###

In [41]:

# # Concat both dataframes together #
combined_weather_data = pd.concat([non_tornado_weather_data, tornado_weather_data], axis = 0)

In [42]:
weather_by_date = combined_weather_data.sort_values(by = 'datetime')
weather_by_date.head()

Unnamed: 0,datetime,dew,humidity,precip,windgust,windspeed,winddir,pressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,conditions,icon,Region,Tornado,Temp_Difference
164,2019-01-01,31.4,91.4,0.583,21.9,11.4,315.9,1024.8,99.5,5.6,15.3,1.2,1.0,"Snow, Rain, Overcast",rain,OHIO VALLEY,0,7.1
545,2019-01-03,43.2,89.2,0.172,,8.6,250.5,1016.9,65.9,8.0,32.0,2.6,1.0,"Rain, Partially cloudy",rain,SOUTHEAST,0,14.0
244,2019-01-04,50.1,85.9,1.227,34.2,18.9,207.4,1008.9,77.6,6.5,70.3,5.9,6.0,"Rain, Partially cloudy",rain,SOUTHEAST,1,19.6
845,2019-01-07,31.0,71.6,0.064,51.4,18.5,234.9,1017.4,68.7,9.0,26.8,2.3,2.0,"Rain, Partially cloudy",snow,WEST,0,8.9
0,2019-01-08,34.4,87.7,0.151,23.0,8.6,108.0,1013.3,94.2,5.9,627.6,53.9,1.0,"Snow, Rain, Overcast",rain,NORTHEAST,1,9.6


In [43]:
weather_by_date.isna().sum()

datetime             0
dew                  0
humidity             0
precip               0
windgust           130
windspeed            0
winddir              0
pressure             0
cloudcover           0
visibility           0
solarradiation       0
solarenergy          0
uvindex              0
conditions           0
icon                 0
Region               0
Tornado              0
Temp_Difference      0
dtype: int64

In [45]:
weather_by_date = weather_by_date.fillna(0)

In [49]:
weather_by_date.dtypes

datetime            object
dew                float64
humidity           float64
precip             float64
windgust           float64
windspeed          float64
winddir            float64
pressure           float64
cloudcover         float64
visibility         float64
solarradiation     float64
solarenergy        float64
uvindex            float64
conditions          object
icon                object
Region              object
Tornado              int64
Temp_Difference    float64
dtype: object

In [51]:
weather_by_date.to_csv('full_weather_data.csv', index = False, header = True)

: 