In [140]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [141]:
data2024 = pd.read_csv('StormEvents_details-ftp_v1.0_d2024_c20250317.csv')
data2023 = pd.read_csv('StormEvents_details-ftp_v1.0_d2023_c20250317.csv')
data2022 = pd.read_csv('StormEvents_details-ftp_v1.0_d2022_c20241121.csv')
data2021 = pd.read_csv('StormEvents_details-ftp_v1.0_d2021_c20240716.csv')
data2020 = pd.read_csv('StormEvents_details-ftp_v1.0_d2020_c20240620.csv')
stormdata2 = pd.concat([data2020, data2021, data2022, data2023], ignore_index=True)
stormdata3 = stormdata2.drop(columns=[
    'EPISODE_ID', 'EVENT_ID', 'BEGIN_DATE_TIME', 'END_DATE_TIME',
    'MONTH_NAME', 'STATE', 'CZ_NAME', 'WFO', 'BEGIN_LOCATION',
    'END_LOCATION', 'BEGIN_AZIMUTH', 'END_AZIMUTH', 'TOR_OTHER_WFO',
    'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_NAME', 'CZ_TIMEZONE',
    'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'SOURCE', 'MAGNITUDE_TYPE',
    'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE', 'DATA_SOURCE', 'DAMAGE_CROPS', 'TOR_OTHER_CZ_FIPS', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT'])

stormdata3.head(n=10)

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,STATE_FIPS,YEAR,EVENT_TYPE,CZ_TYPE,...,DAMAGE_PROPERTY,MAGNITUDE,TOR_LENGTH,TOR_WIDTH,BEGIN_RANGE,END_RANGE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON
0,202006,24,1620,202006,24,1620,13,2020,Thunderstorm Wind,C,...,0.00K,50.0,,,1.0,1.0,31.7,-83.89,31.7,-83.89
1,202006,20,1930,202006,20,1930,20,2020,Hail,C,...,,1.0,,,8.0,8.0,39.7571,-99.6684,39.7571,-99.6684
2,202006,3,1550,202006,3,1550,20,2020,Hail,C,...,,0.75,,,14.0,14.0,39.9137,-101.9753,39.9137,-101.9753
3,202006,19,1900,202006,19,1900,20,2020,Thunderstorm Wind,C,...,,52.0,,,2.0,2.0,39.34,-101.37,39.34,-101.37
4,202006,20,1900,202006,20,1900,20,2020,Hail,C,...,,1.25,,,1.0,1.0,39.84,-99.89,39.84,-99.89
5,202006,20,1837,202006,20,1837,20,2020,Hail,C,...,,0.75,,,6.0,6.0,39.9397,-99.8877,39.9397,-99.8877
6,202006,23,1511,202006,23,1511,8,2020,Hail,C,...,,0.75,,,6.0,6.0,38.7204,-102.6783,38.7204,-102.6783
7,202006,26,1830,202006,26,1830,8,2020,Hail,C,...,,0.75,,,3.0,3.0,39.9134,-102.2277,39.9134,-102.2277
8,202006,26,1845,202006,26,1850,8,2020,Hail,C,...,,1.75,,,3.0,3.0,39.86,-102.18,39.86,-102.18
9,202006,9,1344,202006,9,1344,20,2020,High Wind,Z,...,0.00K,59.0,,,,,,,,


In [142]:
data2024 = pd.read_csv('StormEvents_details-ftp_v1.0_d2024_c20250317.csv')
data2024 = data2024.drop(columns=[
    'EPISODE_ID', 'EVENT_ID', 'BEGIN_DATE_TIME', 'END_DATE_TIME',
    'MONTH_NAME', 'STATE', 'CZ_NAME', 'WFO', 'BEGIN_LOCATION',
    'END_LOCATION', 'BEGIN_AZIMUTH', 'END_AZIMUTH', 'TOR_OTHER_WFO',
    'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_NAME', 'CZ_TIMEZONE',
    'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'SOURCE', 'MAGNITUDE_TYPE',
    'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE', 'DATA_SOURCE', 'DAMAGE_CROPS', 'TOR_OTHER_CZ_FIPS', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT'])

In [143]:
#calculating duration hour

import pandas as pd
from datetime import datetime

def compute_duration(row):
    # --- Parse BEGIN date/time ---
    begin_year = row['BEGIN_YEARMONTH'] // 100    # e.g. 202006 -> year=2020
    begin_month = row['BEGIN_YEARMONTH'] % 100    # e.g. 202006 -> month=06
    begin_day = row['BEGIN_DAY']
    
    # Convert BEGIN_TIME (like 1620) into hours/minutes
    begin_time_str = f"{row['BEGIN_TIME']:04d}"   # ensures 4 digits, e.g. "1620"
    begin_hour = int(begin_time_str[:2])
    begin_minute = int(begin_time_str[2:])
    
    # Create a datetime object for the begin time
    begin_dt = datetime(begin_year, begin_month, begin_day, begin_hour, begin_minute)
    
    # --- Parse END date/time ---
    end_year = row['END_YEARMONTH'] // 100
    end_month = row['END_YEARMONTH'] % 100
    end_day = row['END_DAY']
    
    end_time_str = f"{row['END_TIME']:04d}"
    end_hour = int(end_time_str[:2])
    end_minute = int(end_time_str[2:])
    
    # Create a datetime object for the end time
    end_dt = datetime(end_year, end_month, end_day, end_hour, end_minute)
    
    # Return the difference (as a Timedelta)
    return end_dt - begin_dt

# Example usage:
# Assuming 'stormdata' is your DataFrame
stormdata3['duration'] = stormdata3.apply(compute_duration, axis=1)

# The new 'duration' column will be a pandas Timedelta.
# If you want it in hours (numeric), you could do:
stormdata3['DURATION_HOURS'] = stormdata3['duration'].dt.total_seconds() / 3600

# Drop the 'duration' column, keep only 'duration_hours'
stormdata = stormdata3.drop(columns=['duration', 'END_YEARMONTH', 'END_DAY', 'END_TIME'])

stormdata.head(n=10)

# Example usage:
# Assuming 'stormdata' is your DataFrame
data2024['duration'] = data2024.apply(compute_duration, axis=1)

# The new 'duration' column will be a pandas Timedelta.
# If you want it in hours (numeric), you could do:
data2024['DURATION_HOURS'] = data2024['duration'].dt.total_seconds() / 3600

# Drop the 'duration' column, keep only 'duration_hours'
data2024 = data2024.drop(columns=['duration', 'END_YEARMONTH', 'END_DAY', 'END_TIME'])

In [144]:
stormdata.rename(columns={'BEGIN_YEARMONTH': 'MONTH'}, inplace=True)
stormdata['MONTH'] = stormdata['MONTH'].astype(str).str[-2:]
stormdata.head(n=10)

Unnamed: 0,MONTH,BEGIN_DAY,BEGIN_TIME,STATE_FIPS,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_PROPERTY,MAGNITUDE,TOR_LENGTH,TOR_WIDTH,BEGIN_RANGE,END_RANGE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,DURATION_HOURS
0,6,24,1620,13,2020,Thunderstorm Wind,C,321,0.00K,50.0,,,1.0,1.0,31.7,-83.89,31.7,-83.89,0.0
1,6,20,1930,20,2020,Hail,C,137,,1.0,,,8.0,8.0,39.7571,-99.6684,39.7571,-99.6684,0.0
2,6,3,1550,20,2020,Hail,C,23,,0.75,,,14.0,14.0,39.9137,-101.9753,39.9137,-101.9753,0.0
3,6,19,1900,20,2020,Thunderstorm Wind,C,193,,52.0,,,2.0,2.0,39.34,-101.37,39.34,-101.37,0.0
4,6,20,1900,20,2020,Hail,C,137,,1.25,,,1.0,1.0,39.84,-99.89,39.84,-99.89,0.0
5,6,20,1837,20,2020,Hail,C,137,,0.75,,,6.0,6.0,39.9397,-99.8877,39.9397,-99.8877,0.0
6,6,23,1511,8,2020,Hail,C,17,,0.75,,,6.0,6.0,38.7204,-102.6783,38.7204,-102.6783,0.0
7,6,26,1830,8,2020,Hail,C,125,,0.75,,,3.0,3.0,39.9134,-102.2277,39.9134,-102.2277,0.0
8,6,26,1845,8,2020,Hail,C,125,,1.75,,,3.0,3.0,39.86,-102.18,39.86,-102.18,0.083333
9,6,9,1344,20,2020,High Wind,Z,2,0.00K,59.0,,,,,,,,,0.0


In [145]:
data2024.rename(columns={'BEGIN_YEARMONTH': 'MONTH'}, inplace=True)
data2024['MONTH'] = data2024['MONTH'].astype(str).str[-2:]
data2024.head(n=10)

Unnamed: 0,MONTH,BEGIN_DAY,BEGIN_TIME,STATE_FIPS,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_PROPERTY,MAGNITUDE,TOR_LENGTH,TOR_WIDTH,BEGIN_RANGE,END_RANGE,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,DURATION_HOURS
0,5,23,1947,40,2024,Hail,C,65,,1.5,,,4.0,4.0,34.638,-99.2167,34.638,-99.2167,0.0
1,11,16,230,41,2024,Heavy Snow,Z,509,0.00K,,,,,,,,,,59.85
2,5,19,1839,40,2024,Tornado,C,39,150.00K,,6.7,400.0,8.0,5.0,35.71,-99.001,35.737,-98.891,0.383333
3,5,23,2155,40,2024,Thunderstorm Wind,C,51,10.00K,61.0,,,2.0,2.0,34.9501,-97.9523,34.9501,-97.9523,0.0
4,5,24,1405,28,2024,Thunderstorm Wind,C,115,1.00K,52.0,,,0.0,0.0,34.18,-89.03,34.18,-89.03,0.083333
5,11,1,0,53,2024,Heavy Snow,Z,211,0.00K,,,,,,,,,,16.0
6,11,1,0,41,2024,Heavy Snow,Z,127,0.00K,,,,,,,,,,16.0
7,5,14,1510,28,2024,Hail,C,141,0.00K,1.0,,,1.0,1.0,34.51,-88.21,34.51,-88.21,0.083333
8,5,14,1352,47,2024,Hail,C,71,0.00K,0.88,,,1.0,1.0,35.2,-88.24,35.2,-88.24,0.083333
9,11,17,1100,41,2024,Heavy Snow,Z,127,0.00K,,,,,,,,,,34.0


In [146]:
#change in datatypes
columns = ['TOR_LENGTH', 'TOR_WIDTH', 
               'BEGIN_RANGE', 'END_RANGE', 
               'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON']
stormdata[columns] = stormdata[columns].fillna(0)
stormdata.head(n=10)
stormdata['MONTH'] = stormdata['MONTH'].astype('category')
stormdata['EVENT_TYPE'] = stormdata['EVENT_TYPE'].astype('category')
stormdata['CZ_TYPE'] = stormdata['CZ_TYPE'].astype('category')
stormdata.info()
#data2024
data2024[columns] = data2024[columns].fillna(0)
data2024.head(n=10)
data2024['MONTH'] = data2024['MONTH'].astype('category')
data2024['EVENT_TYPE'] = data2024['EVENT_TYPE'].astype('category')
data2024['CZ_TYPE'] = data2024['CZ_TYPE'].astype('category')
data2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268231 entries, 0 to 268230
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   MONTH            268231 non-null  category
 1   BEGIN_DAY        268231 non-null  int64   
 2   BEGIN_TIME       268231 non-null  int64   
 3   STATE_FIPS       268231 non-null  int64   
 4   YEAR             268231 non-null  int64   
 5   EVENT_TYPE       268231 non-null  category
 6   CZ_TYPE          268231 non-null  category
 7   CZ_FIPS          268231 non-null  int64   
 8   DAMAGE_PROPERTY  208593 non-null  object  
 9   MAGNITUDE        138986 non-null  float64 
 10  TOR_LENGTH       268231 non-null  float64 
 11  TOR_WIDTH        268231 non-null  float64 
 12  BEGIN_RANGE      268231 non-null  float64 
 13  END_RANGE        268231 non-null  float64 
 14  BEGIN_LAT        268231 non-null  float64 
 15  BEGIN_LON        268231 non-null  float64 
 16  END_LAT          268

In [151]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
    --------------------------------------- 3.4/150.0 MB 16.8 MB/s eta 0:00:09
   - -------------------------------------- 7.3/150.0 MB 18.9 MB/s eta 0:00:08
   -- ------------------------------------- 9.4/150.0 MB 15.9 MB/s eta 0:00:09
   --- ------------------------------------ 11.8/150.0 MB 13.9 MB/s eta 0:00:10
   --- ------------------------------------ 13.6/150.0 MB 13.0 MB/s eta 0:00:11
   ---- ----------------------------------- 15.5/150.0 MB 12.6 MB/s eta 0:00:11
   ---- ----------------------------------- 17.6/150.0 MB 12.0 MB/s eta 0:00:11
   ----- ---------------------------------- 18.9/150.0 MB 11.3 MB/s eta 0:00:12
   ----- ---------------------------------- 21.2/150.0 MB 11.3 MB/s eta 0:00:12
   ------ --------------------------------- 22.8/150.0 MB 11.0 M

In [148]:
#cleaned data

def convert_damage(value):
    if pd.isna(value):
        return 0.0
    value = value.strip().upper()
    if value.endswith('K'):
        return float(value[:-1]) * 1_000
    elif value.endswith('M'):
        return float(value[:-1]) * 1_000_000
    elif value.endswith('B'):  # Just in case there are billions
        return float(value[:-1]) * 1_000_000_000
    else:
        return float(value)  # fallback

# Apply to the DAMAGE_PROPERTY column
stormdata['DAMAGE_PROPERTY'] = stormdata['DAMAGE_PROPERTY'].apply(convert_damage).astype(float)
stormdata.info()
data2024['DAMAGE_PROPERTY'] = data2024['DAMAGE_PROPERTY'].apply(convert_damage).astype(float)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268231 entries, 0 to 268230
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   MONTH            268231 non-null  category
 1   BEGIN_DAY        268231 non-null  int64   
 2   BEGIN_TIME       268231 non-null  int64   
 3   STATE_FIPS       268231 non-null  int64   
 4   YEAR             268231 non-null  int64   
 5   EVENT_TYPE       268231 non-null  category
 6   CZ_TYPE          268231 non-null  category
 7   CZ_FIPS          268231 non-null  int64   
 8   DAMAGE_PROPERTY  268231 non-null  float64 
 9   MAGNITUDE        138986 non-null  float64 
 10  TOR_LENGTH       268231 non-null  float64 
 11  TOR_WIDTH        268231 non-null  float64 
 12  BEGIN_RANGE      268231 non-null  float64 
 13  END_RANGE        268231 non-null  float64 
 14  BEGIN_LAT        268231 non-null  float64 
 15  BEGIN_LON        268231 non-null  float64 
 16  END_LAT          268

In [149]:
#encoding categorical columns and normalizing numerical columns
categorical_columns = ['MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE']
stormdata = pd.get_dummies(stormdata, columns=['MONTH', 'EVENT_TYPE', 'CZ_TYPE'], drop_first=True) #one hot encoding categorical variables
data2024 = pd.get_dummies(data2024, columns=['MONTH', 'EVENT_TYPE', 'CZ_TYPE'], drop_first=True) #one hot encoding categorical variables
numerical_cols = stormdata.select_dtypes(include=['int64', 'float64']).columns.difference(['DAMAGE_PROPERTY'])
numerical_cols2 = data2024.select_dtypes(include=['int64', 'float64']).columns.difference(['DAMAGE_PROPERTY'])

# Step 2: Normalize numerical columns using (x - mean) / std
stormdata_normalized = stormdata.copy()
stormdata_normalized[numerical_cols] = (stormdata[numerical_cols] - stormdata[numerical_cols].mean()) / stormdata[numerical_cols].std()
stormdata_normalized.head(n=100)

data2024_normalized = data2024.copy()
data2024_normalized[numerical_cols] = (data2024[numerical_cols] - data2024[numerical_cols].mean()) / data2024[numerical_cols].std()

In [171]:
#Yall probably don't need this block so you can delete this portion maybe except for denzel, can just start from here
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X_train = stormdata_normalized.drop(columns=['DAMAGE_PROPERTY'], errors='ignore')
y_train = stormdata_normalized['DAMAGE_PROPERTY']
y_train_log = np.log1p(y_train)



model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=4, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train_log)


#comparing with train data
y_pred_train = model.predict(X_train)
y_pred_train = np.expm1(y_pred_train)
rmse = mean_squared_error(y_train, y_pred_train, squared=False)
mae = mean_absolute_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)

print(f"Train RMSE: {rmse:.2f}")
print(f"Train MAE: {mae:.2f}")
print(f"Train R²: {r2:.4f}")

#comparing with test data
X_test = data2024_normalized.drop(columns=['DAMAGE_PROPERTY'], errors='ignore')
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
y_test = data2024_normalized['DAMAGE_PROPERTY']
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)

# Step 4: Evaluate
rmse_test = mean_squared_error(y_test, y_pred, squared=False)
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print(f"\n🧪 Test Set Performance (2024):")
print(f"RMSE: {rmse_test:.2f}")
print(f"MAE: {mae_test:.2f}")
print(f"R²: {r2_test:.4f}")



Train RMSE: 24979998.30
Train MAE: 282777.35
Train R²: 0.0569

🧪 Test Set Performance (2024):
RMSE: 9684425.98
MAE: 228894.30
R²: -0.0086


