# United States Extreme Climate Events Analysis

Predictive analysis of extreme weather event trends in the US, using NOAA data and machine learning.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Install a catch-all "ignore" filter so no warnings are shown by later cells.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by every figure in the notebook:
# seaborn's "whitegrid" style and a 12 x 6 default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation message so the reader can see the setup cell ran to completion.
print("✅ Libraries loaded successfully")

✅ Libraries loaded successfully


In [6]:
# Cell 2: Load consolidated dataset
# The path is relative to the notebook's working directory.
# low_memory=False asks pandas not to parse the file in internal chunks
# (see the read_csv documentation for its effect on dtype inference).
df = pd.read_csv(
    '../data/processed/final_dataset.csv',
    low_memory=False,
)

# Report the frame's dimensions and its row count (with thousands separators).
print(
    f"Dataset shape: {df.shape}",
    f"Total events: {len(df):,}",
    sep="\n",
)

# Preview the first five rows as the cell's displayed output.
df.head(5)

Dataset shape: (1520485, 52)
Total events: 1,520,485


Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE,TOTAL_DEATHS
0,200012,31,600,200012,31,900,1104812,5165377,FLORIDA,12.0,...,,,,,,,Temperatures fell into the mid-20s over Glades...,,PDC,0
1,200012,31,600,200012,31,900,1104812,5165378,FLORIDA,12.0,...,,,,,,,Temperatures fell into the mid-20s over Glades...,,PDC,0
2,200012,31,700,200012,31,800,1104812,5165379,FLORIDA,12.0,...,,,,,,,Temperatures fell into the mid-20s over Glades...,,PDC,0
3,200012,13,2200,200012,14,400,1105342,5165449,WEST VIRGINIA,54.0,...,,,,,,,"A mix of sleet, freezing rain and snow spread ...",,PDC,0
4,200008,3,1410,200008,3,1410,1101140,5172568,MISSISSIPPI,28.0,...,,FORKVILLE,32.45,-89.65,32.45,-89.65,,Several trees were blown down along and onto h...,PDC,0


In [7]:
# Cell 3: Quick data overview
# Prints a structural summary of `df`: the index range, the number of columns,
# and each column's non-null count and dtype. Called purely for its printed
# report; nothing is assigned or modified here.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1520485 entries, 0 to 1520484
Data columns (total 52 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   BEGIN_YEARMONTH     1520485 non-null  int64  
 1   BEGIN_DAY           1520485 non-null  int64  
 2   BEGIN_TIME          1520485 non-null  int64  
 3   END_YEARMONTH       1520485 non-null  int64  
 4   END_DAY             1520485 non-null  int64  
 5   END_TIME            1520485 non-null  int64  
 6   EPISODE_ID          1520485 non-null  int64  
 7   EVENT_ID            1520485 non-null  int64  
 8   STATE               1520484 non-null  str    
 9   STATE_FIPS          1520484 non-null  float64
 10  YEAR                1520485 non-null  int64  
 11  MONTH_NAME          1520485 non-null  str    
 12  EVENT_TYPE          1520485 non-null  str    
 13  CZ_TYPE             1520485 non-null  str    
 14  CZ_FIPS             1520485 non-null  int64  
 15  CZ_NAME             152048

In [10]:
# Cell 4: Check for missing values
# Number of null entries in each column of `df`.
missing = df.isna().sum()

# The same counts expressed as a percentage of the total row count.
missing_pct = missing.div(len(df)).mul(100)

# One row per column of `df`, ordered from highest to lowest missing percentage.
missing_df = pd.DataFrame(
    {'Missing': missing, 'Percentage': missing_pct}
).sort_values(by='Percentage', ascending=False)

# Show only columns that have at least one null, capped at the top 15.
print(missing_df.loc[missing_df['Percentage'].gt(0)].head(15))

                    Missing  Percentage
CATEGORY            1519925   99.963170
TOR_OTHER_CZ_FIPS   1517028   99.772638
TOR_OTHER_CZ_STATE  1517028   99.772638
TOR_OTHER_WFO       1517028   99.772638
TOR_OTHER_CZ_NAME   1517028   99.772638
TOR_F_SCALE         1484774   97.651342
TOR_WIDTH           1484774   97.651342
TOR_LENGTH          1484774   97.651342
FLOOD_CAUSE         1402307   92.227612
MAGNITUDE_TYPE      1006236   66.178621
MAGNITUDE            718686   47.266892
BEGIN_RANGE          714591   46.997570
BEGIN_AZIMUTH        714591   46.997570
END_RANGE            714525   46.993229
END_AZIMUTH          714525   46.993229
