In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import os
from dotenv import load_dotenv
warnings.filterwarnings('ignore')

In [2]:
#loading data
import kagglehub

# Download latest version
path = kagglehub.dataset_download("firecastrl/us-wildfire-dataset")

In [3]:
load_dotenv(override=True)
data_path = os.getenv("DATA_PATH")
df = pd.read_csv(data_path)

In [4]:
#shows if object or numeric
df.head()      # Show first 5 rows
df.info()      # Show data types and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9509925 entries, 0 to 9509924
Data columns (total 19 columns):
 #   Column     Dtype  
---  ------     -----  
 0   latitude   float64
 1   longitude  float64
 2   datetime   object 
 3   Wildfire   object 
 4   pr         float64
 5   rmax       float64
 6   rmin       float64
 7   sph        float64
 8   srad       float64
 9   tmmn       float64
 10  tmmx       float64
 11  vs         float64
 12  bi         float64
 13  fm100      float64
 14  fm1000     float64
 15  erc        float64
 16  etr        float64
 17  pet        float64
 18  vpd        float64
dtypes: float64(17), object(2)
memory usage: 1.3+ GB


In [5]:
print(df['Wildfire'].value_counts())

Wildfire
No     9007860
Yes     502065
Name: count, dtype: int64


In [6]:
print(df.describe())

           latitude     longitude            pr          rmax          rmin  \
count  9.509925e+06  9.509925e+06  9.509925e+06  9.509925e+06  9.509925e+06   
mean   3.914362e+01 -1.064347e+02  9.036891e+01  1.645754e+02  1.220707e+02   
std    5.267332e+00  1.456924e+01  1.701836e+03  1.698086e+03  1.700272e+03   
min    2.526027e+01 -1.244370e+02  0.000000e+00  5.000000e+00  1.000000e+00   
25%    3.468867e+01 -1.183439e+02  0.000000e+00  6.180000e+01  1.930000e+01   
50%    3.853161e+01 -1.109258e+02  0.000000e+00  8.000000e+01  3.080000e+01   
75%    4.367004e+01 -9.541990e+01  0.000000e+00  9.520000e+01  4.550000e+01   
max    4.899873e+01 -6.701250e+01  3.276700e+04  3.276700e+04  3.276700e+04   

                sph          srad          tmmn          tmmx            vs  \
count  9.509925e+06  9.509925e+06  9.509925e+06  9.509925e+06  9.509925e+06   
mean   8.864304e+01  3.091587e+02  3.682137e+02  3.821415e+02  9.239142e+01   
std    1.701914e+03  1.692717e+03  1.687376e+03  1.

In [7]:
print(df.head(10))

    latitude  longitude    datetime Wildfire   pr   rmax  rmin      sph  \
0  48.128431 -97.276685  2018-08-15       No  0.0   78.6  14.9  0.00582   
1  48.128431 -97.276685  2018-08-16       No  0.0   80.4  13.9  0.00676   
2  48.128431 -97.276685  2018-08-17       No  0.0   70.9  20.4  0.00672   
3  48.128431 -97.276685  2018-08-18       No  5.0   65.2  19.4  0.00756   
4  48.128431 -97.276685  2018-08-19       No  0.0  100.0  42.2  0.00895   
5  48.128431 -97.276685  2018-08-20       No  0.0   72.1  22.2  0.00588   
6  48.128431 -97.276685  2018-08-21       No  0.0   70.3  17.4  0.00539   
7  48.128431 -97.276685  2018-08-22       No  0.0   56.2  11.9  0.00526   
8  48.128431 -97.276685  2018-08-23       No  0.0   89.3  34.8  0.00959   
9  48.128431 -97.276685  2018-08-24       No  0.0  100.0  47.4  0.01186   

    srad   tmmn   tmmx   vs    bi  fm100  fm1000   erc   etr  pet   vpd  
0  272.6  282.0  301.6  3.0  40.0   10.2    12.2  54.0   7.5  5.5  1.59  
1  264.0  283.9  304.9  3.

In [8]:
#looking for missing values 
print(df.isnull().sum())

latitude     0
longitude    0
datetime     0
Wildfire     0
pr           0
rmax         0
rmin         0
sph          0
srad         0
tmmn         0
tmmx         0
vs           0
bi           0
fm100        0
fm1000       0
erc          0
etr          0
pet          0
vpd          0
dtype: int64


In [9]:
# Convert Wildfire to binary
df['Wildfire_binary'] = (df['Wildfire'] == 'Yes').astype('int8')

print("Wildfire distribution:")
print(df['Wildfire_binary'].value_counts())
print(f"Wildfire rate: {df['Wildfire_binary'].mean():.3f}")

Wildfire distribution:
Wildfire_binary
0    9007860
1     502065
Name: count, dtype: int64
Wildfire rate: 0.053


In [10]:
#normalization with z score
normalization_cols = ['pr', 'rmax', 'rmin', 'sph', 'srad', 'tmmn', 'tmmx', 'vs', 
                     'bi', 'fm100', 'fm1000', 'erc', 'etr', 'pet', 'vpd']

scaler = StandardScaler()
df[normalization_cols] = scaler.fit_transform(df[normalization_cols])

In [11]:
df.head(10)

Unnamed: 0,latitude,longitude,datetime,Wildfire,pr,rmax,rmin,sph,srad,tmmn,tmmx,vs,bi,fm100,fm1000,erc,etr,pet,vpd,Wildfire_binary
0,48.128431,-97.276685,2018-08-15,No,-0.053101,-0.050631,-0.063031,-0.052081,-0.021598,-0.051093,-0.047752,-0.05253,-0.048324,-0.053601,-0.053433,-0.045941,-0.050984,-0.051275,-0.051811,0
1,48.128431,-97.276685,2018-08-16,No,-0.053101,-0.049571,-0.06362,-0.05208,-0.026678,-0.049967,-0.045796,-0.05253,-0.048324,-0.053895,-0.053551,-0.044765,-0.050573,-0.05104,-0.051612,0
2,48.128431,-97.276685,2018-08-17,No,-0.053101,-0.055165,-0.059797,-0.05208,-0.025733,-0.048841,-0.048286,-0.052471,-0.048324,-0.054189,-0.053609,-0.044765,-0.051161,-0.051393,-0.051858,0
3,48.128431,-97.276685,2018-08-18,No,-0.050163,-0.058522,-0.060385,-0.05208,-0.028214,-0.046945,-0.046744,-0.051296,-0.071849,-0.053777,-0.053551,-0.054178,-0.049515,-0.050453,-0.051659,0
4,48.128431,-97.276685,2018-08-19,No,-0.053101,-0.038028,-0.046975,-0.052079,-0.084101,-0.050027,-0.050776,-0.051531,-0.047736,-0.052661,-0.053492,-0.050059,-0.052688,-0.052509,-0.052358,0
5,48.128431,-97.276685,2018-08-20,No,-0.053101,-0.054459,-0.058738,-0.052081,-0.026206,-0.050204,-0.050183,-0.051942,-0.045971,-0.053307,-0.053551,-0.04653,-0.051161,-0.051452,-0.052029,0
6,48.128431,-97.276685,2018-08-21,No,-0.053101,-0.055519,-0.061561,-0.052081,-0.029159,-0.050738,-0.049471,-0.052883,-0.050088,-0.053836,-0.053668,-0.044765,-0.051748,-0.051804,-0.051935,0
7,48.128431,-97.276685,2018-08-22,No,-0.053101,-0.063822,-0.064796,-0.052081,-0.028391,-0.048841,-0.047456,-0.05159,-0.041855,-0.054483,-0.053786,-0.043,-0.049574,-0.05057,-0.051629,0
8,48.128431,-97.276685,2018-08-23,No,-0.053101,-0.04433,-0.051327,-0.052079,-0.035244,-0.047775,-0.048582,-0.051649,-0.044795,-0.053954,-0.053786,-0.045941,-0.051396,-0.051569,-0.052105,0
9,48.128431,-97.276685,2018-08-24,No,-0.053101,-0.038028,-0.043917,-0.052077,-0.065846,-0.048308,-0.048938,-0.052354,-0.050676,-0.052719,-0.053668,-0.049471,-0.052865,-0.052509,-0.052381,0


In [12]:
#makes a file
df.to_csv('wildfire_data_preprocessed.csv', index=False)