In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from tqdm import tqdm
import warnings
import os
warnings.filterwarnings('ignore')

In [34]:
# Move to the project folder so the relative data/output paths used below resolve.
# The location can be overridden per machine through the WILDFIRE_PROJECT_DIR
# environment variable; the original checkout path is kept as the default.
project_dir = os.environ.get('WILDFIRE_PROJECT_DIR',
                             r'C:\Users\isabe\CSE2600-Group-Project')
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    # Stay where we are instead of raising, so the notebook can still run when it
    # is launched from a directory that already contains the data.
    print(f"Project directory not found: {project_dir} (staying in current directory)")
print(os.getcwd())

C:\Users\isabe\CSE2600-Group-Project


In [None]:
# Read the preprocessed wildfire table (path is relative to the working directory).
df = pd.read_csv('wildfire_data_preprocessed.csv')
n_rows, n_cols = df.shape
print(f"Data loaded: {n_rows:,} rows × {n_cols} columns")

In [None]:
# Headline counts for the binary wildfire label.
wildfire_flag = df['Wildfire_binary']
summary_lines = [
    f"Total records: {len(df):,}",
    f"Wildfire days: {wildfire_flag.sum():,}",
    f"Non-wildfire days: {(wildfire_flag == 0).sum():,}",
    f"Wildfire rate: {wildfire_flag.mean():.3%}",
]
print("\n".join(summary_lines))

In [None]:
# Preview the first five rows of the loaded table.
display(df.head(5))

In [None]:
# Days on which a fire occurred: keep only rows whose wildfire flag equals 1.
is_fire_day = df['Wildfire_binary'].eq(1)
fire_days = df.loc[is_fire_day]
print(f"Fire days: {fire_days.shape[0]:,}")

In [None]:
# Cap the clustering input at 50,000 fire days (all of them when fewer exist).
max_sample_rows = 50000
n_fire_days = len(fire_days)
sample_size = n_fire_days if n_fire_days < max_sample_rows else max_sample_rows
# Fixed seed so the same rows are drawn on every run.
fire_sample = fire_days.sample(n=sample_size, random_state=42)
print(f"Using sample of {sample_size:,} fire days for clustering")

In [None]:
# Preparing features for clustering.
# Weather / fuel variables that are clustered together with the coordinates.
normalization_cols = ['pr', 'rmax', 'rmin', 'sph', 'srad', 'tmmn', 'tmmx', 'vs',
                     'bi', 'fm100', 'fm1000', 'erc', 'etr', 'pet', 'vpd']

cluster_features = ['latitude', 'longitude'] + normalization_cols

# K-Means (Euclidean distance) and PCA (variance) are scale sensitive, so every
# feature is standardized to zero mean / unit variance on the sample before use;
# otherwise the columns with the widest numeric range dominate the result.
# NOTE(review): latitude/longitude appear to be raw degrees (they are printed
# with a degree sign further down) while the weather columns are labelled as
# z-scores in the heatmap -- confirm against the preprocessing step.
# The result stays a DataFrame with the original columns and row index so the
# downstream cells can keep using it unchanged.
X_cluster = pd.DataFrame(
    StandardScaler().fit_transform(fire_sample[cluster_features]),
    columns=cluster_features,
    index=fire_sample.index,
)

print(f"Clustering matrix shape: {X_cluster.shape}")
print(f"Features used: {len(cluster_features)}")
print("Feature list:")
for i, feature in enumerate(cluster_features, 1):
    print(f"  {i:2d}. {feature}")

In [None]:
# K-Means elbow search: fit one model per candidate k and record its inertia.
k_range = range(2, 8)
wcss = []  # within-cluster sum of squares, one entry per candidate k

for k in tqdm(k_range):
    # fit() returns the estimator itself, so the fitted model is bound directly.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=5, max_iter=100).fit(X_cluster)
    wcss.append(kmeans.inertia_)

# Elbow curve: WCSS against the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(k_range, wcss, 'bo-', linewidth=2, markersize=8, label='WCSS')
elbow_ax.set_xlabel('# of clusters')
elbow_ax.set_ylabel('WCSS')
elbow_ax.set_title('Optimal KMeans Clustering')
elbow_ax.grid(True, alpha=0.3)
elbow_ax.legend()
plt.show()

# Same numbers as text, for reading exact values off the curve.
print("WCSS values:")
for k, w in zip(k_range, wcss):
    print(f"  K={k}: {w:,.0f}")

In [None]:
# Final K-Means fit with the chosen number of clusters.
optimal_k = 6  # Adjust based on elbow plot above

kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_cluster)

# Copy the sample before attaching the label column.
fire_sample = fire_sample.copy()
fire_sample['fire_cluster'] = cluster_labels

# Number and share of sampled fires assigned to each cluster, in label order.
print("Cluster distribution:")
cluster_counts = fire_sample['fire_cluster'].value_counts().sort_index()
n_sampled = len(fire_sample)
for cluster, count in cluster_counts.items():
    percentage = (count / n_sampled) * 100
    print(f"  Cluster {cluster}: {count:,} fires ({percentage:.1f}%)")

# Canvas for the 2x2 panel grid that the following cells draw into.
# NOTE(review): with the inline notebook backend a figure is rendered and closed
# at the end of its cell, so this one may show up empty and the later
# plt.subplot(2, 2, n) calls may each start a new figure -- confirm the backend.
fig = plt.figure(figsize=(20, 16))

In [None]:
# Geographic distribution of fire clusters (panel 1 of the 2x2 grid).
ax_geo = plt.subplot(2, 2, 1)
scatter = ax_geo.scatter(fire_sample['longitude'], fire_sample['latitude'],
                         c=fire_sample['fire_cluster'], cmap='tab10', alpha=0.7, s=15)
ax_geo.set_xlabel('Longitude')
ax_geo.set_ylabel('Latitude')
# The cluster count comes from optimal_k so the title cannot drift from the
# fitted model when a different k is chosen.
ax_geo.set_title(f'Geographic Distribution of {optimal_k} Fire Clusters\n(Regional Fire Patterns Across US)')
plt.colorbar(scatter, ax=ax_geo, label='Cluster ID')
ax_geo.grid(True, alpha=0.3)

In [None]:
# PCA visualization (panel 2 of the 2x2 grid): project the clustering matrix
# onto its first two principal components and colour points by cluster label.
ax_pca = plt.subplot(2, 2, 2)
# Seeded like the K-Means fits so a randomized SVD solver, if scikit-learn
# selects one for this matrix size, yields the same projection on every run.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_cluster)
pca_points = ax_pca.scatter(X_pca[:, 0], X_pca[:, 1], c=fire_sample['fire_cluster'],
                            cmap='tab10', alpha=0.7, s=15)
pc1_var, pc2_var = pca.explained_variance_ratio_
ax_pca.set_xlabel(f'PC 1 ({pc1_var:.1%} var.)')
ax_pca.set_ylabel(f'PC 2 ({pc2_var:.1%} var.)')
# Cluster count taken from optimal_k instead of a hard-coded 6.
ax_pca.set_title(f'PCA Visualization of {optimal_k} Fire Clusters')
# Pass the scatter explicitly rather than relying on pyplot's "current image".
plt.colorbar(pca_points, ax=ax_pca, label='K Clusters')
ax_pca.grid(True, alpha=0.3)

In [None]:
# Loadings of every clustering feature on the two fitted principal components
# (rows = features, columns = PC1 / PC2).
pca_components_df = pd.DataFrame(
    pca.components_,
    index=['PC1', 'PC2'],
    columns=cluster_features,
).T

print("Feature contributions to principal components:")
# Ten features with the largest absolute PC1 loading, strongest first.
top_pc1_loadings = pca_components_df.sort_values(
    'PC1', key=lambda loadings: loadings.abs(), ascending=False
).head(10)
display(top_pc1_loadings)

In [None]:
# Cluster sizes (panel 3 of the 2x2 grid): one bar per cluster.
ax_sizes = plt.subplot(2, 2, 3)
# Colour count, tick positions and title all follow optimal_k, so the panel
# stays correct when the number of clusters is changed.
colors = plt.cm.Set3(np.linspace(0, 1, optimal_k))
bars = ax_sizes.bar(cluster_counts.index, cluster_counts.values, color=colors, alpha=0.8)
ax_sizes.set_xlabel('K Clusters')
ax_sizes.set_ylabel('Number of Fires')
ax_sizes.set_title(f'Fire Count by Cluster (K={optimal_k})')
ax_sizes.set_xticks(range(optimal_k))
# Print each count just above its bar (fixed offset of 100 in data units).
for bar, count in zip(bars, cluster_counts.values):
    ax_sizes.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 100,
                  f'{count:,}', ha='center', va='bottom', fontweight='bold')

In [None]:
# Meteorological profiles heatmap (panel 4 of the 2x2 grid).
ax_profile = plt.subplot(2, 2, 4)
# Per-cluster mean of every weather variable (rows = clusters, cols = variables).
cluster_profile = fire_sample.groupby('fire_cluster')[normalization_cols].mean()
# Transposed so variables run down the rows; diverging colours centred on 0.
sns.heatmap(cluster_profile.T, annot=True, cmap='RdBu_r', center=0, fmt='.2f',
            cbar_kws={'label': 'Z-score Normalized Value'}, ax=ax_profile)
# Cluster count taken from optimal_k instead of a hard-coded 6.
ax_profile.set_title(f'Meteorological Profiles by Fire Cluster\n({optimal_k} Regional Fire Weather Patterns)')

plt.tight_layout()
plt.show()

In [None]:
#Analyzing the 6 regional fire patterns

In [None]:
# Geographic view: mean latitude/longitude of the fires in each cluster.
cluster_centers_geo = fire_sample.groupby('fire_cluster')[['latitude', 'longitude']].mean()
print("\nApproximate Geographic Centers:")
# Each tuple is (cluster label, mean latitude, mean longitude).
for cluster, lat, lon in cluster_centers_geo.itertuples():
    print(f"   Cluster {cluster}: Lat {lat:.2f}°, Lon {lon:.2f}°")

In [None]:
# Top differentiating weather variables: rank each variable by the mean
# absolute value of its per-cluster averages, largest first.
profile_by_variable = cluster_profile.T
variable_importance = profile_by_variable.abs().mean(axis=1).sort_values(ascending=False)
print("\nTop 5 Most Important Weather Factors:")
for i, (var, importance) in enumerate(variable_importance.head(5).items(), start=1):
    print(f"   {i}. {var.upper():<8} (impact score: {importance:.3f})")

In [None]:
# Write one CSV per cluster into a dedicated output folder.
folder_name = 'wildfire_clusters'
os.makedirs(folder_name, exist_ok=True)

total_fires = len(fire_sample)
for cluster_num in range(optimal_k):
    # Rows of the sample that were assigned to this cluster.
    in_cluster = fire_sample['fire_cluster'] == cluster_num
    cluster_data = fire_sample.loc[in_cluster]

    # The file name carries the cluster's size and its share of the sample.
    fire_count = len(cluster_data)
    percentage = (fire_count / total_fires) * 100
    filename = f'{folder_name}/cluster_{cluster_num}_{fire_count}fires_{percentage:.1f}percent.csv'

    cluster_data.to_csv(filename, index=False)

In [None]:
# Pairwise correlation between PET and selected features, on the full dataset.
corr_columns = ['pet', 'srad', 'rmin', 'fm100']
correlations = df.loc[:, corr_columns].corr()
print(correlations)