In [None]:
!pip install lightgbm

In [None]:
!pip install imblearn

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN
import joblib

# 1. Load Data from Google Drive
print(" Step 1: Loading Data ")
# Mount Google Drive (only available inside Colab) and read the wildfire CSV
# from it into `df`, which every later step depends on.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    # Download the CSV from the Drive link given in DATA.txt into your own
    # Google Drive, then replace the path below with the file's location.
    file_path = '/content/drive/MyDrive/fires_cleaned_final.csv'
    df = pd.read_csv(file_path, low_memory=False)
    print(" CSV Data loaded successfully from Google Drive.")
except Exception as e:
    print(f" An error occurred while loading the data from Google Drive: {e}")
    # Abort with a non-zero status and keep the original error as the cause.
    # The bare `exit()` helper is meant for interactive shells: it reports
    # success (status 0) and, in a notebook kernel, may not stop the rest of
    # the cell, which would then fail later on the undefined `df`.
    raise SystemExit(1) from e


# 2. Clean and Filter Data
print("\n Cleaning and Filtering Data ")
# Cause labels considered too unspecific to train on; rows carrying them
# are excluded from the modelling data.
vague_causes = ['Missing/Undefined', 'Miscellaneous']
initial_rows = len(df)
has_vague_cause = df['STAT_CAUSE_DESCR'].isin(vague_causes)
df_filtered = df.loc[~has_vague_cause]
rows_removed = initial_rows - len(df_filtered)
print(f"Removed {rows_removed} rows with vague causes.")
remaining_classes = df_filtered['STAT_CAUSE_DESCR'].unique()
print(f"Remaining classes for training: {remaining_classes}")


# 3. Advanced Feature Engineering
print("\n Advanced Feature Engineering ")
features = ['FIRE_YEAR', 'DISCOVERY_DOY', 'FIRE_SIZE', 'LATITUDE', 'LONGITUDE',
            'OWNER_CODE', 'STATE', 'NWCG_REPORTING_AGENCY']
target = 'STAT_CAUSE_DESCR'
# Work on an independent copy restricted to the model columns, and drop any
# row that is missing a feature or the target.
df_model = df_filtered[features + [target]].copy()
df_model.dropna(inplace=True)

# Create cyclical time features: project the day of year onto a circle so the
# end of one year sits next to the start of the next.
df_model['doy_sin'] = np.sin(2 * np.pi * df_model['DISCOVERY_DOY']/365.0)
df_model['doy_cos'] = np.cos(2 * np.pi * df_model['DISCOVERY_DOY']/365.0)

# Create interaction feature
df_model['lat_lon_interaction'] = df_model['LATITUDE'] * df_model['LONGITUDE']

# Prepare final feature set. The raw day of year is dropped because the
# sin/cos pair above replaces it.
X = df_model.drop([target, 'DISCOVERY_DOY'], axis=1)
y = df_model[target]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert categorical columns to integer codes for SMOTEENN
categorical_features = ['OWNER_CODE', 'STATE', 'NWCG_REPORTING_AGENCY']
category_maps = {}
for col in categorical_features:
    X[col] = X[col].astype('category')
    # Record original value -> integer code. A value's code is its position
    # in `cat.categories`, so enumerate the categories themselves. Zipping
    # them with `cat.codes` would pair each category with the code of an
    # unrelated *row* and save a wrong mapping.
    category_maps[col] = {
        category: code
        for code, category in enumerate(X[col].cat.categories)
    }
    X[col] = X[col].cat.codes
print(" Advanced feature engineering complete. Categorical features are now integer encoded.")

# 4. Split Data
print("\n Splitting Data ")
# Hold out 20% of the rows for testing. Stratifying on the encoded target
# keeps the class proportions the same in both parts, and the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
print(" Data splitting complete.")

# 5. Apply SMOTEENN
print("\n Applying SMOTEENN ")
# Resample the training split only; the held-out test split is left as is.
# NOTE(review): OWNER_CODE, STATE and NWCG_REPORTING_AGENCY are integer codes
# for nominal categories at this point. SMOTE creates synthetic rows by
# interpolating feature values, so synthetic codes may not correspond to a
# real category of either neighbour. Consider a categorical-aware sampler
# (e.g. SMOTENC) and confirm the effect on model quality.
sme = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = sme.fit_resample(X_train, y_train)
resampled_size = len(X_train_resampled)
print(f" SMOTEENN complete. New training set size: {resampled_size}")

# 6. Train a More Aggressive LightGBM Model
print("\n Training High-Performance Model  ")

lgbm_tuned = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(label_encoder.classes_),
    n_jobs=-1,
    # Many small steps; early stopping below picks the actual tree count.
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=61,
    max_depth=15,
    reg_alpha=0.1,
    reg_lambda=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    # Row subsampling only takes effect when the bagging frequency is
    # non-zero; with the default of 0, `subsample=0.8` is silently ignored.
    subsample_freq=1,
    random_state=42,
)
# Tell LightGBM which integer columns should be treated as categories.
# NOTE(review): early stopping monitors the same test split that is used for
# the final classification report, so the reported scores are likely
# optimistic. Consider a separate validation split carved out before
# resampling.
lgbm_tuned.fit(
    X_train_resampled,
    y_train_resampled,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(150, verbose=False)],
    categorical_feature=categorical_features,
)
print(" Model training complete.")

# 7. Evaluate and Save Artifacts
print("\n Evaluating and Saving Model ")
y_pred = lgbm_tuned.predict(X_test)
print("\nFinal Classification Report (Focused Model):")
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report)

# Persist the model and everything needed to rebuild its inputs later:
# the target encoder, the training column order and the category maps.
output_path = '/content/drive/MyDrive/'
for artifact, file_name in (
    (lgbm_tuned, 'wildfire_cause_model_focused.joblib'),
    (label_encoder, 'label_encoder_focused.joblib'),
    (X.columns.tolist(), 'training_columns_focused.joblib'),
    (category_maps, 'category_maps_focused.joblib'),
):
    joblib.dump(artifact, output_path + file_name)

# Fallback inputs, with location data included among the defaults.
# NOTE(review): the categorical columns of X hold integer codes by now, so
# the three modes below are codes rather than the original labels. Confirm
# that is what the consumer of this file expects.
default_values = {
    'FIRE_YEAR': X['FIRE_YEAR'].mode().iloc[0],
    'FIRE_SIZE': X['FIRE_SIZE'].median(),
    'DISCOVERY_DOY': 182,
    'LATITUDE': 39.8283,    # Center of the US
    'LONGITUDE': -98.5795,  # Center of the US
    'OWNER_CODE': X['OWNER_CODE'].mode().iloc[0],
    'STATE': X['STATE'].mode().iloc[0],
    'NWCG_REPORTING_AGENCY': X['NWCG_REPORTING_AGENCY'].mode().iloc[0],
}
joblib.dump(default_values, output_path + 'default_values_focused.joblib')
print(f"\n Focused model and artifacts have been saved successfully to your Google Drive at: {output_path}")