Data Preprocessing


In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE


df = pd.read_csv("/content/coastal_historical_india_expanded.csv")


features = [
    "wind_speed_kmh","pressure_hpa","humidity_pct","rainfall_mm","air_temp_c",
    "tide_height_m","wave_height_m","sst_c","salinity_ppt",
    "chlorophyll_mg_m3","turbidity_ntu","month","region","season"
]
target = "threat_type"

X = df[features].copy()
y = df[target]


le_region = LabelEncoder()
X["region"] = le_region.fit_transform(X["region"])

le_season = LabelEncoder()
X["season"] = le_season.fit_transform(X["season"])

le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


smote = SMOTE(random_state=42,k_neighbors=1)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)


"""import joblib
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le_target, "label_encoder.pkl")
joblib.dump(le_region, "region_encoder.pkl")|
joblib.dump(le_season, "season_encoder.pkl")"""

print("Train shape:", X_train_scaled.shape)
print("Test shape:", X_test_scaled.shape)


Train shape: (20600, 14)
Test shape: (1100, 14)


In [66]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


X_train_scaled, X_test_scaled, y_train_balanced, y_test = X_train_scaled, X_test_scaled, y_train_balanced, y_test


xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42
)


xgb_model.fit(X_train_scaled, y_train_balanced)

y_pred = xgb_model.predict(X_test_scaled)

# 4. Model Evaluation
print("\n XGBoost Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le_target.classes_,zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 5. Save the model, scaler, and label encoders
"""joblib.dump(xgb_model, "coastal_threat_xgb_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le_target, "label_encoder.pkl")
joblib.dump(le_region, "region_encoder.pkl")
joblib.dump(le_season, "season_encoder.pkl")"""
#print("\n✅ XGBoost Model, Preprocessors Saved!")



 XGBoost Results:
Accuracy: 0.9427272727272727
              precision    recall  f1-score   support

 Algal Bloom       0.00      0.00      0.00        18
     Cyclone       0.00      0.00      0.00         1
     Erosion       0.81      0.43      0.57        30
   No Threat       0.95      0.99      0.97      1030
   Pollution       0.00      0.00      0.00        21

    accuracy                           0.94      1100
   macro avg       0.35      0.29      0.31      1100
weighted avg       0.91      0.94      0.92      1100

Confusion Matrix:
 [[   0    0    0   18    0]
 [   0    0    0    1    0]
 [   0    0   13   17    0]
 [   2    0    3 1024    1]
 [   0    0    0   21    0]]


'joblib.dump(xgb_model, "coastal_threat_xgb_model.pkl")\njoblib.dump(scaler, "scaler.pkl")\njoblib.dump(le_target, "label_encoder.pkl")\njoblib.dump(le_region, "region_encoder.pkl")\njoblib.dump(le_season, "season_encoder.pkl")'