# In [None]:
# -------------------------------
# Step 1: Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.cluster import KMeans
import joblib

# -------------------------------
# Step 2: Load Dataset
# -------------------------------
df = pd.read_csv("flood_risk_dataset_india.csv")
# Header names may carry stray whitespace; strip it so the column lookups
# below match.
df.columns = df.columns.str.strip()

# -------------------------------
# Step 3: Data Preprocessing
# -------------------------------
# Rows without a recorded outcome cannot be used for supervised training.
# They must be removed *before* the numeric imputation below, because the
# label is itself a numeric column and mean-filling it would create
# fractional class values.
target_col = 'Flood Occurred'
df = df.dropna(subset=[target_col]).reset_index(drop=True)

# Fill missing numeric values with the column mean.
# (The label column no longer has gaps, so it is left untouched here.)
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing categorical values with the most frequent value (mode).
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Encode categorical features as integer codes. The fitted encoders are kept
# in le_dict so the same mapping can be re-applied or inverted later.
categorical_features = ['Land Cover', 'Soil Type']
le_dict = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

# -------------------------------
# Step 4: Select Features and Target
# -------------------------------
features = ['Latitude', 'Longitude', 'Rainfall (mm)', 'Temperature (°C)',
            'Humidity (%)', 'River Discharge (m³/s)', 'Water Level (m)',
            'Elevation (m)', 'Land Cover', 'Soil Type', 'Population Density',
            'Infrastructure', 'Historical Floods']

X = df[features]
y = df['Flood Occurred']

# Train-test split on the *unscaled* data first, so that the held-out rows
# play no part in estimating the scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features: fit on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled view of the full dataset (same training-derived statistics), kept
# for the unsupervised clustering step further down.
X_scaled = scaler.transform(X)

# -------------------------------
# Step 5: Random Forest Hyperparameter Tuning
# -------------------------------
# Base estimator for the search; class_weight='balanced' is passed so the
# forest compensates for any class imbalance in the labels.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Search space: 3 * 4 * 3 * 3 * 3 = 324 candidate settings, each evaluated
# with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available core
)
grid_search.fit(X_train, y_train)

# The best estimator is already refit on the whole training set.
best_rf = grid_search.best_estimator_
print("Best Random Forest Parameters:", grid_search.best_params_)

# Logistic Regression baseline, trained on the same data for comparison.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# -------------------------------
# Step 6: Evaluate Models
# -------------------------------
def evaluate_model(y_true, y_pred, model_name):
    """Print metrics for one model, show its confusion matrix, return accuracy.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Display name used in the printed header and plot title.

    Returns:
        The accuracy score of ``y_pred`` against ``y_true``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    # Text summary.
    print(f"\n--- {model_name} ---")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)

    # Confusion-matrix heatmap with the raw counts annotated in each cell.
    _, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"{model_name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

    return accuracy

# Score both fitted models on the held-out test split.
rf_preds = best_rf.predict(X_test)
lr_preds = lr_model.predict(X_test)

rf_acc = evaluate_model(y_test, rf_preds, "Random Forest")
lr_acc = evaluate_model(y_test, lr_preds, "Logistic Regression")

# -------------------------------
# Step 7: Select Best Model
# -------------------------------
# Ties go to the Random Forest. The decision is computed once so the model
# that is kept and the name that is printed can never disagree.
# NOTE(review): the choice is made on test-set accuracy, so the reported
# accuracy of the selected model is slightly optimistic.
use_rf = rf_acc >= lr_acc
final_model = best_rf if use_rf else lr_model
print("Selected Model:", "Random Forest" if use_rf else "Logistic Regression")

# Save the model
joblib.dump(final_model, "flood_model.joblib")
print("Model saved as 'flood_model.joblib'")

# The model was trained on label-encoded, standardized inputs, so it is only
# usable together with the fitted encoders and scaler. Persist them (and the
# expected feature order) next to the model.
joblib.dump(
    {'scaler': scaler, 'label_encoders': le_dict, 'features': features},
    "flood_preprocessing.joblib",
)
print("Preprocessing saved as 'flood_preprocessing.joblib'")

# -------------------------------
# Step 8: Clustering for Risk Areas
# -------------------------------
k = 3
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering reproducible across versions.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['risk_cluster'] = kmeans.fit_predict(X_scaled)

# KMeans cluster ids are arbitrary, so id 0 is not inherently "low risk".
# Rank the clusters by their observed flood rate and name them in that order.
# Assumes 'Flood Occurred' is numeric (e.g. 0/1) so that its mean is a rate.
risk_names = ['Low Risk', 'Moderate Risk', 'High Risk']
cluster_flood_rate = df.groupby('risk_cluster')['Flood Occurred'].mean().sort_values()
cluster_labels = {
    int(cluster_id): risk_name
    for cluster_id, risk_name in zip(cluster_flood_rate.index, risk_names)
}
df['risk_label'] = df['risk_cluster'].map(cluster_labels)

# -------------------------------
# Step 9: Visualize Clusters
# -------------------------------
# Fixed ordering so the legend and the sequential palette run from low to
# high risk instead of following the order labels happen to appear in the data.
risk_order = ['Low Risk', 'Moderate Risk', 'High Risk']

plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='Rainfall (mm)', y='Water Level (m)', hue='risk_label',
                hue_order=risk_order, palette='viridis', s=100)
plt.title("Flood Risk Clusters")
# Axis labels name the plotted columns, including their units.
plt.xlabel("Rainfall (mm)")
plt.ylabel("Water Level (m)")
plt.show()

# Pairwise relationships between the first six features, coloured by risk.
sns.pairplot(df, vars=features[:6], hue='risk_label', hue_order=risk_order, palette='viridis')
plt.show()
