# Importing Libraries

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [89]:
# Load the demo flood dataset from the parquet file next to the notebook
# and preview the first rows.
DATA_PATH = 'delhi_flood_dataset_demo.parquet'
df = pd.read_parquet(DATA_PATH)
df.head()

Unnamed: 0,Grid_ID,Hour,Elevation,Road_Density,Drain_Density,Pop_Density,Historical_Flood_Score,Rain_mm,Rain_Past3h,Drain_Water_Level,Soil_Moisture,Score,Flood_Risk
0,0,2025-07-01 00:00:00,218.0,0.0,,,,18.727006,18.727006,1.455706,0.642108,0.284651,Low
1,0,2025-07-01 01:00:00,218.0,0.0,,,,47.535715,66.262721,0.433888,0.022384,0.501866,High
2,0,2025-07-01 02:00:00,218.0,0.0,,,,36.599697,102.862418,1.30256,0.202508,0.528328,High
3,0,2025-07-01 03:00:00,218.0,0.0,,,,29.932924,114.068337,0.054226,0.085105,0.39631,Medium
4,0,2025-07-01 04:00:00,218.0,0.0,,,,7.800932,74.333553,1.603467,0.132223,0.282467,Low


In [90]:
# Summary statistics per column; columns with a count of 0 hold no data at all.
df.describe()

Unnamed: 0,Grid_ID,Hour,Elevation,Road_Density,Drain_Density,Pop_Density,Historical_Flood_Score,Rain_mm,Rain_Past3h,Drain_Water_Level,Soil_Moisture,Score
count,1180368.0,1180368,1180368.0,1180368.0,0.0,0.0,0.0,1180368.0,1180368.0,1180368.0,1180368.0,1180368.0
mean,3512.5,2025-07-04 11:29:59.999999232,221.5029,7194.046,,,,25.00913,74.57679,0.9992122,0.5001084,0.3751308
min,0.0,2025-07-01 00:00:00,192.9524,0.0,,,,2.529462e-05,0.004918323,7.549152e-07,9.958058e-07,0.004000012
25%,1756.0,2025-07-02 17:45:00,212.1875,2238.915,,,,12.50956,56.83629,0.499964,0.2501865,0.2584603
50%,3512.5,2025-07-04 11:30:00,215.0947,5753.769,,,,25.00987,74.66838,0.9986985,0.5002418,0.3751899
75%,5269.0,2025-07-06 05:15:00,220.5391,10933.51,,,,37.49787,92.41491,1.498773,0.7498687,0.4916497
max,7025.0,2025-07-07 23:00:00,310.525,32230.6,,,,49.99992,149.0755,1.999999,0.9999973,0.740449
std,2028.232,,18.27186,6249.672,,,,14.42845,25.27804,0.5772238,0.2887504,0.1441337


In [91]:
# Column dtypes and non-null counts; used to spot the completely empty columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1180368 entries, 0 to 1180367
Data columns (total 13 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   Grid_ID                 1180368 non-null  int64         
 1   Hour                    1180368 non-null  datetime64[ns]
 2   Elevation               1180368 non-null  float64       
 3   Road_Density            1180368 non-null  float64       
 4   Drain_Density           0 non-null        float64       
 5   Pop_Density             0 non-null        float64       
 6   Historical_Flood_Score  0 non-null        float64       
 7   Rain_mm                 1180368 non-null  float64       
 8   Rain_Past3h             1180368 non-null  float64       
 9   Drain_Water_Level       1180368 non-null  float64       
 10  Soil_Moisture           1180368 non-null  float64       
 11  Score                   1180368 non-null  float64       
 12  Flood_Risk    

# Data Preprocessing

In [92]:
# Columns removed before modelling:
#   - Grid_ID: row identifier.
#   - Drain_Density, Pop_Density, Historical_Flood_Score: 0 non-null values
#     according to df.info().
#   - Score: neither an ID nor empty.
#     NOTE(review): presumably dropped because Flood_Risk is derived from it
#     (keeping it would leak the target) -- confirm.
columns_to_drop = [
    'Grid_ID',
    'Drain_Density',
    'Pop_Density',
    'Historical_Flood_Score',
    'Score',
]
df = df.drop(columns=columns_to_drop)



In [93]:
# Derive calendar features from the Hour timestamp, then discard the raw
# timestamp column.
hour_stamp = df['Hour']
df = df.assign(
    hour_of_day=hour_stamp.dt.hour,
    month=hour_stamp.dt.month,
    day_of_week=hour_stamp.dt.dayofweek,
).drop(columns=['Hour'])

In [94]:
# Preview the frame after the time features were added.
df.head(n=5)

Unnamed: 0,Elevation,Road_Density,Rain_mm,Rain_Past3h,Drain_Water_Level,Soil_Moisture,Flood_Risk,hour_of_day,month,day_of_week
0,218.0,0.0,18.727006,18.727006,1.455706,0.642108,Low,0,7,1
1,218.0,0.0,47.535715,66.262721,0.433888,0.022384,High,1,7,1
2,218.0,0.0,36.599697,102.862418,1.30256,0.202508,High,2,7,1
3,218.0,0.0,29.932924,114.068337,0.054226,0.085105,Medium,3,7,1
4,218.0,0.0,7.800932,74.333553,1.603467,0.132223,Low,4,7,1


In [95]:
# Fill any remaining gaps in numeric columns with that column's mean.
# df.info() reports the numeric columns kept so far as fully populated, so
# this acts as a safeguard rather than changing values.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)



In [96]:
# Missing-value count per column (expected to be all zeros).
df.isna().sum()


Unnamed: 0,0
Elevation,0
Road_Density,0
Rain_mm,0
Rain_Past3h,0
Drain_Water_Level,0
Soil_Moisture,0
Flood_Risk,0
hour_of_day,0
month,0
day_of_week,0


In [97]:
# Encode the Flood_Risk labels as integers.
# LabelEncoder orders classes alphabetically: High -> 0, Low -> 1, Medium -> 2.
# Caution: re-running this cell refits the encoder on the already-encoded
# integers, after which `le` can no longer map back to the text labels.
from sklearn.preprocessing import StandardScaler, LabelEncoder
le = LabelEncoder().fit(df['Flood_Risk'])
df['Flood_Risk'] = le.transform(df['Flood_Risk'])

| Original Label | Encoded Value |
| -------------- | ------------- |
| **High**       | 0             |
| **Low**        | 1             |
| **Medium**     | 2             |


In [98]:
# Preview the frame with the encoded target.
df.head(n=5)

Unnamed: 0,Elevation,Road_Density,Rain_mm,Rain_Past3h,Drain_Water_Level,Soil_Moisture,Flood_Risk,hour_of_day,month,day_of_week
0,218.0,0.0,18.727006,18.727006,1.455706,0.642108,1,0,7,1
1,218.0,0.0,47.535715,66.262721,0.433888,0.022384,0,1,7,1
2,218.0,0.0,36.599697,102.862418,1.30256,0.202508,0,2,7,1
3,218.0,0.0,29.932924,114.068337,0.054226,0.085105,2,3,7,1
4,218.0,0.0,7.800932,74.333553,1.603467,0.132223,1,4,7,1


In [99]:
# Number of rows per encoded risk class.
risk_counts = df['Flood_Risk'].value_counts()
print(risk_counts)


Flood_Risk
0    401325
1    389522
2    389521
Name: count, dtype: int64


The three risk classes are roughly balanced (about 34% High, 33% Low and 33% Medium).


# Separating Features and Target

In [100]:
# Separate the target column from the predictor columns.
target_col = 'Flood_Risk'
y = df[target_col]
X = df.drop(columns=[target_col])

# Scaling Values

In [102]:
# Continuous features that get standardised (zero mean, unit variance).
continuous_features = [
    'Elevation', 'Road_Density', 'Rain_mm', 'Rain_Past3h',
    'Drain_Water_Level', 'Soil_Moisture'
]

# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed later in the notebook, so held-out rows influence the
# scaling statistics. Fit on the training rows only if the test metrics must
# be strictly leakage-free.
scaler = StandardScaler()
X_cont_scaled = pd.DataFrame(
    scaler.fit_transform(df[continuous_features]),
    columns=continuous_features,
    # Keep df's row labels: the concat below aligns on index, so a fresh
    # RangeIndex would silently misalign rows (and introduce NaNs) whenever
    # df's index is not 0..n-1, e.g. after rows were filtered upstream.
    index=df.index,
)

# Cyclical encoding of the time features: (raw column, period, output prefix).
# Mapping each value onto a circle keeps e.g. hour 23 adjacent to hour 0.
# NOTE(review): the describe() output shows Hour spanning 2025-07-01 to
# 2025-07-07 only, so `month` appears to be constant in this dataset -- confirm
# whether the month features are useful.
cyclical_specs = [
    ('hour_of_day', 24, 'hour'),
    ('month', 12, 'month'),
    ('day_of_week', 7, 'dow'),
]
time_features = []
for raw_col, period, prefix in cyclical_specs:
    angle = 2 * np.pi * df[raw_col] / period
    df[f'{prefix}_sin'] = np.sin(angle)
    df[f'{prefix}_cos'] = np.cos(angle)
    time_features += [f'{prefix}_sin', f'{prefix}_cos']

# Combine scaled continuous + time features (row-aligned on df's index).
X_final = pd.concat([X_cont_scaled, df[time_features]], axis=1)
assert len(X_final) == len(df), "feature matrix lost or gained rows during concat"

print(X_final.head())


   Elevation  Road_Density   Rain_mm  Rain_Past3h  Drain_Water_Level  \
0  -0.191709     -1.151108 -0.435399    -2.209420           0.790845   
1  -0.191709     -1.151108  1.561263    -0.328905          -0.979385   
2  -0.191709     -1.151108  0.803314     1.118981           0.525529   
3  -0.191709     -1.151108  0.341256     1.562288          -1.637123   
4  -0.191709     -1.151108 -1.192658    -0.009622           1.046830   

   Soil_Moisture  hour_sin  hour_cos  month_sin  month_cos   dow_sin  dow_cos  
0       0.491773  0.000000  1.000000       -0.5  -0.866025  0.781831  0.62349  
1      -1.654456  0.258819  0.965926       -0.5  -0.866025  0.781831  0.62349  
2      -1.030649  0.500000  0.866025       -0.5  -0.866025  0.781831  0.62349  
3      -1.437239  0.707107  0.707107       -0.5  -0.866025  0.781831  0.62349  
4      -1.274060  0.866025  0.500000       -0.5  -0.866025  0.781831  0.62349  


# Train Test Split

In [103]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X_final, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Train The Model

In [104]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Random forest hyperparameters; the seed is fixed for reproducibility and
# n_jobs=-1 uses all available cores.
rf_params = dict(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**rf_params)

# Fit on the training rows, then score the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Rows of the report are the encoded classes (0 = High, 1 = Low, 2 = Medium).
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.98      0.98     80297
           1       0.99      0.98      0.99     78026
           2       0.96      0.98      0.97     77751

    accuracy                           0.98    236074
   macro avg       0.98      0.98      0.98    236074
weighted avg       0.98      0.98      0.98    236074



# Making predictions

In [106]:
import pandas as pd
import numpy as np

# Example grid cells to score, one tuple per grid (append rows as needed).
# NOTE(review): the describe() output of the training data shows Road_Density
# ranging from 0 to ~32230 and Elevation from ~193 to ~311, so several of
# these example values lie outside the training range -- confirm units.
grid_columns = [
    'Elevation', 'Road_Density', 'Rain_mm', 'Rain_Past3h',
    'Drain_Water_Level', 'Soil_Moisture',
    'hour_of_day', 'month', 'day_of_week',
]
grid_rows = [
    (215.0, 0.7, 12.0, 5.0, 0.8, 0.3, 14, 8, 2),
    (220.0, 0.5, 25.0, 10.0, 1.2, 0.6, 9, 9, 5),
    (180.0, 0.9, 50.0, 30.0, 2.0, 0.8, 18, 7, 0),
]
multi_grids = pd.DataFrame(grid_rows, columns=grid_columns)

def predict_flood_risk_multi(new_data, model, scaler, le):
    """Predict the flood-risk class for one or more grid rows.

    Parameters
    ----------
    new_data : pandas.DataFrame
        One row per grid. Must contain the continuous columns
        ``Elevation``, ``Road_Density``, ``Rain_mm``, ``Rain_Past3h``,
        ``Drain_Water_Level``, ``Soil_Moisture`` and the raw time columns
        ``hour_of_day``, ``month``, ``day_of_week``. Column order is free and
        extra columns are carried through untouched.
    model : fitted classifier exposing ``predict``
        If it exposes ``feature_names_in_``, that order is used for the
        feature matrix.
    scaler : fitted transformer exposing ``transform``
        Applied to the continuous columns.
    le : fitted label encoder exposing ``inverse_transform``
        Maps predicted class codes back to labels.

    Returns
    -------
    pandas.DataFrame
        A copy of ``new_data`` with the continuous columns scaled, the raw
        time columns replaced by their sin/cos encodings, and two added
        columns: ``Pred_Class`` (encoded class) and ``Pred_Label``.

    Raises
    ------
    KeyError
        If ``new_data`` lacks any required column.
    """
    continuous_cols = ['Elevation', 'Road_Density', 'Rain_mm', 'Rain_Past3h',
                       'Drain_Water_Level', 'Soil_Moisture']
    # (raw column, period, output prefix) for the cyclical encoding.
    cyclical_specs = [
        ('hour_of_day', 24, 'hour'),
        ('month', 12, 'month'),
        ('day_of_week', 7, 'dow'),
    ]
    raw_time_cols = [raw_col for raw_col, _, _ in cyclical_specs]

    # Fail early with a clear message instead of deep inside pandas/sklearn.
    missing = [col for col in continuous_cols + raw_time_cols
               if col not in new_data.columns]
    if missing:
        raise KeyError(f"new_data is missing required columns: {missing}")

    # Work on a copy so the caller's frame is left untouched.
    df = new_data.copy()

    # Scale continuous features; selecting by list fixes the column order the
    # scaler sees regardless of the order in new_data.
    df[continuous_cols] = scaler.transform(df[continuous_cols])

    # Encode cyclical time features as sin/cos pairs.
    cyclical_cols = []
    for raw_col, period, prefix in cyclical_specs:
        angle = 2 * np.pi * df[raw_col] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)
        cyclical_cols += [f'{prefix}_sin', f'{prefix}_cos']

    # Drop original time columns
    df = df.drop(columns=raw_time_cols)

    # Hand the model exactly the features it was trained on, in training
    # order. Relying on the caller's column order (or on the absence of extra
    # columns) makes prediction fail or mis-assign features.
    trained_order = getattr(model, 'feature_names_in_', None)
    if trained_order is not None:
        feature_cols = list(trained_order)
    else:
        feature_cols = continuous_cols + cyclical_cols

    pred_class = model.predict(df[feature_cols])
    pred_label = le.inverse_transform(pred_class)

    df['Pred_Class'] = pred_class
    df['Pred_Label'] = pred_label
    return df

# -------------------------------
# Score the example grids and show only the prediction columns.
prediction_columns = ['Pred_Class', 'Pred_Label']
result_multi = predict_flood_risk_multi(multi_grids, model, scaler, le)
print(result_multi.loc[:, prediction_columns])


   Pred_Class Pred_Label
0           1        Low
1           2     Medium
2           0       High


# Export

In [107]:
import joblib

# Persist everything needed to reproduce predictions later: the fitted
# RandomForest model, the StandardScaler and the LabelEncoder.
artifacts = {
    'flood_model.pkl': model,
    'scaler.pkl': scaler,
    'label_encoder.pkl': le,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("All objects exported successfully!")


All objects exported successfully!
