In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# Absolute path of the medium-term flood dataset
# (presumably a Colab session path -- adjust when running elsewhere).
file_path = '/content/Flood_Medium_Term_Model_Data.csv'

# Read the CSV into the raw frame that the following cells inspect and transform.
flood_data = pd.read_csv(filepath_or_buffer=file_path)

In [13]:
# Preview the first five rows of the raw frame.
flood_data.head(n=5)


Unnamed: 0,UEI,Districts,State,Category Code,temperature_2m_mean,precipitation_sum,rain_sum,snowfall_sum,wind_speed_10m_max,et0_fao_evapotranspiration,Weather Date,river_discharge
0,UEI-IMD-FL-1990-0001,Adilabad,Telangana,SUN,30.504831,0.0,0.0,0.0,11.808878,7.952146,4/3/1990,0.156112
1,UEI-IMD-FL-1990-0001,Adilabad,Telangana,SUN,30.502748,0.0,0.0,0.0,10.787993,7.523847,4/4/1990,0.156112
2,UEI-IMD-FL-1990-0001,Adilabad,Telangana,SUN,30.75067,0.0,0.0,0.0,21.42227,6.526473,4/5/1990,0.156112
3,UEI-IMD-FL-1990-0001,Adilabad,Telangana,MRAIN,31.475662,0.2,0.2,0.0,15.905319,6.98988,4/6/1990,0.156112
4,UEI-IMD-FL-1990-0001,Adilabad,Telangana,SUN,23.917334,0.0,0.0,0.0,8.287822,4.250965,12/4/1989,0.124605


In [14]:
# Print column names, non-null counts and dtypes of the raw frame.
flood_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147256 entries, 0 to 147255
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   UEI                         147256 non-null  object 
 1   Districts                   147256 non-null  object 
 2   State                       147256 non-null  object 
 3   Category Code               147256 non-null  object 
 4   temperature_2m_mean         147256 non-null  float64
 5   precipitation_sum           147256 non-null  float64
 6   rain_sum                    147256 non-null  float64
 7   snowfall_sum                147256 non-null  float64
 8   wind_speed_10m_max          147256 non-null  float64
 9   et0_fao_evapotranspiration  147256 non-null  float64
 10  Weather Date                147256 non-null  object 
 11  river_discharge             147256 non-null  float64
dtypes: float64(7), object(5)
memory usage: 13.5+ MB


In [15]:
# Identifier and date columns to remove from the working copy.
columns_to_drop = ['UEI', 'Weather Date']

# `drop` returns a new frame, so `flood_data` itself is left untouched here.
df = flood_data.drop(columns=columns_to_drop)

In [16]:
# Confirm that the two dropped columns are gone and dtypes are unchanged.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147256 entries, 0 to 147255
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Districts                   147256 non-null  object 
 1   State                       147256 non-null  object 
 2   Category Code               147256 non-null  object 
 3   temperature_2m_mean         147256 non-null  float64
 4   precipitation_sum           147256 non-null  float64
 5   rain_sum                    147256 non-null  float64
 6   snowfall_sum                147256 non-null  float64
 7   wind_speed_10m_max          147256 non-null  float64
 8   et0_fao_evapotranspiration  147256 non-null  float64
 9   river_discharge             147256 non-null  float64
dtypes: float64(7), object(3)
memory usage: 11.2+ MB


In [17]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,Districts,State,Category Code,temperature_2m_mean,precipitation_sum,rain_sum,snowfall_sum,wind_speed_10m_max,et0_fao_evapotranspiration,river_discharge
0,Adilabad,Telangana,SUN,30.504831,0.0,0.0,0.0,11.808878,7.952146,0.156112
1,Adilabad,Telangana,SUN,30.502748,0.0,0.0,0.0,10.787993,7.523847,0.156112
2,Adilabad,Telangana,SUN,30.75067,0.0,0.0,0.0,21.42227,6.526473,0.156112
3,Adilabad,Telangana,MRAIN,31.475662,0.2,0.2,0.0,15.905319,6.98988,0.156112
4,Adilabad,Telangana,SUN,23.917334,0.0,0.0,0.0,8.287822,4.250965,0.124605


In [18]:
# Integer-encode the categorical location columns of `flood_data` and keep each
# fitted encoder so codes can be mapped back to names via `inverse_transform`.
#
# The encoders are fitted on `df`, which still holds the original string
# values, instead of on `flood_data`, which this cell overwrites. Fitting on
# `flood_data` itself made the cell non-idempotent: a second run re-fitted the
# encoders on the already-encoded integers and silently discarded the
# name <-> code mapping stored in `label_encoders`.
label_encoders = {}
for col in ['Districts', 'State']:
    le = LabelEncoder()
    # `df` was derived from `flood_data` by dropping columns only, so both
    # frames share the same row order and the codes align positionally.
    flood_data[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [19]:
# Model inputs: every column except the identifier, the date and the label.
features = flood_data.drop(columns=['UEI', 'Weather Date', 'Category Code'])

# Encode the string category codes as integers 0..n_classes-1.
# XGBClassifier refuses string class labels at fit time, so leaving the target
# as raw strings makes the XGBoost search below fail; the scikit-learn
# estimators accept integer labels equally well. Metrics and classification
# reports will show the integer codes -- use `target_encoder.classes_` or
# `target_encoder.inverse_transform(...)` to map them back to the names.
target_encoder = LabelEncoder()
target = pd.Series(
    target_encoder.fit_transform(flood_data['Category Code']),
    index=flood_data.index,
    name='Category Code',
)

In [20]:
# Columns to standardise. Besides the float measurements this also selects the
# label-encoded Districts/State columns whenever they are stored as int64.
numerical_cols = features.select_dtypes(include=['float64', 'int64']).columns

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the preprocessing. The partition depends only on the number of rows,
# `test_size` and `random_state`, so this reproduces the row membership of the
# train/test split performed in the next cell -- keep both calls in sync.
train_index = train_test_split(
    features.index.to_numpy(), test_size=0.2, random_state=42
)[0]

scaler = StandardScaler()
scaler.fit(features.loc[train_index, numerical_cols])

# Apply the train-fitted transformation to every row (train and test alike).
features[numerical_cols] = scaler.transform(features[numerical_cols])

In [21]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Random Forest search space: 2 x 3 x 3 = 18 hyper-parameter combinations.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

In [26]:
# Seeded base estimator for the Random Forest search.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold search over `rf_params`, ranked by accuracy.
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [27]:
# Run the Random Forest search on the training split
# (18 candidates x 5 folds = 90 fits, so this cell is expensive).
rf_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_rf = rf_grid.best_estimator_

Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
# Score the tuned Random Forest on the held-out split.
y_pred_rf = best_rf.predict(X_test)

rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
rf_report = classification_report(y_true=y_test, y_pred=y_pred_rf)

In [None]:
# Report the search outcome and the hold-out metrics for the Random Forest.
for _caption, _value in (
    ("Best parameters for Random Forest:", rf_grid.best_params_),
    ("Best cross-validation accuracy for Random Forest:", rf_grid.best_score_),
    ("Test Accuracy for Random Forest:", rf_accuracy),
    ("Classification Report for Random Forest:\n", rf_report),
):
    print(_caption, _value)

In [None]:
# Gradient Boosting search space: 2 x 2 x 3 = 12 hyper-parameter combinations.
gb_params = dict(
    n_estimators=[100, 150],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5, 7],
)

In [None]:
# Seeded base estimator for the Gradient Boosting search.
gb_model = GradientBoostingClassifier(random_state=42)

# Exhaustive 5-fold search over `gb_params`, ranked by accuracy.
gb_grid = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the Gradient Boosting search on the training split.
gb_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_gb = gb_grid.best_estimator_


In [None]:
# Score the tuned Gradient Boosting model on the held-out split.
y_pred_gb = best_gb.predict(X_test)

gb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_gb)
gb_report = classification_report(y_true=y_test, y_pred=y_pred_gb)


In [None]:
# Report the search outcome and the hold-out metrics for Gradient Boosting.
for _caption, _value in (
    ("Best parameters for Gradient Boosting:", gb_grid.best_params_),
    ("Best cross-validation accuracy for Gradient Boosting:", gb_grid.best_score_),
    ("Test Accuracy for Gradient Boosting:", gb_accuracy),
    ("Classification Report for Gradient Boosting:\n", gb_report),
):
    print(_caption, _value)

In [None]:
# XGBoost search space: 2 x 2 x 3 = 12 hyper-parameter combinations.
xgb_params = dict(
    n_estimators=[100, 150],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5, 7],
)


In [None]:
# Seeded XGBoost classifier evaluated with multi-class log loss.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent xgboost
# releases -- confirm against the installed version before relying on it.
xgb_model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)

# Exhaustive 5-fold search over `xgb_params`, ranked by accuracy.
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the XGBoost search on the training split.
xgb_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_xgb = xgb_grid.best_estimator_

In [None]:
# Score the tuned XGBoost model on the held-out split.
y_pred_xgb = best_xgb.predict(X_test)

xgb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
xgb_report = classification_report(y_true=y_test, y_pred=y_pred_xgb)

In [None]:
# Report the search outcome and the hold-out metrics for XGBoost.
for _caption, _value in (
    ("Best parameters for XGBoost:", xgb_grid.best_params_),
    ("Best cross-validation accuracy for XGBoost:", xgb_grid.best_score_),
    ("Test Accuracy for XGBoost:", xgb_accuracy),
    ("Classification Report for XGBoost:\n", xgb_report),
):
    print(_caption, _value)

In [28]:
from sklearn.svm import SVC

# Support Vector Machine search space, split per kernel.
# `gamma` is a kernel coefficient that the linear kernel does not use, so
# crossing it with `kernel='linear'` only fitted the same linear model twice
# per C value. Giving each kernel its own sub-grid removes those duplicates:
# 3 linear + 6 rbf = 9 candidates instead of 12, with the same distinct models.
svm_params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
    },
]

svm_model = SVC()

# Exhaustive 5-fold search over both sub-grids, ranked by accuracy.
# NOTE(review): kernel SVMs scale poorly with the number of training rows;
# consider tuning on a subsample if this cell does not finish in reasonable time.
svm_grid = GridSearchCV(svm_model, svm_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
svm_grid.fit(X_train, y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_svm = svm_grid.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 

In [None]:
from lightgbm import LGBMClassifier

# LightGBM search space: 2 x 2 x 3 = 12 hyper-parameter combinations.
lgbm_params = dict(
    n_estimators=[100, 150],
    learning_rate=[0.1, 0.05],
    max_depth=[3, 5, 7],
)

# Seeded base estimator for the LightGBM search.
lgbm_model = LGBMClassifier(random_state=42)

# Exhaustive 5-fold search over `lgbm_params`, ranked by accuracy.
lgbm_grid = GridSearchCV(
    estimator=lgbm_model,
    param_grid=lgbm_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
lgbm_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_lgbm = lgbm_grid.best_estimator_

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression search space: 3 regularisation strengths x 2 solvers,
# all with an L2 penalty (6 combinations).
lr_params = dict(
    C=[0.1, 1, 10],
    penalty=['l2'],
    solver=['liblinear', 'saga'],
)

# Seeded base estimator, capped at 500 solver iterations.
lr_model = LogisticRegression(max_iter=500, random_state=42)

# Exhaustive 5-fold search over `lr_params`, ranked by accuracy.
lr_grid = GridSearchCV(
    estimator=lr_model,
    param_grid=lr_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
lr_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_lr = lr_grid.best_estimator_

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP search space: 3 layer layouts x 2 activations x 2 L2 strengths
# (12 combinations).
mlp_params = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
)

# Seeded base estimator, capped at 500 training iterations.
mlp_model = MLPClassifier(max_iter=500, random_state=42)

# Exhaustive 5-fold search over `mlp_params`, ranked by accuracy.
mlp_grid = GridSearchCV(
    estimator=mlp_model,
    param_grid=mlp_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
mlp_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_mlp = mlp_grid.best_estimator_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN search space: 3 neighbourhood sizes x 2 weightings x 2 distance metrics
# (12 combinations).
knn_params = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

knn_model = KNeighborsClassifier()

# Exhaustive 5-fold search over `knn_params`, ranked by accuracy.
knn_grid = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
knn_grid.fit(X=X_train, y=y_train)

# Estimator refitted on the whole training split with the winning parameters.
best_knn = knn_grid.best_estimator_