# Regression with a Flood Prediction Dataset
> Playground Series - Season 4, Episode 5

# Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor,BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor,VotingRegressor,StackingRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Conv1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Inspecting the DATA

In [None]:
# Read the competition's train, test and sample-submission CSV files.
df_train, df_test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e5/train.csv",
        "/kaggle/input/playground-series-s4e5/test.csv",
        "/kaggle/input/playground-series-s4e5/sample_submission.csv",
    )
)

In [None]:
# The row identifier carries no signal for the model, so remove it from the training frame.
df_train = df_train.drop(columns='id')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Summary statistics for every column (include='all' keeps non-numeric columns too, if any).
df_train.describe(include='all')

In [None]:
# Inspect the dtype of each column before casting.
df_train.dtypes

In [None]:
# Cast every training column to floating point.
df_train = df_train.astype(dtype=float)

In [None]:
# Confirm the dtypes and non-null counts after the cast.
df_train.info()

# Feature Engineering

In [None]:
def simplified_getFeats(df):
    """Standardise the 20 flood-factor columns and append derived features.

    The input frame is copied first, so the caller's object is left untouched
    (the original implementation modified it in place *and* returned it).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``num_cols``. Other columns
        are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``num_cols`` are standardised and the columns
        ``mean``, ``std``, ``max``, ``min``, ``Climate_Risk`` and
        ``Infrastructure_Risk`` have been added.

    Notes
    -----
    The scaler is fitted on whatever rows are passed in. The row statistics and
    interaction terms are computed from the *standardised* values, not the raw ones.
    """
    num_cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

    # Work on a copy so the function has no side effects on the argument.
    df = df.copy()

    # Scale early: everything below is derived from the standardised columns.
    df[num_cols] = StandardScaler().fit_transform(df[num_cols])

    # Basic per-row statistics across the 20 scaled factors.
    scaled = df[num_cols]
    df['mean'] = scaled.mean(axis=1)
    df['std'] = scaled.std(axis=1)
    df['max'] = scaled.max(axis=1)
    df['min'] = scaled.min(axis=1)

    # Interaction features (simplified): products of two scaled factors.
    df['Climate_Risk'] = df['MonsoonIntensity'] * df['ClimateChange']
    df['Infrastructure_Risk'] = df['DamsQuality'] * df['DrainageSystems']

    return df

# Tag each frame with its origin (0 = train, 1 = test), stack them and
# engineer features on the combined rows in one pass.
df_all = pd.concat([df_train.assign(typ=0), df_test.assign(typ=1)], axis=0)
df_all = simplified_getFeats(df_all)

# Undo the stacking using the origin tag, then discard the tag.
from_train = df_all['typ'] == 0
from_test = df_all['typ'] == 1
df_train = df_all[from_train].drop(columns='typ')
df_test = df_all[from_test].drop(columns='typ')

# Model inputs and target. 'id' only exists here because the test frame
# contributed it during the concat; it is not a feature.
y = df_train['FloodProbability']
X = df_train.drop(columns=['id', 'FloodProbability'])


In [None]:
# Scatter every model input against the target.
# Iterate over X.columns rather than df_train.columns[:-1]: after feature
# engineering the target is no longer the last column, so the old slice skipped
# 'Infrastructure_Risk' and plotted 'FloodProbability' and 'id' instead.
for feature in X.columns:
    fig, ax = plt.subplots()
    ax.scatter(df_train[feature], df_train['FloodProbability'])
    ax.set_xlabel(feature)
    ax.set_ylabel('FloodProbability')
    plt.show()
    # Release the figure so a long loop does not accumulate open canvases.
    plt.close(fig)

In [None]:
# Pair plot on a 5% random sample of the training rows.
# The seed makes the sample (and therefore the figure) reproducible across runs;
# 42 matches the seed used for the splits elsewhere in this notebook.
df_sample = df_train.sample(frac=0.05, random_state=42)
sns.pairplot(df_sample)
plt.show()


In [None]:
# Pairwise correlations of all columns, drawn as a diverging heatmap without cell labels.
corr_matrix = df_train.corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
plt.show()

In [None]:
# One histogram per column on a single large canvas.
hist_figsize = (24, 18)
df_train.hist(figsize=hist_figsize)
plt.show()

In [None]:
# Side-by-side box plots of every column.
fig, ax = plt.subplots()
sns.boxplot(data=df_train, ax=ax)
plt.show()

# Analysing Various Regression type Models

### Kfolds = 5

In [None]:
# Candidate regressors compared with 5-fold cross-validation below.
# Keys are the display names printed next to each score.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'SGD Regressor': SGDRegressor(),
    'Bayesian Ridge Regression': BayesianRidge(),
    # `estimator` is the current name of this argument; the older
    # `base_estimator` spelling is deprecated and removed in recent scikit-learn.
    'Bagging Regressor': BaggingRegressor(estimator=lgb.LGBMRegressor(verbose=-1)),
    'XGBoost Regressor': xgb.XGBRegressor(),
    'LightGBM Regressor': lgb.LGBMRegressor(verbose=-1),
}

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# 5-fold cross-validation over the full feature matrix, reporting mean R2 per model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    r2_values = []
    for train_index, val_index in kf.split(X):
        # Fold-local names on purpose: reusing X_train / y_train here would
        # overwrite the hold-out split, and later cells would then train on
        # rows that overlap X_test.
        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        model.fit(X_fold_train, y_fold_train)
        fold_pred = model.predict(X_fold_val)
        r2_values.append(r2_score(y_fold_val, fold_pred))
    avg_r2 = np.mean(r2_values)
    print(f'{name}: R2 = {avg_r2:.4f}')

In [None]:
# Create a pipeline with a standard scaler and a voting regressor
# (the ensemble prediction combines the seven base regressors below).
voting_reg = make_pipeline(StandardScaler(),
                           VotingRegressor([
                               ('XGBoost Regressor', xgb.XGBRegressor()),
                               ('LightGBM Regressor', lgb.LGBMRegressor(verbose=-1)),
                               # `estimator` replaces the deprecated `base_estimator` argument.
                               ('Bagging Regressor', BaggingRegressor(estimator=lgb.LGBMRegressor(verbose=-1))),
                               ('Bayesian Ridge Regression', BayesianRidge()),
                               ('SGD Regressor', SGDRegressor()),
                               ('Ridge Regression', Ridge()),
                               ('Linear Regression', LinearRegression())
                           ]))

# 5-fold cross-validated R2 on the training split.
# Average over *all* folds: `scores[1]` would report the second fold only.
scores = cross_val_score(voting_reg, X_train, y_train, cv=5, scoring='r2')
r2 = scores.mean()
print(f'R2 (CV) = {r2:.4f}')

# Train the final model on the training split and score it on the hold-out split
voting_reg.fit(X_train, y_train)
y_pred = voting_reg.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R2 = {r2:.4f}')

# Create a stacking regressor with a ridge regression as the final estimator
stacking_reg = make_pipeline(StandardScaler(),
                             StackingRegressor(estimators=[
                                 ('XGBoost Regressor', xgb.XGBRegressor()),
                                 # verbose=-1 silences LightGBM logging, as in the other cells.
                                 ('LightGBM Regressor', lgb.LGBMRegressor(verbose=-1)),
                                 # `estimator` replaces the deprecated `base_estimator` argument.
                                 ('Bagging Regressor', BaggingRegressor(estimator=LinearRegression())),
                                 ('Bayesian Ridge Regression', BayesianRidge()),
                                 ('SGD Regressor', SGDRegressor()),
                                 ('Ridge Regression', Ridge()),
                                 ('Linear Regression', LinearRegression())
                             ], final_estimator=Ridge()))

# 5-fold cross-validated R2 on the training split.
# Average over *all* folds: `scores[1]` would report the second fold only.
scores = cross_val_score(stacking_reg, X_train, y_train, cv=5, scoring='r2')
r2 = scores.mean()
print(f'R2 (CV) = {r2:.4f}')

# Train the final model on the training split and score it on the hold-out split
stacking_reg.fit(X_train, y_train)
y_pred = stacking_reg.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'R2 = {r2:.4f}')

> **Voting Regressor**                                                
R2(CV) : 0.8578                                             
R2 : 0.8606

> **Stacking Regressor**                                                
R2(CV) : 0.8649                                     
R2 : 0.8692

# Analysing Different Neural Network Architectures

## Dense + Dropout NN

> Train R2: 0.80, Test R2: 0.80

In [None]:
def build_model_1():
    """Build and compile the Dense + Dropout regression network.

    Layer widths are 128 -> 96 -> 64 -> 32 -> 1, with 20% dropout after each of
    the first three hidden layers. The input width is read from the global
    ``X_train``. Compiled with the 'adam' optimizer and MSE loss.
    """
    n_features = X_train.shape[1]
    layer_stack = [
        Dense(128, activation='relu', input_shape=(n_features,)),
        Dropout(0.2),
        Dense(96, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1),
    ]
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)
    model.compile(optimizer='adam', loss='mse')
    return model

model1 = build_model_1()

# 1. Learning Rate Scheduler: ReduceLROnPlateau
# The network is compiled with optimizer='adam', whose default learning rate in
# Keras is 0.001. The floor therefore has to be below 0.001; with min_lr=0.001
# the callback could never lower the rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5)

# 2. Model Checkpoint: Save the best model based on validation loss
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min')

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Model training with History
# NOTE(review): the hold-out split doubles as the validation set here, so the
# "Test R2" below is not an unbiased estimate.
history = model1.fit(X_train, y_train,
                    epochs=100,
                    batch_size=64,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping, reduce_lr, checkpoint])

# Calculate R2 scores
train_r2 = r2_score(y_train, model1.predict(X_train))
test_r2 = r2_score(y_test, model1.predict(X_test))
print(f'Train R2: {train_r2:.2f}, Test R2: {test_r2:.2f}')

# Plotting the training and validation loss
epochs_run = np.arange(len(history.history['loss']))
plt.plot(epochs_run, history.history['loss'], label='train')
plt.plot(np.arange(len(history.history['val_loss'])), history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss During Training')
plt.show()

# Plotting the R2 score of the trained network.
# This used to call `model.predict`, where `model` was the leftover loop
# variable from the model-comparison cell rather than `model1`. The trained
# network does not change between epochs here, so the curve is a flat line at
# the final score; reuse the values computed above instead of re-predicting
# once per epoch.
plt.plot(epochs_run, [train_r2] * len(history.history['loss']), label='train')
plt.plot(np.arange(len(history.history['val_loss'])), [test_r2] * len(history.history['val_loss']), label='validation')
plt.xlabel('Epoch')
plt.ylabel('R2 Score')
plt.legend()
plt.title('R2 Score During Training')
plt.show()

In [None]:
import torch
import tensorflow as tf
import gc

# Release cached memory and reset Keras state before building the next model.
torch.cuda.empty_cache()
tf.keras.backend.clear_session()
gc.collect()

# Querying device 0 raises when no CUDA device is present, so guard the report.
# NOTE(review): these figures come from PyTorch's allocator; memory held by
# TensorFlow is presumably not included — confirm if the numbers matter.
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")
else:
    print("No CUDA device available; skipping GPU memory report.")

## Dense + Dropout + BatchNormalization NN

> Train R2: 0.86, Test R2: 0.86

In [None]:
def build_model_2():
    """Build and compile the Dense + BatchNormalization + Dropout network.

    Each of the first three hidden layers (128, 96, 64 units) is followed by
    batch normalisation and 20% dropout, then a 32-unit layer and a single
    linear output. The input width is read from the global ``X_train``.
    Compiled with the 'adam' optimizer and MSE loss.
    """
    model = Sequential()
    n_features = X_train.shape[1]
    for position, units in enumerate((128, 96, 64)):
        if position == 0:
            # Only the first layer declares the input shape.
            model.add(Dense(units, activation='relu', input_shape=(n_features,)))
        else:
            model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model

model2 = build_model_2()

# Advanced Concepts:
# 1. Learning Rate Scheduler: ReduceLROnPlateau
# The network is compiled with optimizer='adam', whose default learning rate in
# Keras is 0.001. The floor therefore has to be below 0.001; with min_lr=0.001
# the callback could never lower the rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5)

# 2. Model Checkpoint: Save the best model based on validation loss
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min')

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Model training with History
# NOTE(review): the hold-out split doubles as the validation set here, so the
# "Test R2" below is not an unbiased estimate.
history = model2.fit(X_train, y_train,
                    epochs=100,
                    batch_size=64,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping, reduce_lr, checkpoint])

# Calculate R2 scores
train_r2 = r2_score(y_train, model2.predict(X_train))
test_r2 = r2_score(y_test, model2.predict(X_test))
print(f'Train R2: {train_r2:.2f}, Test R2: {test_r2:.2f}')

# Plotting the training and validation loss
epochs_run = np.arange(len(history.history['loss']))
plt.plot(epochs_run, history.history['loss'], label='train')
plt.plot(np.arange(len(history.history['val_loss'])), history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss During Training')
plt.show()

# Plotting the R2 score of the trained network.
# The trained network does not change between epochs here, so the curve is a
# flat line at the final score; reuse the values computed above instead of
# running a full prediction pass once per epoch.
plt.plot(epochs_run, [train_r2] * len(history.history['loss']), label='train')
plt.plot(np.arange(len(history.history['val_loss'])), [test_r2] * len(history.history['val_loss']), label='validation')
plt.xlabel('Epoch')
plt.ylabel('R2 Score')
plt.legend()
plt.title('R2 Score During Training')
plt.show()

In [None]:
import torch
import tensorflow as tf
import gc

# Release cached memory and reset Keras state before building the next model.
torch.cuda.empty_cache()
tf.keras.backend.clear_session()
gc.collect()

# Querying device 0 raises when no CUDA device is present, so guard the report.
# NOTE(review): these figures come from PyTorch's allocator; memory held by
# TensorFlow is presumably not included — confirm if the numbers matter.
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")
else:
    print("No CUDA device available; skipping GPU memory report.")

## Conv1D + Flatten + Dense NN
> Train R2: 0.83, Test R2: 0.84

In [None]:
def build_model_3():
    """Build and compile the Conv1D + Flatten + Dense regression network.

    A 64-filter, width-3 convolution runs over the feature axis (declared input
    shape ``(n_features, 1)``, with ``n_features`` read from the global
    ``X_train``), followed by Flatten, two Dense layers (128 and 64 units) each
    with 20% dropout, and a single linear output. Compiled with the 'adam'
    optimizer and MSE loss.
    """
    conv_input_shape = (X_train.shape[1], 1)
    layer_stack = [
        Conv1D(64, kernel_size=3, activation='relu', input_shape=conv_input_shape),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)
    model.compile(optimizer='adam', loss='mse')
    return model

model3 = build_model_3()

# Advanced Concepts:
# 1. Learning Rate Scheduler: ReduceLROnPlateau
# The network is compiled with optimizer='adam', whose default learning rate in
# Keras is 0.001. The floor therefore has to be below 0.001; with min_lr=0.001
# the callback could never lower the rate.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-5)

# 2. Model Checkpoint: Save the best model based on validation loss
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min')

# Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Model training with History
# NOTE(review): the hold-out split doubles as the validation set here, so the
# "Test R2" below is not an unbiased estimate.
history = model3.fit(X_train, y_train,
                    epochs=100,
                    batch_size=64,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping, reduce_lr, checkpoint])

# Calculate R2 scores
train_r2 = r2_score(y_train, model3.predict(X_train))
test_r2 = r2_score(y_test, model3.predict(X_test))
print(f'Train R2: {train_r2:.2f}, Test R2: {test_r2:.2f}')

# Plotting the training and validation loss
epochs_run = np.arange(len(history.history['loss']))
plt.plot(epochs_run, history.history['loss'], label='train')
plt.plot(np.arange(len(history.history['val_loss'])), history.history['val_loss'], label='validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Loss During Training')
plt.show()

# Plotting the R2 score of the trained network.
# The trained network does not change between epochs here, so the curve is a
# flat line at the final score; reuse the values computed above instead of
# running a full prediction pass once per epoch.
plt.plot(epochs_run, [train_r2] * len(history.history['loss']), label='train')
plt.plot(np.arange(len(history.history['val_loss'])), [test_r2] * len(history.history['val_loss']), label='validation')
plt.xlabel('Epoch')
plt.ylabel('R2 Score')
plt.legend()
plt.title('R2 Score During Training')
plt.show()

In [None]:
import torch
import tensorflow as tf
import gc

# Release cached memory and reset Keras state now that the networks are done.
torch.cuda.empty_cache()
tf.keras.backend.clear_session()
gc.collect()

# Querying device 0 raises when no CUDA device is present, so guard the report.
# NOTE(review): these figures come from PyTorch's allocator; memory held by
# TensorFlow is presumably not included — confirm if the numbers matter.
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")
else:
    print("No CUDA device available; skipping GPU memory report.")

# Generating the submission using Stacking Regressor

In [None]:
# Keep exactly the columns the models were trained on, in the same order.
# Selecting by X.columns (instead of dropping 'id' and 'FloodProbability' one by
# one) makes this cell safe to re-run and guarantees the feature layout matches X.
df_test = df_test[X.columns]
df_test.head()

In [None]:
# Score the test rows with the fitted stacking pipeline and write the submission file.
stacking_predictions = stacking_reg.predict(df_test)
sample_sub["FloodProbability"] = stacking_predictions
sample_sub.to_csv("submission.csv", index=False)

In [None]:
# Default-parameter XGBoost fitted on the training split.
# Writing to the same path replaces the submission file produced earlier.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(df_test)
sample_sub["FloodProbability"] = xgb_predictions
sample_sub.to_csv("submission.csv", index=False)

> **Stacking Regressor**  Public Score : 0.86663                                   
> **XGBRegressor**  Public Score : 0.86631

# Let's fine-tune the hyperparameters of XGBRegressor and LGBMRegressor, then blend their predictions

In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective for XGBoost: validation RMSE of one trial (lower is better).

    Samples one hyperparameter set, trains a booster on 80% of the global
    ``X_train`` / ``y_train`` and returns the best RMSE reached on the
    remaining 20%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ``model.best_score``, i.e. the best validation RMSE seen during training.
    """
    # `xgb.train` takes the number of trees through `num_boost_round`, not
    # through an 'n_estimators' entry in `params`, so sample it separately.
    # It is still sampled first and under the same name, so `study.best_params`
    # keeps the same keys.
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        # suggest_float(..., log=True) is the current spelling of the
        # deprecated suggest_loguniform / suggest_uniform helpers.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        # Pin the metric so the meaning of `best_score` is explicit.
        'eval_metric': 'rmse',
    }

    # Split data into training and validation sets.
    # NOTE(review): the seed changes with the trial number, so trials are scored
    # on different splits; a fixed seed would make them directly comparable.
    X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(
        X_train, y_train, test_size=0.2, random_state=trial.number
    )

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dvalid = xgb.DMatrix(X_val_fold, label=y_val_fold)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=n_estimators,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=20,
        verbose_eval=False
    )

    # Best validation RMSE recorded by early stopping.
    return model.best_score

# The objective is an error (RMSE), so it must be minimised; maximising it
# selects the worst hyperparameters.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

xgb_best_params = study.best_params
print("Best parameters:", xgb_best_params)

In [None]:
import optuna.visualization as ov

# A notebook cell only renders its last expression, so each figure is shown
# explicitly; otherwise only the slice plot would appear.

# Plot optimization history
ov.plot_optimization_history(study).show()

# Plot parameter importance
ov.plot_param_importances(study).show()

# Plot slice plot (interactive)
ov.plot_slice(study).show()

In [None]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Recreate the 70/30 hold-out split from X and y (same seed as before), so this
# cell does not depend on whatever X_train / y_train currently hold.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

def objective(trial):
    """Optuna objective for LightGBM: validation RMSE of one trial (lower is better).

    Samples one hyperparameter set, trains a booster on 80% of the global
    ``X_train`` / ``y_train`` and returns the RMSE on the remaining 20%.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation part of the split.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', -1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1,
    }

    # NOTE(review): the seed changes with the trial number, so trials are scored
    # on different splits; a fixed seed would make them directly comparable.
    X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(
        X_train, y_train, test_size=0.2, random_state=trial.number
    )

    # Create LightGBM datasets
    dtrain = lgb.Dataset(X_train_fold, label=y_train_fold)
    dvalid = lgb.Dataset(X_val_fold, label=y_val_fold)

    # Train for the sampled number of rounds. No early-stopping callback is
    # registered, so the validation set is only monitored, never used to stop.
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dvalid],
    )

    # Predict and calculate RMSE on validation set.
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # argument is deprecated and removed in recent scikit-learn releases.
    y_pred = model.predict(X_val_fold)
    rmse = float(np.sqrt(mean_squared_error(y_val_fold, y_pred)))

    # The study is created with direction='minimize', so return the RMSE directly
    return rmse

# Run a 100-trial search that minimises the validation RMSE returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=100)

# Keep the winning hyperparameters for the final LightGBM fit.
lgb_best_params = study.best_params
print("Best parameters:", lgb_best_params)


In [None]:
import optuna.visualization as ov

# A notebook cell only renders its last expression, so each figure is shown
# explicitly; otherwise only the slice plot would appear.

# Plot optimization history
ov.plot_optimization_history(study).show()

# Plot parameter importance
ov.plot_param_importances(study).show()

# Plot slice plot (interactive)
ov.plot_slice(study).show()

In [None]:
# Hard-coded XGBoost hyperparameters.
# NOTE(review): presumably copied from an earlier Optuna run rather than read
# from `xgb_best_params` — confirm they are still the intended values.
optuna_xgb_params = {
    'n_estimators': 160,
    'max_depth': 3,
    'learning_rate': 0.010764655250991451,
    'subsample': 0.9665734102634232,
    'colsample_bytree': 0.6544043152953372,
    'gamma': 0.0015546399226326925,
    'reg_alpha': 1.2907247281476344e-06,
    'reg_lambda': 6.371675646557661e-08,
}

# Rebuild the 70/30 split, fit on the training part and score the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
optuna_xgb = xgb.XGBRegressor(**optuna_xgb_params)
optuna_xgb.fit(X_train, y_train)
xgb_test_predictions = optuna_xgb.predict(df_test)
sample_sub["FloodProbability_xgb"] = xgb_test_predictions

In [None]:
# Rebuild the 70/30 split and fit LightGBM with the hyperparameters found by Optuna.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
optuna_lgb = lgb.LGBMRegressor(**lgb_best_params, verbose=-1)
optuna_lgb.fit(X_train, y_train)
lgb_test_predictions = optuna_lgb.predict(df_test)
sample_sub["FloodProbability_lgb"] = lgb_test_predictions

In [None]:
# Blend: simple average of the tuned LightGBM and XGBoost predictions.
blended = (sample_sub["FloodProbability_lgb"] + sample_sub["FloodProbability_xgb"]) / 2

# Store the blend, drop the per-model helper columns and index by 'id'.
# `DataFrame.set_index` has no `axis` parameter (passing one raises TypeError),
# and reassigning instead of using inplace=True keeps the steps chainable.
sample_sub = (
    sample_sub
    .assign(FloodProbability=blended)
    .drop(columns=["FloodProbability_lgb", "FloodProbability_xgb"])
    .set_index("id")
)
sample_sub

In [None]:
# Write the blended submission; the frame's index is written as the first column.
sample_sub.to_csv("submission.csv")

> **Optuna XGBRegressor and LGBMRegressor Blended** Public Score : 0.86679