### First approach: exploratory analysis and a CatBoost regressor


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import bootstrap_plot
import plotly.express as px

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore")

import catboost
import xgboost
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score,make_scorer
from xgboost import XGBRegressor

In [None]:
# Read the three competition tables in one pass: training rows, test rows and
# the sample submission template (in that order).
train_data, test_data, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
        '/kaggle/input/playground-series-s4e5/sample_submission.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of each raw table, one per line.
print(train_data.shape, test_data.shape, submission_data.shape, sep='\n')

# Exploration copy of the training table without the row identifier.
data = train_data.drop(columns='id')

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

In [None]:
# Kernel density estimate of the target column.
kde_ax = sns.kdeplot(data['FloodProbability'], gridsize=100, fill=True)
kde_ax.set_title('FloodProbability')
kde_ax.grid()
plt.show()

In [None]:
# Column labels of the exploration frame (for a DataFrame, keys() is the column index).
features = data.columns

In [None]:
# One violin plot per entry of `features`, laid out on a 7 x 3 grid.
fig, axes = plt.subplots(7, 3, figsize=(15, 25))
axes = axes.flatten()

for i, feature in enumerate(features):
    panel = axes[i]
    sns.violinplot(y=data[feature], ax=panel)
    # The column name is the panel title; the x-axis label is left blank.
    panel.set(title=feature, xlabel='')

plt.tight_layout()
plt.show()

In [None]:
# One box plot per entry of `features`, laid out on a 7 x 3 grid.
fig, axes = plt.subplots(nrows=7, ncols=3, figsize=(15, 25))
axes = axes.flatten()

for i, feature in enumerate(features):
    sns.boxplot(y=data[feature], ax=axes[i])
    # set_title / set_xlabel return Text objects; their results are discarded
    # rather than bound to a variable that reads like an Axes.
    axes[i].set_title(feature)
    axes[i].set_xlabel('')
plt.tight_layout()
plt.show()

In [None]:
# Bar plot of FloodProbability against MonsoonIntensity.
sns.barplot(data=data, x='MonsoonIntensity', y='FloodProbability')

In [None]:
# Pairwise correlations between all columns of `data`, annotated per cell.
corr_matrix = data.corr()

plt.figure(figsize=(25, 25))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Keep a reproducible random sample of `data` in `Data`: test_size=0.8 sends
# 80% of the rows to the second output, which is discarded into `_`.
# NOTE: `Data` (capitalised) is the sample, `data` is the full frame.
Data ,_ = train_test_split(data,test_size=0.8,random_state=42)

In [None]:
# Interactive 3D scatter of three features from the sample, coloured by the target.
scatter_axes = dict(x='RiverManagement', y='WetlandLoss', z='AgriculturalPractices')
fig = px.scatter_3d(Data, color='FloodProbability', **scatter_axes)
fig.show()

In [None]:
# Three histograms per row; round the row count up so every column gets a panel.
num_cols = len(data.columns)
num_rows = int(np.ceil(num_cols / 3))

fig, axs = plt.subplots(num_rows, 3, figsize=(20, num_rows * 5))
for i, col in enumerate(data.columns):
    grid_row, grid_col = divmod(i, 3)
    ax = axs[grid_row, grid_col]
    ax.hist(data[col], bins=10)
    ax.set(title=f'Histogram of {col}', xlabel='Value', ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Histogram of the target with a KDE overlay, on a lightly dotted grid.
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(data['FloodProbability'], kde=True)
hist_ax.set_title('Distribution of Flood Probability')
hist_ax.set_xlabel('Flood Probability')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(True, linestyle=':', alpha=0.7)
plt.show()

In [None]:
# Side-by-side box plots of every column listed in `features`.
plt.figure(figsize=(25, 10))
sns.boxplot(data=data[features])
plt.xticks(rotation=90)
plt.title('Features Boxplot', fontsize=20)
# The y-axis of a box plot shows the column values themselves, not counts,
# so it is labelled 'Value' (the histogram grid uses the same wording).
plt.ylabel('Value', fontsize=20)
plt.show()

In [None]:
# Features shown in the pair plot (also reused by later cells).
subset_features = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
                   'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality']

# pairplot is figure-level and returns a PairGrid. plt.title() would only
# title the last subplot that was drawn, so the title is set on the figure;
# y > 1 places it above the top row of panels.
pair_grid = sns.pairplot(Data[subset_features])
pair_grid.figure.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

In [None]:
# One line plot per selected feature, stacked vertically with extra spacing.
Data[subset_features].plot.line(subplots=True)
plt.subplots_adjust(hspace=0.5)
plt.show()

In [None]:
def plot_two_plots(data):
    """Draw a violin plot and a histogram side by side for every column.

    The figure has one row per column of ``data``: the left panel is a
    seaborn violin plot of that column and the right panel is a histogram
    with a KDE overlay. The number of columns is printed before plotting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is drawn when
        ``data`` has no columns.
    """
    num_features = len(data.columns)
    num_cols = 2

    print("Number of features:", num_features)

    if num_features == 0:
        # A grid with zero rows cannot be created; there is nothing to draw.
        return

    # Each feature needs two panels, so the grid has one full row per feature.
    # squeeze=False keeps `axs` two-dimensional even for a single feature.
    fig, axs = plt.subplots(num_features, num_cols,
                            figsize=(15, num_features * 5), squeeze=False)

    for (violin_ax, hist_ax), feature in zip(axs, data.columns):
        sns.violinplot(y=data[feature], ax=violin_ax)
        violin_ax.set_title(f'Violinplot of {feature}')
        sns.histplot(data=data, x=feature, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {feature}')

    plt.tight_layout()
    plt.show()

In [None]:
# Column-wise means of `data`, drawn as one bar per column.
mean_values = data.mean()

plt.figure(figsize=(12, 6))
mean_ax = sns.barplot(x=mean_values.index, y=mean_values.to_numpy())
mean_ax.set_title('Mean Value of Features')
mean_ax.set_ylabel('Mean Value')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bootstrap plot of the target column. The column is named explicitly: the
# previous version read `feature`, a loop variable left over from an earlier
# plotting cell, so the result depended on which cells had been run before.
bootstrap_plot(data['FloodProbability'], size=50, samples=50000, color='blue')

In [None]:
# Row-wise sum of the predictor columns only. Columns are selected by name
# rather than by position: the positional slices started at the first column
# (so `id` was summed too), stopped one predictor short, and changed meaning
# once `fsum` had been appended, so re-running the cell gave a different sum.
feature_cols = [col for col in test_data.columns if col not in ('id', 'fsum')]
train_data['fsum'] = train_data[feature_cols].sum(axis=1)
test_data['fsum'] = test_data[feature_cols].sum(axis=1)

In [None]:
X= train_data.drop(['FloodProbability'], axis=1)
y= train_data['FloodProbability']

In [None]:
test = test_data.drop('id',axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
cat = CatBoostRegressor(random_seed=12,
                        iterations=1500,
                        depth=7,
                        colsample_bylevel=1.0,
                        verbose=False)

In [None]:
cat.fit(X_train,y_train)

In [None]:
prediction = cat.predict(test_data)

In [None]:
# Column labels of the sample submission template.
submission_data.columns

In [None]:
id = submission_data['id']

In [None]:
submission = pd.DataFrame({'id':id, 'FloodProbability': prediction})
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
submission.head()

### Second approach: AutoGluon with row-wise statistical features

In [1]:
!pip install autogluon.tabular

import numpy as np
import pandas as pd
import os
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Collecting autogluon.tabular
  Downloading autogluon.tabular-1.1.0-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.5/308.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn<1.4.1,>=1.3.0 (from autogluon.tabular)
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting autogluon.core==1.1.0 (from autogluon.tabular)
  Downloading autogluon.core-1.1.0-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==1.1.0 (from autogluon.tabular)
  Downloading autogluon.features-1.1.0-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 kB[0m [31m858.1 kB/s[0m eta [36m0:00:00[0m
Co

In [None]:
# Load the training table as an AutoGluon TabularDataset and preview it.
data_url_train = '/kaggle/input/playground-series-s4e5/train.csv'
train_data = TabularDataset(data_url_train)
train_data.head(n=5)

In [None]:
# Load the test table as an AutoGluon TabularDataset and preview it.
data_url_test = '/kaggle/input/playground-series-s4e5/test.csv'
test_data = TabularDataset(data_url_test)
test_data.head(n=5)

In [None]:
# Names of the raw predictor columns: every test column except the identifier.
initial_features = test_data.columns.drop("id").tolist()
initial_features

In [None]:
# Distinct values that occur in any raw predictor column of either table.
unique_vals = []
for df in [train_data, test_data]:
    for col in initial_features:
        unique_vals.extend(df[col].unique())

# Deduplicate across columns and tables.
unique_vals = list(set(unique_vals))
unique_vals

In [None]:
# Add the same row-wise summary features to both tables, in place.
for df in [train_data, test_data]:
    # Snapshot of the raw predictors; the new columns are written to `df`
    # only, so this selection stays valid for the whole iteration.
    raw = df[initial_features]

    # Moments of each row across the raw predictors.
    df['fsum'] = raw.sum(axis=1)
    df['fstd'] = raw.std(axis=1)
    # Boolean flag: row sum equals one of 72, 73, 74, 75.
    df['special1'] = df['fsum'].isin(np.arange(72, 76))
    df['fskew'] = raw.skew(axis=1)
    df['fkurtosis'] = raw.kurtosis(axis=1)

    # Row-wise quantiles, stored as q_0, q_20, ..., q_100.
    for i in [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]:
        df['q_{}'.format(int(i*100))] = raw.quantile(i, axis=1)

    # How many raw predictors in the row are equal to each observed value.
    for v in unique_vals:
        df['cnt_{}'.format(v)] = raw.eq(v).sum(axis=1)

In [None]:
# Hold out 10% of the rows (identifier removed), stratified on the target values.
modelling_frame = train_data.drop(columns=["id"])
X_train, X_test = train_test_split(
    modelling_frame,
    test_size=0.1,
    random_state=42,
    stratify=train_data["FloodProbability"],
)

In [None]:
# Hyperparameter search settings handed to AutoGluon.
hyperparameter_tune_kwargs = dict(
    num_trials=40,
    scheduler='local',
    searcher='auto',
)

# Regression on FloodProbability, scored with R^2.
predictor = TabularPredictor(
    label='FloodProbability',
    problem_type="regression",
    eval_metric='r2',
)

# Training options; the time limit is 11 hours expressed in seconds.
fit_options = {
    'time_limit': 11 * 60 * 60,
    'hyperparameter_tune_kwargs': hyperparameter_tune_kwargs,
    'presets': 'good_quality',
    'save_space': True,
    'keep_only_best': False,
}
predictor.fit(X_train, **fit_options)

In [None]:
# Score the fitted predictor on the held-out rows.
predictor.evaluate(data=X_test)

In [None]:
# Per-model leaderboard computed on the held-out rows.
LB = predictor.leaderboard(data=X_test)
LB

In [None]:
# Predict on the engineered test table (identifier removed).
test_preds = predictor.predict(test_data.drop(columns=["id"]))
submission = pd.read_csv("/kaggle/input/playground-series-s4e5/sample_submission.csv")
# Bracket assignment always writes a DataFrame column; attribute assignment
# would silently set a plain Python attribute if the column were missing.
submission["FloodProbability"] = test_preds.to_numpy()
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)