In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import bootstrap_plot
import plotly.express as px

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore")

import catboost
import xgboost
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score,make_scorer
from xgboost import XGBRegressor

In [None]:
# Read the train, test and sample-submission CSVs (in that order) from the
# Kaggle input directory.
train_data, test_data, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
        '/kaggle/input/playground-series-s4e5/sample_submission.csv',
    )
)

In [None]:
# Show the (rows, columns) shape of each loaded table, one per line.
print(train_data.shape, test_data.shape, submission_data.shape, sep='\n')

# Working frame for the analysis: the training table without its 'id' column.
data = train_data.drop(columns='id')

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

In [None]:
# Kernel density estimate of the 'FloodProbability' column.
kde_ax = sns.kdeplot(data['FloodProbability'], fill=True, gridsize=100)
kde_ax.set_title('FloodProbability')
kde_ax.grid()
plt.show()

In [None]:
# All column labels of `data`; 'FloodProbability' is one of them.
features = data.columns

In [None]:
# Violin plot of every column in `features`, three panels per row.
# The grid size is derived from the number of columns rather than hard-coded,
# so the cell keeps working when columns are added or removed.
grid_cols = 3
grid_rows = max(1, (len(features) + grid_cols - 1) // grid_cols)

# 25 * rows / 7 reproduces the original 15x25 figure for a seven-row grid.
fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols,
                         figsize=(15, 25 * grid_rows / 7))
axes = axes.flatten()

for i, feature in enumerate(features):
    sns.violinplot(y=data[feature], ax=axes[i])
    axes[i].set_title(feature)
    axes[i].set_xlabel('')

# Hide panels left over when the column count is not a multiple of three.
for spare_ax in axes[len(features):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of every column in `features`, three panels per row.
# The grid size is derived from the number of columns rather than hard-coded,
# so the cell keeps working when columns are added or removed.
grid_cols = 3
grid_rows = max(1, (len(features) + grid_cols - 1) // grid_cols)

# 25 * rows / 7 reproduces the original 15x25 figure for a seven-row grid.
fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols,
                         figsize=(15, 25 * grid_rows / 7))
axes = axes.flatten()

for i, feature in enumerate(features):
    sns.boxplot(y=data[feature], ax=axes[i])
    # The setters are called for their effect only; their return values are
    # not needed, so nothing is bound to a name here.
    axes[i].set_title(feature)
    axes[i].set_xlabel('')

# Hide panels left over when the column count is not a multiple of three.
for spare_ax in axes[len(features):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Bar plot of FloodProbability grouped by MonsoonIntensity level (seaborn's
# default estimator and error bars).
bar_ax = sns.barplot(x='MonsoonIntensity', y='FloodProbability', data=data)
bar_ax.set_title('FloodProbability by MonsoonIntensity')
# Explicit show() matches the other plotting cells and avoids echoing the
# Axes repr as the cell output.
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `data`.
corr_matrix = data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(25, 25))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Keep a reproducible random 20% of the rows (test_size=0.8 is the part that
# is discarded) -- presumably so the heavier plots run on a smaller frame.
# NOTE: `Data` (capital D) is this subsample; `data` is the full table.
# Indexing with [0] takes the kept part directly instead of unpacking the
# discarded part into `_`, which would shadow IPython's last-output variable.
Data = train_test_split(data, test_size=0.8, random_state=42)[0]

In [None]:
# Interactive 3-D scatter of three columns from the subsample `Data`,
# coloured by 'FloodProbability'.
axis_columns = {
    'x': 'RiverManagement',
    'y': 'WetlandLoss',
    'z': 'AgriculturalPractices',
}
fig = px.scatter_3d(Data, color='FloodProbability', **axis_columns)
fig.show()

In [None]:
# Histogram of every column of `data`, laid out three panels per row.
hist_grid_cols = 3
num_cols = len(data.columns)
# Enough rows to accommodate all columns (at least one, so subplots() is valid).
num_rows = max(1, int(np.ceil(num_cols / hist_grid_cols)))

# squeeze=False keeps `axs` two-dimensional even when a single row suffices,
# so the [row, column] indexing below always works.
fig, axs = plt.subplots(num_rows, hist_grid_cols,
                        figsize=(20, num_rows * 5), squeeze=False)
for i, col in enumerate(data.columns):
    ax = axs[i // hist_grid_cols, i % hist_grid_cols]
    ax.hist(data[col], bins=10)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Hide the panels of the last row that received no histogram.
for spare_ax in axs.flat[num_cols:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Histogram of 'FloodProbability' with a KDE curve overlaid.
target_fig, target_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['FloodProbability'], kde=True, ax=target_ax)
target_ax.set_title('Distribution of Flood Probability')
target_ax.set_xlabel('Flood Probability')
target_ax.set_ylabel('Frequency')
target_ax.grid(True, linestyle=':', alpha=0.7)
plt.show()

In [None]:
# Side-by-side box plots of every column in `features` on one shared axis.
plt.figure(figsize=(25, 10))
sns.boxplot(data=data[features])
plt.xticks(rotation=90)  # column names are long; rotate so they do not overlap
plt.title('Features Boxplot', fontsize=20)
# The y-axis of a box plot carries the column values, not counts.
plt.ylabel('Value', fontsize=20)
plt.show()

In [None]:
# Columns shown in the pair plot; `subset_features` is reused later in the
# notebook, so the name is kept.
subset_features = [
    'MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
    'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
]

# Pairwise scatter / distribution grid on the subsample `Data`.
pair_grid = sns.pairplot(Data[subset_features])
# plt.title() would only label the last subplot of the grid; a figure-level
# suptitle labels the whole figure.  y > 1 lifts it clear of the top row.
pair_grid.figure.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

In [None]:
# One stacked panel per selected column of the subsample, using pandas'
# default plot kind.
selected = Data[subset_features]
selected.plot(subplots=True)
# Add vertical space between the stacked panels of the figure just created.
plt.gcf().subplots_adjust(hspace=0.5)
plt.show()

In [None]:
def plot_two_plots(data: "pd.DataFrame") -> None:
    """Draw a violin plot and a histogram (with KDE) for every column of `data`.

    The figure has one row per column: the violin plot on the left and the
    histogram on the right.  The number of columns is printed before plotting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted.  An empty frame (no columns) prints
        the count and returns without creating a figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    num_features = len(data.columns)
    num_cols = 2
    # Each feature needs a full row (violin + histogram).  The previous
    # (num_features + 1) // 2 allocated only half the required panels, and the
    # resulting IndexError was swallowed, silently skipping half the features.
    num_rows = num_features

    print("Number of features:", num_features)

    if num_features == 0:
        # plt.subplots() rejects a zero-row grid; there is nothing to draw.
        return

    # squeeze=False keeps `axs` two-dimensional even for a single feature, so
    # iterating over it always yields (left, right) pairs of axes.
    fig, axs = plt.subplots(num_rows, num_cols,
                            figsize=(15, num_rows * 5), squeeze=False)

    for (violin_ax, hist_ax), feature in zip(axs, data.columns):
        sns.violinplot(y=data[feature], ax=violin_ax)
        violin_ax.set_title(f'Violinplot of {feature}')
        sns.histplot(data=data, x=feature, kde=True, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {feature}')

    plt.tight_layout()
    plt.show()