In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Data Loading

In [None]:
# Directory that holds the competition CSV files. Keeping it in one constant
# means the notebook can be pointed at another location with a single edit.
DATA_DIR = '/kaggle/input/playground-series-s4e5'

# Read the train and test CSVs from the same directory.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# `df` is a second name for the same DataFrame object as `train_df`;
# no copy is made by this assignment.
df = train_df

# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Preview the first rows of the training frame.
df.head()

# Data Cleaning

In [None]:
# Drop the 'id' column.
# The result is derived from `train_df` rather than from `df` itself so the
# cell is idempotent: re-running it no longer raises a KeyError because 'id'
# was already removed by the previous run.
df = train_df.drop(columns='id')

In [None]:
# Check the data type of every column via the frame's info summary.
df.info()

In [None]:
# Descriptive statistics for the columns of `df`.
df.describe()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Show the column labels of `df`.
df.columns

In [None]:
# Column names split into two groups; each group is drawn in its own boxplot
# figure. `features_2` also contains 'FloodProbability', the column that is
# later used as the regression target.
features_1 = [
    'MonsoonIntensity',
    'TopographyDrainage',
    'RiverManagement',
    'Deforestation',
    'Urbanization',
    'ClimateChange',
    'DamsQuality',
    'Siltation',
    'AgriculturalPractices',
    'Encroachments',
    'IneffectiveDisasterPreparedness',
    'DrainageSystems',
]

features_2 = [
    'CoastalVulnerability',
    'Landslides',
    'Watersheds',
    'DeterioratingInfrastructure',
    'PopulationScore',
    'WetlandLoss',
    'InadequatePlanning',
    'PoliticalFactors',
    'FloodProbability',
]

In [None]:
# Boxplots for the first feature group, drawn on an explicitly created axes.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[features_1], ax=box_ax)

# Vertical x tick labels so the column names fit next to each other.
plt.xticks(rotation=90)
box_ax.set_title('Boxplot Matrix')

plt.show()

In [None]:
# Boxplots for the second feature group, drawn on an explicitly created axes.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[features_2], ax=box_ax)

# Vertical x tick labels so the column names fit next to each other.
plt.xticks(rotation=90)
box_ax.set_title('Boxplot Matrix')

plt.show()

In [None]:
# Boxplot of the 'FloodProbability' column on its own, in a small square figure.
box_fig, box_ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=df['FloodProbability'], ax=box_ax)

# Same tick rotation and title as the other boxplot cells.
plt.xticks(rotation=90)
box_ax.set_title('Boxplot Matrix')

plt.show()

In [None]:
# Remove outlier rows using the IQR rule: a row is dropped when ANY of its
# values lies more than IQR_MULTIPLIER * IQR below Q1 or above Q3 of its column.
# NOTE(review): the bounds are computed for every column of `df`, so rows are
# also filtered on 'FloodProbability' (used later as the regression target) —
# confirm this is intended.
IQR_MULTIPLIER = 1.5

Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - IQR_MULTIPLIER * IQR
upper_bound = Q3 + IQR_MULTIPLIER * IQR

# True for rows that have no value outside the per-column bounds.
condition = ~((df < lower_bound) | (df > upper_bound)).any(axis=1)

# Take an explicit copy: `df_cl` is modified by later cells, and an independent
# frame guarantees those writes never touch `df` and avoids pandas'
# SettingWithCopyWarning. All columns are kept, so no column selector is needed.
df_cl = df.loc[condition].copy()

In [None]:
all_features = df_cl.columns

# Only the predictors are standardized. 'FloodProbability' is the regression
# target; scaling it as well made the reported MAE/MSE come out in
# standard-deviation units instead of probability units.
predictor_cols = all_features.drop('FloodProbability')

# Standardization
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-row statistics influence the transform. Fitting on the training rows
# only would be the leak-free ordering — confirm whether that matters here.
scaler = StandardScaler()
df_cl[predictor_cols] = scaler.fit_transform(df_cl[predictor_cols])

In [None]:
# Count rows of `df_cl` that are exact duplicates of an earlier row.
df_cl.duplicated().sum()

In [None]:
# Count missing values per column of `df_cl`.
df_cl.isna().sum()

In [None]:
# Descriptive statistics of `df_cl` after the scaling step.
df_cl.describe()

In [None]:
# Grid geometry: a fixed number of columns and as many rows as are needed to
# give every column of `df_cl` its own panel.
num_vars = df_cl.shape[1]
n_cols = 4
n_rows = (num_vars + n_cols - 1) // n_cols  # integer ceiling of num_vars / n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 4))
axes = axes.flatten()  # 1-D array so panels can be paired with columns in order

# One histogram per column, in column order.
for panel, name in zip(axes, df_cl.columns):
    df_cl[name].hist(ax=panel, bins=20, edgecolor='black', grid=False)
    panel.set_title(name)
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')

# Panels beyond the last column received no histogram; remove them.
for spare in axes[num_vars:]:
    fig.delaxes(spare)

# Tighten spacing between panels, then render.
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between all columns of `df_cl`.
corr_matrix = df_cl.corr()

# Annotated heatmap of the matrix, drawn on an explicitly created axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Correlation of every column with the target column.
target_corr = df_cl.corr()['FloodProbability']

# Rank the features by the absolute value of their correlation. The target's
# own entry (its correlation with itself) is dropped so it does not appear as
# the tallest bar and flatten the rest of the chart.
target_corr_sorted = (
    target_corr
    .drop('FloodProbability')
    .abs()
    .sort_values(ascending=False)
)

corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
target_corr_sorted.plot(kind='bar', ax=corr_ax)
corr_ax.set_title('Correlation with Flood Probability')
corr_ax.set_xlabel('Features')
# The bars show absolute values, so the axis label says so.
corr_ax.set_ylabel('Absolute Correlation Coefficient')
plt.show()

In [None]:
# Target vector (y) and feature matrix (X), both taken from `df_cl`.
y = df_cl['FloodProbability']
X = df_cl.drop(columns=['FloodProbability'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=44,
)

# Linear Regression

In [None]:
# Fit a linear regression on the training split.
LR = LinearRegression()
LR.fit(x_train, y_train)

# Predict the held-out rows and score the predictions against the true values.
y_pred = LR.predict(x_test)

mae_LR = mean_absolute_error(y_test, y_pred)
mse_LR = mean_squared_error(y_test, y_pred)
r2_LR = r2_score(y_test, y_pred)

# Report the three metrics, one per line.
print(
    f"MAE: {mae_LR}",
    f"MSE: {mse_LR}",
    f"R²: {r2_LR}",
    sep="\n",
)