In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input mount,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set from the Kaggle input mount.
train_csv_path = '/kaggle/input/playground-series-s4e5/train.csv'
df = pd.read_csv(train_csv_path)


## Data Cleaning


In [None]:
# Feature matrix only: the row identifier and the target column are excluded.
# DataFrame.drop returns a new object, so `df` itself is left untouched.
cleaned_train_df = df.drop(columns=['id', 'FloodProbability'])

# Number of standard deviations above the column mean beyond which a value
# is treated as an outlier.
multiplier = 3

# Per-column upper cutoff: mean + multiplier * std (only the high side is checked).
column_means = cleaned_train_df.mean()
column_stds = cleaned_train_df.std()
upper_thresholds = column_means + multiplier * column_stds

# Boolean mask: True for every row with at least one feature above its column's cutoff.
exceeds_cutoff = cleaned_train_df > upper_thresholds
outlier_rows = exceeds_cutoff.any(axis='columns')

# Keep the non-outlier rows, filtering features and target with the same mask
# so the two stay aligned row for row.
keep_rows = ~outlier_rows
cleaned_train_df = cleaned_train_df.loc[keep_rows]
cleaned_target_df = df.loc[keep_rows, 'FloodProbability']

# Report sizes before and after filtering. Note that `df` still carries the
# 'id' and 'FloodProbability' columns, so the column counts differ by two.
print("Original DataFrame shape:", df.shape)
print("Cleaned DataFrame shape:", cleaned_train_df.shape)


## Modelling and Hyperparameter Tuning



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import keras_tuner as kt

import tensorflow as tf
from tensorflow.keras import backend as K

def r2_score_metric(y_true, y_pred):
    """Coefficient of determination (R²) for use as a Keras metric.

    Computes ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` is the sum of squared deviations of ``y_true``
    from its own mean. ``K.epsilon()`` is added to the denominator so a
    constant ``y_true`` does not cause a division by zero.

    Args:
        y_true: Ground-truth target values.
        y_pred: Model predictions holding the same number of elements as
            ``y_true``.

    Returns:
        A scalar tensor with the R² of the values supplied.

    Note:
        When passed via ``metrics=[...]`` Keras evaluates the function on each
        batch and reports the average of those values, so the number shown
        during training can differ from ``sklearn.metrics.r2_score`` computed
        on the full dataset.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)

    # Flatten both tensors: a (batch,) target against a (batch, 1) prediction
    # would otherwise broadcast to (batch, batch) and silently corrupt the sums.
    y_true = tf.reshape(y_true, [-1])
    y_pred = tf.reshape(y_pred, [-1])

    ss_res = K.sum(K.square(y_true - y_pred))
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - ss_res / (ss_tot + K.epsilon())


# Hold out 30% of the cleaned rows for final evaluation; the fixed
# random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_train_df,
    cleaned_target_df,
    test_size=0.3,
    random_state=11,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no information from the hold-out rows is used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def build_model(hp):
    """Assemble and compile a feed-forward regressor from tuner-supplied hyperparameters.

    Search space requested from ``hp``:
      * ``num_layers``: integer in 1..3, the number of hidden blocks.
      * For each hidden block ``i``:
          - ``units_i``: integer in 32..512, step 32 (Dense width).
          - ``l1_i`` / ``l2_i``: float in 1e-5..1e-2, log-sampled (kernel penalties).
          - ``dropout_i``: float in 0.0..0.5, step 0.1 (Dropout rate).
      * ``learning_rate``: float in 1e-4..1e-2, log-sampled (Adam).

    The input width is read from the notebook-level ``X_train_scaled``.

    Args:
        hp: Hyperparameter container passed in by the tuner; must expose
            ``Int`` and ``Float``.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one linear output unit,
        mean-squared-error loss and ``r2_score_metric`` as its metric.
    """
    n_features = X_train_scaled.shape[1]
    layers = tf.keras.layers

    net = tf.keras.Sequential()
    net.add(layers.Input(shape=(n_features,)))

    n_hidden = hp.Int('num_layers', 1, 3)
    for idx in range(n_hidden):
        # Hyperparameters are requested in a fixed order per block:
        # units, l1, l2, then dropout.
        width = hp.Int(f'units_{idx}', min_value=32, max_value=512, step=32)
        l1_strength = hp.Float(f'l1_{idx}', 1e-5, 1e-2, sampling='LOG')
        l2_strength = hp.Float(f'l2_{idx}', 1e-5, 1e-2, sampling='LOG')
        penalty = tf.keras.regularizers.l1_l2(l1=l1_strength, l2=l2_strength)
        net.add(layers.Dense(units=width, activation='relu', kernel_regularizer=penalty))

        drop_rate = hp.Float(f'dropout_{idx}', 0.0, 0.5, step=0.1)
        net.add(layers.Dropout(drop_rate))

    # Single linear unit: the model regresses one continuous target.
    net.add(layers.Dense(1))

    step_size = hp.Float('learning_rate', 1e-4, 1e-2, sampling='LOG')
    adam = tf.keras.optimizers.Adam(learning_rate=step_size)
    net.compile(optimizer=adam, loss='mean_squared_error', metrics=[r2_score_metric])
    return net

# Random-search tuner over the space defined in build_model, ranked by
# validation loss. Each of the 10 trials is trained twice.
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=2,
    # Fixed seed so the same hyperparameter combinations are drawn on every run.
    seed=11,
    directory='my_dir',
    project_name='intro_to_kt',
    # Start from a clean project. Without this, re-running the cell reloads the
    # trials already stored under my_dir/intro_to_kt instead of searching again,
    # so changes to the data or to build_model would be silently ignored.
    overwrite=True,
)

# Run the search; 20% of the training rows are held out to score each trial.
tuner.search(X_train_scaled, y_train, epochs=11, validation_split=0.2)

# Retrieve the model belonging to the best-scoring trial.
best_model = tuner.get_best_models(num_models=1)[0]

# Score that model on the hold-out split, which the search never saw.
y_pred = best_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
print("R2 Score of the best model:", r2)


In [None]:
# Print the architecture of the model selected by the tuner.
best_model.summary()



In [None]:
# Fetch the top-ranked hyperparameter set from the finished search
# and list each name/value pair on its own line.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters:")
for hp_name, hp_value in best_hps.values.items():
    print(f"{hp_name}: {hp_value}")

In [None]:
def build_fixed_model():
    """Build and compile a regression network with a hard-coded architecture.

    Layout: input sized to the columns of the notebook-level
    ``X_train_scaled`` -> Dense(242, relu) -> Dense(12, relu) -> Dense(1).
    Compiled with Adam (learning rate 0.001), mean-squared-error loss and
    ``r2_score_metric`` as the reported metric.

    Returns:
        The compiled ``tf.keras.Sequential`` model (untrained).
    """
    n_features = X_train_scaled.shape[1]

    network = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(242, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(1),
    ])

    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    network.compile(optimizer=adam, loss='mean_squared_error', metrics=[r2_score_metric])
    return network

# Instantiate the hand-configured network.
fixed_model = build_fixed_model()

# Show its layers before training.
fixed_model.summary()

# Fit for 5 epochs, holding out 20% of the training rows for validation.
fixed_model.fit(x=X_train_scaled, y=y_train, validation_split=0.2, epochs=5)



## Evaluate the Model


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Predict on the hold-out split with the fixed-architecture model.
y_pred = fixed_model.predict(X_test_scaled)

# Regression metrics against the hold-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report each metric on its own line.
for label, score in (
    ("R² Score of the fixed model:", r2),
    ("Mean Squared Error of the fixed model:", mse),
    ("Mean Absolute Error of the fixed model:", mae),
):
    print(label, score)

In [None]:

# Load the competition test set for which predictions are submitted.
test_df = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')

# Features only; 'id' stays available in test_df for the submission file.
test1_df = test_df.drop('id', axis=1)

# Apply the scaler fitted on the training split. The result gets its own name
# so the hold-out X_test produced by train_test_split is not overwritten with
# a different dataset (and a different type) if earlier cells are revisited.
submission_features_scaled = scaler.transform(test1_df)
predictions = fixed_model.predict(submission_features_scaled)

# Pair each test id with its predicted value; predict returns a 2-D array,
# so it is flattened to one value per row.
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'FloodProbability': predictions.flatten()
})

# Write the submission file to the working directory.
submission_df.to_csv('submission.csv', index=False)

