# S4E5 Regression with a Flood Prediction Dataset

## Preparation

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

DATA_PATH = './data/'
ORIGINAL_DATA_PATH = DATA_PATH
SUBMISSIONS_PATH = './submissions/'
MODELS_PATH = './trained_models/'
TEMP_PATH = './temp/'

# Load the data
train = pd.read_csv(DATA_PATH + 'train.csv', index_col='id')
test = pd.read_csv(DATA_PATH + 'test.csv', index_col='id')
original = pd.read_csv(ORIGINAL_DATA_PATH + 'flood.csv')
original.index.rename('id', inplace=True)
new_train = pd.concat([train, original], axis=0)


In [2]:
NON_FEATURES = ['FloodProbability', 'fold']
BASE_FEATURES = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']

# meta features from https://www.kaggle.com/code/igorvolianiuk/flood-prediction-lgbm
def add_features(df):
    df['total'] = df[BASE_FEATURES].sum(axis=1)
    df['amplified_sum'] = (df[BASE_FEATURES] ** 1.5).sum(axis=1)
    df['fskew'] = df[BASE_FEATURES].skew(axis=1)
    df['fkurtosis'] = df[BASE_FEATURES].kurtosis(axis=1)
    df['mean'] = df[BASE_FEATURES].mean(axis=1)
    df['std'] = df[BASE_FEATURES].std(axis=1)
    df['max'] = df[BASE_FEATURES].max(axis=1)
    df['min'] = df[BASE_FEATURES].min(axis=1)
    df['range'] = df['max'] - df['min']
    df['median'] = df[BASE_FEATURES].median(axis=1)
    df['ptp'] = df[BASE_FEATURES].values.ptp(axis=1)
    df['q25'] = df[BASE_FEATURES].quantile(0.25, axis=1)
    df['q75'] = df[BASE_FEATURES].quantile(0.75, axis=1)
    return df

new_train = add_features(new_train)
test = add_features(test)

## Defining model

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Set device to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Data Preparation
X = new_train.drop('FloodProbability', axis=1)
y = new_train['FloodProbability']

scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(test), columns=test.columns)

# Convert the pandas DataFrame to numpy and then to torch Tensors
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_t = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_val_t = torch.tensor(X_val.values, dtype=torch.float32).to(device)
y_val_t = torch.tensor(y_val.values, dtype=torch.float32).to(device)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32).to(device)
X_t = torch.tensor(X.values, dtype=torch.float32).to(device)
y_t = torch.tensor(y.values, dtype=torch.float32).to(device)

# Create DataLoader for the training data
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=2048, shuffle=True)
all_train_loader = DataLoader(TensorDataset(X_t, y_t), batch_size=2048, shuffle=True)

In [6]:

### Define the MLP Model
class MLPRegressor(nn.Module):
    def __init__(self, input_dim=X_train.shape[1]):
        super(MLPRegressor, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # Adjust the input dimension dynamically
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.dropout(self.relu(self.fc3(x)))
        x = self.relu(self.fc4(x))
        return self.fc5(x)

mlp_model = MLPRegressor().to(device)

### Training Parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

### Training Loop
num_epochs = 300
mlp_model.train()
for epoch in range(num_epochs):
    for data, targets in all_train_loader:
        optimizer.zero_grad()
        outputs = mlp_model(data)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
torch.save(mlp_model, MODELS_PATH + 'mlp5.pth')


Epoch 1, Loss: 0.0007724763127043843
Epoch 2, Loss: 0.000412262073950842
Epoch 3, Loss: 0.00043280154932290316
Epoch 4, Loss: 0.0003684451512526721
Epoch 5, Loss: 0.000402844074415043
Epoch 6, Loss: 0.0003893839311785996
Epoch 7, Loss: 0.00038644575397484004
Epoch 8, Loss: 0.00034366524778306484
Epoch 9, Loss: 0.0003396949323359877
Epoch 10, Loss: 0.0003802865685429424
Epoch 11, Loss: 0.0003342445124872029
Epoch 12, Loss: 0.0003552164707798511
Epoch 13, Loss: 0.0003413427621126175
Epoch 14, Loss: 0.0003203046799171716
Epoch 15, Loss: 0.00034489063546061516
Epoch 16, Loss: 0.0003797139797825366
Epoch 17, Loss: 0.00037352429353632033
Epoch 18, Loss: 0.00037184380926191807
Epoch 19, Loss: 0.0003514110576361418
Epoch 20, Loss: 0.00038674272946082056
Epoch 21, Loss: 0.000331233226461336
Epoch 22, Loss: 0.0003847444895654917
Epoch 23, Loss: 0.00037081766640767455
Epoch 24, Loss: 0.00039203488267958164
Epoch 25, Loss: 0.0003863824240397662
Epoch 26, Loss: 0.00037658607470802963
Epoch 27, Loss

In [8]:
### Evaluation
# mlp_model.eval()
# with torch.no_grad():
#     train_predictions = mlp_model(X_train_t).view(-1)
#     score = r2_score(y_train, train_predictions.cpu())
#     print(f'Training R2 Score: {score}')
#     predictions = mlp_model(X_val_t).view(-1)
#     score = r2_score(y_val, predictions.cpu())
#     print(f'Validating R2 Score: {score}')
    
#     tmp = pd.DataFrame({'id': X_val.index, 'FloodProbability': predictions.cpu()})
#     tmp.to_csv(TEMP_PATH + 'mlp_val.csv', index=False)


In [7]:
mlp_model.eval()
with torch.no_grad():
    predictions = mlp_model(X_test_t).view(-1)
    submission = pd.DataFrame({'id': test.index, 'FloodProbability': predictions.cpu()})
    submission.to_csv(SUBMISSIONS_PATH + 'mlp5.csv', index=False)


In [9]:
class CNNRegressor(nn.Module):
    def __init__(self, input_dim=X_train.shape[1]):
        super(CNNRegressor, self).__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=3, stride=1, padding=1)  # Input channels, output channels, kernel size
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * input_dim, 128)  # Adjust size based on output of last conv layer
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add a channel dimension here in the forward pass
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        return self.fc2(x)

cnn_model = CNNRegressor().to(device)

# Training Parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.0005)

# Training Loop
num_epochs = 300
for epoch in range(num_epochs):
    cnn_model.train()
    running_loss = 0.0
    for data, targets in all_train_loader:
        optimizer.zero_grad()
        outputs = cnn_model(data)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
torch.save(cnn_model, MODELS_PATH + 'cnn5.pth')

Epoch 1, Loss: 0.00037581182550638914
Epoch 2, Loss: 0.00038602156564593315
Epoch 3, Loss: 0.00036227694363333285
Epoch 4, Loss: 0.0004159337258897722
Epoch 5, Loss: 0.00035183419822715223
Epoch 6, Loss: 0.0003638999769464135
Epoch 7, Loss: 0.0003570918634068221
Epoch 8, Loss: 0.0003477736026979983
Epoch 9, Loss: 0.0004024760564789176
Epoch 10, Loss: 0.00039086255128495395
Epoch 11, Loss: 0.0003162657667417079
Epoch 12, Loss: 0.0003838306583929807
Epoch 13, Loss: 0.0004138789081480354
Epoch 14, Loss: 0.000355051044607535
Epoch 15, Loss: 0.0004736946430057287
Epoch 16, Loss: 0.00035088456934317946
Epoch 17, Loss: 0.00041215901728719473
Epoch 18, Loss: 0.00038932295865379274
Epoch 19, Loss: 0.0004177643568255007
Epoch 20, Loss: 0.0003422682930249721
Epoch 21, Loss: 0.00038642046274617314
Epoch 22, Loss: 0.0004015204613097012
Epoch 23, Loss: 0.0003421467263251543
Epoch 24, Loss: 0.0003769267932511866
Epoch 25, Loss: 0.0003298541996628046
Epoch 26, Loss: 0.0003442151937633753
Epoch 27, Los

In [None]:
### Evaluation
# cnn_model.eval()
# with torch.no_grad():
#     train_predictions = cnn_model(X_train_t).view(-1)
#     score = r2_score(y_train, train_predictions.cpu())
#     print(f'Training R2 Score: {score}')
#     predictions = cnn_model(X_val_t).view(-1)
#     score = r2_score(y_val, predictions.cpu())
#     print(f'Validating R2 Score: {score}')
    
#     tmp = pd.DataFrame({'id': X_val.index, 'FloodProbability': predictions.cpu()})
#     tmp.to_csv(TEMP_PATH + 'cnn_val.csv', index=False)

In [12]:
cnn_model.eval()
with torch.no_grad():
    predictions = cnn_model(X_test_t).view(-1)
    submission = pd.DataFrame({'id': test.index, 'FloodProbability': predictions.cpu()})
    submission.to_csv(SUBMISSIONS_PATH + 'cnn5.csv', index=False)