In [34]:
import numpy as np
import pandas as pd 
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

train = pd.read_csv('data/reduced_new_train.csv', index_col='id')
test = pd.read_csv('data/test.csv', index_col='id')
# X = train.drop(['Rings'], axis=1)
# y = train['Rings']
test['Rings'] = 0
X = pd.get_dummies(train)
X_test = pd.get_dummies(test)

features = X.columns.tolist()

# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)

X_train, X_val = train_test_split(X, test_size=0.2)

X_train = pd.DataFrame(X_train, columns=features)
X_val = pd.DataFrame(X_val, columns=features)
X_test = pd.DataFrame(X_test, columns=features)
X = pd.DataFrame(X, columns=features)

class TabularDataset(Dataset):
    def __init__(self, data, target_column):
        self.data = data.drop(target_column, axis=1)
        
        # Convert data to float32 dtype for PyTorch and handle non-numeric issues
        for col in self.data.columns:
            self.data[col] = pd.to_numeric(self.data[col], errors='coerce')  # Coerce errors will convert non-numeric to NaN
        
        # Fill NaN values with the mean (or median, zero, etc.)
        self.data.fillna(self.data.mean(), inplace=True)
        
        self.targets = data[target_column].astype(float).values  # Ensure target is also float
        
        # Convert DataFrame to numpy array
        self.data = self.data.values.astype(np.float32)
        self.targets = self.targets.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = torch.from_numpy(self.data[idx])  # Now safely convert to tensor
        y = torch.tensor(self.targets[idx], dtype=torch.float32)
        return x, y

train_loader = DataLoader(TabularDataset(X_train, 'Rings'), batch_size=128, shuffle=True, pin_memory=True)
val_loader = DataLoader(TabularDataset(X_val, 'Rings'), batch_size=128, shuffle=False, pin_memory=True)
test_loader = DataLoader(TabularDataset(X_test, 'Rings'), batch_size=128, shuffle=False, pin_memory=True)
full_train_loader = DataLoader(TabularDataset(X, 'Rings'), batch_size=128, shuffle=True, pin_memory=True)

y_train = X_train['Rings']
y_val = X_val['Rings']
y = X['Rings']
X_train = X_train.drop(['Rings'], axis=1)
X_val = X_val.drop(['Rings'], axis=1)
X = X.drop(['Rings'], axis=1)

X_test = X_test.drop(['Rings'], axis=1)

cuda


In [35]:
def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative since log of negative is undefined
    y_pred[y_pred < 0] = 0
    log_pred = np.log1p(y_pred)
    log_true = np.log1p(y_true)
    return np.sqrt(np.mean((log_pred - log_true) ** 2))


def rmsle_torch(y_pred, y_true):
    y_pred = torch.clamp(y_pred, 0, None)
    return torch.sqrt(F.mse_loss(torch.log1p(y_pred), torch.log1p(y_true)))


In [36]:
class CNNRegressor(nn.Module):
    def __init__(self, num_features):
        super(CNNRegressor, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=num_features, out_channels=32, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=128, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=1)
        self.conv4 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=1)
        self.adaptive_pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = x.unsqueeze(2)  # Reshape x to [batch_size, num_features, 1] for Conv1D
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.adaptive_pool(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [37]:
class NNRegressor(nn.Module):
    def __init__(self, input_size):
        super(NNRegressor, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(num_features=128)
        self.dropout1 = nn.Dropout(0.1)
        
        self.fc2 = nn.Linear(128, 512)
        self.bn2 = nn.BatchNorm1d(num_features=512)
        self.dropout2 = nn.Dropout(0.2)
        
        self.fc3 = nn.Linear(512, 512)
        self.bn3 = nn.BatchNorm1d(num_features=512)
        self.dropout3 = nn.Dropout(0.2)
        
        self.fc4 = nn.Linear(512, 128)
        self.bn4 = nn.BatchNorm1d(num_features=128)
        self.dropout4 = nn.Dropout(0.1)
        
        self.output = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        
        x = F.relu(self.bn4(self.fc4(x)))
        x = self.dropout4(x)
        
        x = self.output(x)
        return x

In [38]:
# LGBM Model with 0.14824868023245932
# Best hyperparameters:{'learning_rate': 0.031683333363955915, 'n_estimators': 957, 'max_depth': 16, 'num_leaves': 86, 'min_child_samples': 43, 'subsample': 0.9556403670916472, 'colsample_bytree': 0.6768299497756863}
# XGBoosts Model with 0.14895712403656916
# Best hyperparameters:{'learning_rate': 0.015135259972875157, 'n_estimators': 722, 'max_depth': 9, 'min_child_weight': 29, 'subsample': 0.822530521576753, 'colsample_bytree': 0.7400797719516602, 'reg_alpha': 3.1216939887945054e-06, 'reg_lambda': 0.010567667191794925}
# CatBoost Model with 0.1488679203959043
# Best hyperparameters:{'learning_rate': 0.10083192209799738, 'n_estimators': 894, 'max_depth': 7, 'l2_leaf_reg': 1.9327532000297312, 'random_strength': 0.0021895994884562485, 'bootstrap_type': 'MVS'}
# RF Model with 0.15002047134998422
# Best hyperparameters:{'n_estimators': 479, 'max_depth': 17, 'min_samples_split': 39, 'min_samples_leaf': 4}
# KNN Model with 0.15477076651737331
# Best hyperparameters:{'n_neighbors': 30, 'weights': 'uniform', 'metric': 'manhattan'}

lgb_model = lgb.LGBMRegressor(learning_rate=0.031683333363955915, n_estimators=957, max_depth=16, num_leaves=86, min_child_samples=43, subsample=0.9556403670916472, colsample_bytree=0.6768299497756863, verbose=-1)
xgb_model = xgb.XGBRegressor(learning_rate=0.015135259972875157, n_estimators=722, max_depth=9, min_child_weight=29, subsample=0.822530521576753, colsample_bytree=0.7400797719516602, reg_alpha=3.1216939887945054e-06, reg_lambda=0.010567667191794925, verbosity=0)
catboost_model = cb.CatBoostRegressor(learning_rate=0.10083192209799738, n_estimators=894, max_depth=7, l2_leaf_reg=1.9327532000297312, random_strength=0.0021895994884562485, bootstrap_type='MVS', verbose=0)
# rf_model = RandomForestRegressor(n_estimators=479, max_depth=17, min_samples_split=39, min_samples_leaf=4)
# knn_model = KNeighborsRegressor(n_neighbors=30, weights='uniform', metric='manhattan')
cnn_model = CNNRegressor(num_features=10).to(device)
nn_model = NNRegressor(input_size=10).to(device)

print("NN Model =================================================")

nn_model.train()
nn_optimizer = optim.Adam(nn_model.parameters(), lr=0.001)
for epoch in range(100): 
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        predictions = nn_model(inputs)
        loss = rmsle_torch(predictions, targets)

        # Backward pass and optimize
        nn_optimizer.zero_grad()
        loss.backward()
        nn_optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

print("CNN Model =================================================")

cnn_model.train()
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)
for epoch in range(100):
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        cnn_optimizer.zero_grad()
        outputs = cnn_model(inputs)
        loss = rmsle_torch(outputs.squeeze(), targets)
        loss.backward()
        cnn_optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


print("LGBM Model =================================================")
lgb_model.fit(X_train, y_train)
print("XGBoost Model ==============================================")
xgb_model.fit(X_train, y_train)
print("CatBoost Model =============================================")
catboost_model.fit(X_train, y_train)
print("RF Model ===================================================")
# rf_model.fit(X_train, y_train)
print("KNN Model ==================================================")
# knn_model.fit(X_train, y_train)

with torch.no_grad():
    nn_model.eval()
    nn_preds = []
    for inputs, _ in val_loader:
        inputs = inputs.to(device)
        outputs = nn_model(inputs)
        nn_preds.extend(outputs.squeeze().cpu().numpy())

with torch.no_grad():
    cnn_model.eval()
    cnn_preds = []
    for inputs, _ in val_loader:
        inputs = inputs.to(device)
        outputs = cnn_model(inputs)
        cnn_preds.extend(outputs.squeeze().cpu().numpy())

nn_preds = np.array(nn_preds)
cnn_preds = np.array(cnn_preds)

lgb_preds = lgb_model.predict(X_val)
xgb_preds = xgb_model.predict(X_val)
catboost_preds = catboost_model.predict(X_val)
# rf_preds = rf_model.predict(X_val)
# knn_preds = knn_model.predict(X_val)



  return torch.sqrt(F.mse_loss(torch.log1p(y_pred), torch.log1p(y_true)))
  return torch.sqrt(F.mse_loss(torch.log1p(y_pred), torch.log1p(y_true)))


Epoch 1, Loss: 0.26551562547683716
Epoch 2, Loss: 0.28324252367019653
Epoch 3, Loss: 0.3260224163532257
Epoch 4, Loss: 0.3679245114326477
Epoch 5, Loss: 0.2515692710876465
Epoch 6, Loss: 0.2579304873943329
Epoch 7, Loss: 0.2605358064174652
Epoch 8, Loss: 0.2680610120296478
Epoch 9, Loss: 0.2908575236797333
Epoch 10, Loss: 0.2829907238483429
Epoch 11, Loss: 0.2790223956108093
Epoch 12, Loss: 0.2582715153694153
Epoch 13, Loss: 0.25248679518699646
Epoch 14, Loss: 0.31817397475242615
Epoch 15, Loss: 0.271912157535553
Epoch 16, Loss: 0.2896386384963989
Epoch 17, Loss: 0.28876549005508423
Epoch 18, Loss: 0.2653987407684326
Epoch 19, Loss: 0.23733606934547424
Epoch 20, Loss: 0.29549524188041687
Epoch 21, Loss: 0.32832083106040955
Epoch 22, Loss: 0.3665827214717865
Epoch 23, Loss: 0.3224899172782898
Epoch 24, Loss: 0.28261059522628784
Epoch 25, Loss: 0.2900383174419403
Epoch 26, Loss: 0.30632874369621277
Epoch 27, Loss: 0.2981785535812378
Epoch 28, Loss: 0.3141362965106964
Epoch 29, Loss: 0.27

In [39]:
def objective(trial):
    # Suggest weights for each model
    weights = {
        'nn': trial.suggest_float('nn_weight', 0, 1),
        'cnn': trial.suggest_float('cnn_weight', 0, 1),
        'lgb': trial.suggest_float('lgb_weight', 0, 1),
        'xgb': trial.suggest_float('xgb_weight', 0, 1),
        'catboost': trial.suggest_float('catboost_weight', 0, 1),
        # 'rf': trial.suggest_float('rf_weight', 0, 1),
        # 'knn': trial.suggest_float('knn_weight', 0, 1)
    }
    
    # Initialize and train models
    preds = {
        'nn': nn_preds,
        'cnn': cnn_preds,
        'lgb': lgb_preds,
        'xgb': xgb_preds,
        'catboost': catboost_preds,
        # 'rf': rf_preds,
        # 'knn': knn_preds
    }
    
    # Ensemble predictions
    weighted_preds = np.sum(np.array([weights[name] * preds[name] for name in preds.keys()]), axis=0) / np.sum(list(weights.values()))
    
    # Calculate and return RMSLE
    return rmsle(y_val, weighted_preds)



study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)

# Print best weights and best RMSLE 
print('Best weights:', study.best_params)
print('Best RMSLE:', study.best_value)


[I 2024-04-21 20:17:25,812] A new study created in memory with name: no-name-6081a484-44c1-40ca-b3db-e63e74b43379
[I 2024-04-21 20:17:25,824] Trial 0 finished with value: 0.16526178187633234 and parameters: {'nn_weight': 0.9133370900364772, 'cnn_weight': 0.16877391785429585, 'lgb_weight': 0.5663928956035407, 'xgb_weight': 0.9048468600549301, 'catboost_weight': 0.7735292196101309}. Best is trial 0 with value: 0.16526178187633234.
[I 2024-04-21 20:17:25,834] Trial 1 finished with value: 0.265321098461442 and parameters: {'nn_weight': 0.7015750688823474, 'cnn_weight': 0.6947789405815467, 'lgb_weight': 0.33285307571648004, 'xgb_weight': 0.7847068735284313, 'catboost_weight': 0.6640085250188665}. Best is trial 0 with value: 0.16526178187633234.
[I 2024-04-21 20:17:25,842] Trial 2 finished with value: 0.339855546850542 and parameters: {'nn_weight': 0.669559371303048, 'cnn_weight': 0.8205129069709539, 'lgb_weight': 0.5099871350958382, 'xgb_weight': 0.8301010049366689, 'catboost_weight': 0.002

Best weights: {'nn_weight': 0.0007763399373129122, 'cnn_weight': 0.030659176288506687, 'lgb_weight': 0.7696936633358512, 'xgb_weight': 0.621900868671136, 'catboost_weight': 0.9419429099134051}
Best RMSLE: 0.14465742326404502


In [40]:
# train new model with best weights
weights = study.best_params

# rename weights to be 
names = ['nn_weight', 'cnn_weight', 'lgb_weight', 'xgb_weight', 'catboost_weight']#, 'rf_weight', 'knn_weight']
weights = {i: weights[i] for i in names}

lgb_model = lgb.LGBMRegressor(learning_rate=0.031683333363955915, n_estimators=957, max_depth=16, num_leaves=86, min_child_samples=43, subsample=0.9556403670916472, colsample_bytree=0.6768299497756863, verbose=-1)
xgb_model = xgb.XGBRegressor(learning_rate=0.015135259972875157, n_estimators=722, max_depth=9, min_child_weight=29, subsample=0.822530521576753, colsample_bytree=0.7400797719516602, reg_alpha=3.1216939887945054e-06, reg_lambda=0.010567667191794925, verbosity=0)
catboost_model = cb.CatBoostRegressor(learning_rate=0.10083192209799738, n_estimators=894, max_depth=7, l2_leaf_reg=1.9327532000297312, random_strength=0.0021895994884562485, bootstrap_type='MVS', verbose=0)
# rf_model = RandomForestRegressor(n_estimators=479, max_depth=17, min_samples_split=39, min_samples_leaf=4)
# knn_model = KNeighborsRegressor(n_neighbors=30, weights='uniform', metric='manhattan')
cnn_model = CNNRegressor(num_features=10).to(device)
nn_model = NNRegressor(input_size=10).to(device)

nn_model.train()
nn_optimizer = optim.Adam(nn_model.parameters(), lr=0.001)
for epoch in range(100): 
    for inputs, targets in full_train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        predictions = nn_model(inputs)
        loss = rmsle_torch(predictions, targets)

        # Backward pass and optimize
        nn_optimizer.zero_grad()
        loss.backward()
        nn_optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


cnn_model.train()
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)
for epoch in range(100):
    for inputs, targets in full_train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        cnn_optimizer.zero_grad()
        outputs = cnn_model(inputs)
        loss = rmsle_torch(outputs.squeeze(), targets)
        loss.backward()
        cnn_optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


lgb_model.fit(X, y)
xgb_model.fit(X, y)
catboost_model.fit(X, y)
# rf_model.fit(X, y)
# knn_model.fit(X, y)

with torch.no_grad():
    nn_model.eval()
    nn_preds = []
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        outputs = nn_model(inputs)
        nn_preds.extend(outputs.squeeze().cpu().numpy())

with torch.no_grad():
    cnn_model.eval()
    cnn_preds = []
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        outputs = cnn_model(inputs)
        cnn_preds.extend(outputs.squeeze().cpu().numpy())

nn_preds = np.array(nn_preds)
cnn_preds = np.array(cnn_preds)


lgb_preds = lgb_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
catboost_preds = catboost_model.predict(X_test)
# rf_preds = rf_model.predict(X_test)
# knn_preds = knn_model.predict(X_test)

preds = {
        'nn': nn_preds,
        'cnn': cnn_preds,
        'lgb': lgb_preds,
        'xgb': xgb_preds,
        'catboost': catboost_preds,
        # 'rf': rf_preds,
        # 'knn': knn_preds
    }


  return torch.sqrt(F.mse_loss(torch.log1p(y_pred), torch.log1p(y_true)))
  return torch.sqrt(F.mse_loss(torch.log1p(y_pred), torch.log1p(y_true)))


Epoch 1, Loss: 0.26845040917396545
Epoch 2, Loss: 0.24629095196723938
Epoch 3, Loss: 0.24886025488376617
Epoch 4, Loss: 0.3437735438346863
Epoch 5, Loss: 0.3094927668571472
Epoch 6, Loss: 0.26904571056365967
Epoch 7, Loss: 0.22600853443145752
Epoch 8, Loss: 0.2736709713935852
Epoch 9, Loss: 0.2674948573112488
Epoch 10, Loss: 0.31855571269989014
Epoch 11, Loss: 0.3128458857536316
Epoch 12, Loss: 0.3039594292640686
Epoch 13, Loss: 0.2760603129863739
Epoch 14, Loss: 0.24165181815624237
Epoch 15, Loss: 0.24895136058330536
Epoch 16, Loss: 0.26488351821899414
Epoch 17, Loss: 0.2567788362503052
Epoch 18, Loss: 0.2515377700328827
Epoch 19, Loss: 0.2608144283294678
Epoch 20, Loss: 0.30121415853500366
Epoch 21, Loss: 0.2732327878475189
Epoch 22, Loss: 0.26442286372184753
Epoch 23, Loss: 0.3036670684814453
Epoch 24, Loss: 0.2929449677467346
Epoch 25, Loss: 0.2531460225582123
Epoch 26, Loss: 0.1891719102859497
Epoch 27, Loss: 0.2852015495300293
Epoch 28, Loss: 0.2566390037536621
Epoch 29, Loss: 0.

In [41]:
weighted_preds = np.sum(np.array([weights[name+"_weight"] * preds[name] for name in preds.keys()]), axis=0) / np.sum(list(weights.values()))

# manually transform back to original scale of column 'Rings'
# rings_min = scaler.data_min_[7]
# rings_max = scaler.data_max_[7]
# weighted_preds = weighted_preds * (rings_max - rings_min) + rings_min

# Save predictions to a CSV file
submission = pd.DataFrame({'id': test.index, 'Rings': weighted_preds})
submission.to_csv('submissions/ensemble5.csv', index=False)