In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

from Baseline import Baseline
from RegressionTrainingTools import Trainer
# Use the GPU when one is available; otherwise fall back to the CPU so the notebook
# still runs end-to-end on machines without CUDA instead of failing at `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Read the preprocessed train/test tables with the pyarrow CSV parser.
csv_paths = ('data/train_preprocessed.csv', 'data/test_preprocessed.csv')
x_train, x_test = (pd.read_csv(csv_path, engine='pyarrow') for csv_path in csv_paths)

In [3]:
# Preview the first rows of the training table as loaded, before any scaling is applied.
x_train.head()

Unnamed: 0,ari_co,ari_po,ship_type_category,dist,breadth,built,depth,draught,gt,u_wind,...,air_temperature,bn,ata_lt,dubai,bdi_adj,port_size,ci_hour,month,wind_speed,deadweight_group
0,0,0,0,32.590869,40.0,28,20.0,20.0,86100,-0.256667,...,17.050794,4.127843,21,98.07,1152.45836,0.000113,161.218056,9,1.591468,1
1,0,0,0,35.575496,30.0,20,20.0,10.0,29400,-0.256667,...,17.050794,4.127843,11,99.03,1141.586111,0.000113,95.7675,9,1.591468,1
2,0,0,0,40.909139,40.0,13,20.0,10.0,48200,-0.256667,...,17.050794,4.127843,11,100.39,1135.655794,0.000113,35.445556,9,1.591468,1
3,0,0,0,45.939559,40.0,11,20.0,10.0,58600,-0.256667,...,17.050794,4.127843,11,99.03,1141.586111,0.000113,95.507222,9,1.591468,1
4,0,0,0,15.606497,30.0,11,20.0,10.0,44300,-0.256667,...,17.050794,4.127843,11,99.03,1141.586111,0.000113,99.873056,9,1.591468,1


# Preprocessing

In [4]:
# Rescale the continuous feature columns to the range [-1, 1].
# The scaler is fitted on the training table only and then reused unchanged on the
# test table, so both tables share the training min/max.
scaling_columns = ['dist', 'breadth', 'built', 'depth', 'draught', 'gt', 'air_temperature', 'bn', 'dubai', 'bdi_adj', 'port_size', 'wind_speed']
scaler = MinMaxScaler(feature_range=(-1, 1))

# Assign with df[cols] (column replacement) instead of df.loc[:, cols] (in-place write).
# Some of these columns look integer-typed in the preview (e.g. `built`, `gt`); writing
# float results into them through .loc relies on implicit upcasting, which pandas has
# deprecated. Replacing the columns lets them simply take the scaler's float dtype.
x_train[scaling_columns] = scaler.fit_transform(x_train[scaling_columns])
x_test[scaling_columns] = scaler.transform(x_test[scaling_columns])

In [5]:
# Re-inspect the training table now that the columns in scaling_columns have been
# rescaled to [-1, 1]; the remaining columns are expected to be unchanged.
x_train.head()

Unnamed: 0,ari_co,ari_po,ship_type_category,dist,breadth,built,depth,draught,gt,u_wind,...,air_temperature,bn,ata_lt,dubai,bdi_adj,port_size,ci_hour,month,wind_speed,deadweight_group
0,0,0,0,-0.674062,0.2,-0.211268,0.333333,1.0,-0.274224,-0.256667,...,0.234356,-0.261544,21,0.478923,-0.616037,-0.916749,161.218056,9,-0.898146,1
1,0,0,0,-0.644213,-0.2,-0.43662,0.333333,0.0,-0.753008,-0.256667,...,0.234356,-0.261544,11,0.495715,-0.62086,-0.916749,95.7675,9,-0.898146,1
2,0,0,0,-0.590871,0.2,-0.633803,0.333333,0.0,-0.594258,-0.256667,...,0.234356,-0.261544,11,0.519503,-0.62349,-0.916749,35.445556,9,-0.898146,1
3,0,0,0,-0.540562,0.2,-0.690141,0.333333,0.0,-0.506439,-0.256667,...,0.234356,-0.261544,11,0.495715,-0.62086,-0.916749,95.507222,9,-0.898146,1
4,0,0,0,-0.843922,-0.2,-0.690141,0.333333,0.0,-0.62719,-0.256667,...,0.234356,-0.261544,11,0.495715,-0.62086,-0.916749,99.873056,9,-0.898146,1


In [6]:
# Sanity check on column counts: the training table is expected to have exactly one
# more column than the test table (presumably the `ci_hour` target — hence the -1).
n_train_features = x_train.shape[1] - 1
n_test_features = x_test.shape[1]
print(n_train_features == n_test_features)

True


# Save

In [7]:
# Persist the scaled test table (without the index column) for later inference.
x_test.to_csv(
    'data/test_4dl.csv',
    index=False,
    encoding='UTF-8',
)

# Data Preparation

In [8]:
# Seed shared by the train/validation split and the training-batch shuffle.
SEED = 42

# Regression target: the square root of `ci_hour`. Predictions are therefore on the
# sqrt scale and have to be squared to get back to the original unit.
target = np.sqrt(x_train['ci_hour'])
# Build the feature table without mutating the DataFrame in place.
features = x_train.drop(columns=['ci_hour'])

# NOTE: from here on `x_train` / `y_train` are rebound from pandas objects to tensors,
# so this cell can only be run once per kernel session (re-run the loading cells first).
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
x_train = torch.tensor(features.to_numpy(), dtype=torch.float32)
# NOTE(review): y_train is 1-D, shape (N,). Confirm that Trainer/Baseline produce
# predictions of the same shape — nn.MSELoss would broadcast (N, 1) against (N,).
y_train = torch.tensor(target.to_numpy(), dtype=torch.float32)

# Hold out 20% of the rows for validation; the split itself is seeded.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state=SEED, test_size=0.2)

train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)

# A dedicated seeded generator makes the training shuffle order repeatable across runs.
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
# Validation needs no shuffling; a larger batch just speeds up evaluation.
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

# Modeling

In [9]:
# Seed torch's global RNG before the model is built so that weight initialisation
# (and any later draw from the global RNG) is repeatable across Restart & Run All.
torch.manual_seed(42)

# NOTE(review): input_dim=20 is hard-coded. Presumably it has to match the number of
# feature columns fed to the model — confirm against Baseline and the prepared data.
model = Baseline(input_dim=20).to(device)
# Adam with a learning rate of 1e-3, optimising mean squared error on the target.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [10]:
# Make sure the checkpoint directory exists before training starts, so saving the
# model cannot fail on a fresh checkout just because `checkpoints/` is missing.
checkpoint_path = 'checkpoints/best_model.pt'
Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)

# Train for at most 100 epochs with patience=7 (the recorded run ends with
# "Early Stopped"). NOTE(review): presumably the best weights are written to
# checkpoint_path and returned as best_model — confirm in RegressionTrainingTools.
trainer = Trainer(criterion, device, save_path=checkpoint_path)
best_model = trainer.train(model, optimizer, train_loader, val_loader, patience=7, epochs=100)

Epoch  0: 100%|██████████| 1375/1375 [00:05<00:00, 257.07it/s, Train Loss=38.5832, Valid Loss=36.2630, Valid R2=0.1251, Valid MAE=4.0709]
Epoch  1: 100%|██████████| 1375/1375 [00:04<00:00, 306.14it/s, Train Loss=37.1216, Valid Loss=35.9371, Valid R2=0.1329, Valid MAE=4.0314]
Epoch  2: 100%|██████████| 1375/1375 [00:04<00:00, 303.09it/s, Train Loss=36.7275, Valid Loss=36.1396, Valid R2=0.1281, Valid MAE=4.0749]
Epoch  3: 100%|██████████| 1375/1375 [00:04<00:00, 296.57it/s, Train Loss=36.4200, Valid Loss=35.4517, Valid R2=0.1447, Valid MAE=4.1045]
Epoch  4: 100%|██████████| 1375/1375 [00:04<00:00, 288.72it/s, Train Loss=36.1528, Valid Loss=35.2280, Valid R2=0.1501, Valid MAE=3.9327]
Epoch  5: 100%|██████████| 1375/1375 [00:04<00:00, 299.58it/s, Train Loss=35.8456, Valid Loss=34.8251, Valid R2=0.1598, Valid MAE=4.0287]
Epoch  6: 100%|██████████| 1375/1375 [00:04<00:00, 294.13it/s, Train Loss=35.5929, Valid Loss=35.1030, Valid R2=0.1531, Valid MAE=3.9791]
Epoch  7: 100%|██████████| 1375/13

Early Stopped





In [11]:
# NOTE(review): this evaluates on val_loader — the same hold-out split that was passed
# to trainer.train() as validation data — so the "Test ..." metrics printed below are
# validation metrics, not results on unseen data.
# No model is passed in; presumably the trainer evaluates the model it kept or saved
# during training — confirm in RegressionTrainingTools.
trainer.test(val_loader)

Test Loss: 32.4588 | Test R2: 0.2169 | Test MAE: 3.8401
