In [1]:
import numpy as np
import pandas as pd

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from EntityEmbeddingNet.train_module import EntityEmbNetModule
from EntityEmbeddingNet.data_module import EntityEmbNetDataModule, EntityEmbNetDataset

from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw training table from disk.
train = pd.read_csv('./data/train.csv')
# Remove the 'count' column from the features and keep its log1p transform as the target.
target = np.log1p(train.pop('count'))

In [3]:
def data_preprocessing(input_data: pd.DataFrame, is_train: bool) -> pd.DataFrame:
    """Build the feature frame from a raw table that has a ``datetime`` column.

    The ``datetime`` column is parsed and replaced by four integer columns
    appended at the end of the frame, in this order: ``year``, ``month``,
    ``time`` (hour of day) and ``weekday`` (Monday == 0).

    Args:
        input_data: Raw frame containing a ``datetime`` column. When
            ``is_train`` is true it must also contain ``casual`` and
            ``registered``.
        is_train: If true, the ``casual`` and ``registered`` columns are
            dropped. If false, the frame is used as given.

    Returns:
        A new DataFrame; ``input_data`` itself is left unmodified.

    Raises:
        KeyError: If a required column is missing.
    """
    if is_train:
        features = input_data.drop(columns=['casual', 'registered'])
    else:
        # The previous code read `self.test_data` here, which is undefined in a
        # plain function and raised NameError; the argument itself is used.
        features = input_data.copy()

    # Parse once for the whole column instead of element by element.
    timestamps = pd.to_datetime(features['datetime'])

    # Cast to int64 so the dtypes do not depend on the pandas version
    # (the .dt accessors return int32 on recent releases).
    features = features.assign(
        year=timestamps.dt.year.astype('int64'),
        month=timestamps.dt.month.astype('int64'),
        time=timestamps.dt.hour.astype('int64'),
        weekday=timestamps.dt.weekday.astype('int64'),
    )
    return features.drop(columns='datetime')

In [4]:
# Run the preprocessing in training mode and rebind `train` to the returned feature frame.
train = data_preprocessing(input_data=train, is_train=True)

In [5]:
# Continuous features; these are min-max scaled at the end of this cell.
num_vars = ['temp', 'atemp', 'humidity', 'windspeed']

# Every other column is treated as categorical: map column name -> number of distinct values.
cat_vars = {
    col: train[col].nunique(dropna=False)
    for col in train.columns
    if col not in num_vars
}

# Re-base each categorical column so its smallest code is 0.
# Subtracting the column's own minimum replaces the earlier
# "subtract 1 if min != 0, then subtract 2010 from year" rule. It gives the same
# codes for columns starting at 0, 1 or 2011, and re-running the cell is a no-op
# rather than pushing `year` negative.
for col in cat_vars:
    train[col] -= train[col].min()

# Scale the continuous columns to [0, 1].
# NOTE(review): the scaler is fitted on all rows, before the train/validation split
# done later in the notebook, so validation rows influence the scaling range.
# Confirm whether that is acceptable.
ct = ColumnTransformer(
    transformers=[
        ("minmax", MinMaxScaler(), num_vars)
    ]
)
train[num_vars] = ct.fit_transform(train)

In [6]:
# Display the categorical column -> distinct-value-count mapping built in the previous cell.
cat_vars

{'season': 4,
 'holiday': 2,
 'workingday': 2,
 'weather': 4,
 'year': 2,
 'month': 12,
 'time': 24,
 'weekday': 7}

In [6]:
# Print column dtypes and non-null counts of the preprocessed training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      10886 non-null  int64  
 1   holiday     10886 non-null  int64  
 2   workingday  10886 non-null  int64  
 3   weather     10886 non-null  int64  
 4   temp        10886 non-null  float64
 5   atemp       10886 non-null  float64
 6   humidity    10886 non-null  float64
 7   windspeed   10886 non-null  float64
 8   year        10886 non-null  int64  
 9   month       10886 non-null  int64  
 10  time        10886 non-null  int64  
 11  weekday     10886 non-null  int64  
dtypes: float64(4), int64(8)
memory usage: 1020.7 KB


In [7]:
# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.3,
    random_state=1993,
)

# Hand both splits and the feature descriptions to the data module.
data_module = EntityEmbNetDataModule(
    x_train, x_val, y_train, y_val,
    cat_vars, num_vars,
)

# The network is built from the same categorical/numeric feature descriptions.
network = EntityEmbNetModule(cat_vars, num_vars)

In [8]:
# TensorBoard logger writing under EntityEmbeddingNet/regression.
logger = TensorBoardLogger(
    save_dir="EntityEmbeddingNet",
    name="regression",
    default_hp_metric=False,
)

# Early stopping watches 'val_loss' (lower is better) with a minimum
# improvement of 1e-4 and a patience of 20.
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=20,
    verbose=True,
    mode='min',
)

# At most 200 epochs; early stopping may end training sooner.
trainer = Trainer(
    max_epochs=200,
    callbacks=[early_stop_callback],
    logger=logger,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [9]:
# Train `network` using the loaders supplied by `data_module`; the trainer above caps training at
# 200 epochs and stops early on 'val_loss'.
# NOTE(review): the recorded output shows each validation pass adding roughly 20 s and ends in a
# DataLoader worker-shutdown traceback; the worker settings presumably live in the data module. TODO confirm.
trainer.fit(network, datamodule=data_module)

Missing logger folder: EntityEmbeddingNet/regression

  | Name    | Type         | Params
-----------------------------------------
0 | model   | EntityEmbNet | 2.3 K 
1 | loss_fn | MSELoss      | 0     
-----------------------------------------
2.3 K     Trainable params
0         Non-trainable params
2.3 K     Total params
0.009     Total estimated model params size (MB)


Epoch 0:  82%|██████████████████████████████████████████████████████████████████████████████████████▏                  | 119/145 [00:03<00:00, 38.11it/s, loss=18.2, v_num=0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                     | 0/26 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                        | 0/26 [00:00<?, ?it/s][A
Epoch 0:  83%|██████████████████████████████████████████████████████████████████████████████████████▉                  | 120/145 [00:20<00:04,  5.75it/s, loss=18.2, v_num=0][A
Epoch 0:  83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 121/145 [00:20<00:04,  5.79it/s, loss=18.2, v_num=0][A
Epoch 0:  84%|██████████████████████████████████████████████████████████████████████

Metric val_loss improved. New best score: 4.295


Epoch 1:  82%|██████████████████████████████████████████████████████████████████████████████████████▏                  | 119/145 [00:23<00:05,  5.17it/s, loss=16.6, v_num=0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                     | 0/26 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                        | 0/26 [00:00<?, ?it/s][A
Epoch 1:  83%|██████████████████████████████████████████████████████████████████████████████████████▉                  | 120/145 [00:45<00:09,  2.61it/s, loss=16.6, v_num=0][A
Epoch 1:  83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 121/145 [00:45<00:09,  2.64it/s, loss=16.6, v_num=0][A
Epoch 1:  84%|██████████████████████████████████████████████████████████████████████

Metric val_loss improved by 0.113 >= min_delta = 0.0001. New best score: 4.182


Epoch 2:  82%|██████████████████████████████████████████████████████████████████████████████████████▏                  | 119/145 [00:23<00:05,  5.12it/s, loss=14.9, v_num=0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                     | 0/26 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                        | 0/26 [00:00<?, ?it/s][A
Epoch 2:  83%|██████████████████████████████████████████████████████████████████████████████████████▉                  | 120/145 [00:45<00:09,  2.62it/s, loss=14.9, v_num=0][A
Epoch 2:  83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 121/145 [00:45<00:09,  2.64it/s, loss=14.9, v_num=0][A
Epoch 2:  84%|██████████████████████████████████████████████████████████████████████

Metric val_loss improved by 0.093 >= min_delta = 0.0001. New best score: 4.089


Epoch 3:  82%|██████████████████████████████████████████████████████████████████████████████████████▏                  | 119/145 [00:23<00:05,  5.11it/s, loss=13.9, v_num=0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                     | 0/26 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                        | 0/26 [00:00<?, ?it/s][A
Epoch 3:  83%|██████████████████████████████████████████████████████████████████████████████████████▉                  | 120/145 [00:45<00:09,  2.61it/s, loss=13.9, v_num=0][A
Epoch 3:  83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 121/145 [00:45<00:09,  2.63it/s, loss=13.9, v_num=0][A
Epoch 3:  84%|██████████████████████████████████████████████████████████████████████

Metric val_loss improved by 0.103 >= min_delta = 0.0001. New best score: 3.986


Epoch 4:  82%|██████████████████████████████████████████████████████████████████████████████████████▏                  | 119/145 [00:23<00:05,  5.12it/s, loss=12.7, v_num=0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                     | 0/26 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                        | 0/26 [00:00<?, ?it/s][A
Epoch 4:  83%|██████████████████████████████████████████████████████████████████████████████████████▉                  | 120/145 [00:46<00:09,  2.61it/s, loss=12.7, v_num=0][A
Epoch 4:  83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 121/145 [00:46<00:09,  2.63it/s, loss=12.7, v_num=0][A
Epoch 4:  84%|██████████████████████████████████████████████████████████████████████

Metric val_loss improved by 0.094 >= min_delta = 0.0001. New best score: 3.892


Epoch 5:  82%|██████████████████████████████████████████████████████████████████████████████████████▏                  | 119/145 [00:23<00:05,  5.10it/s, loss=11.9, v_num=0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                                                                                     | 0/26 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                                                                                        | 0/26 [00:00<?, ?it/s][A
Epoch 5:  83%|██████████████████████████████████████████████████████████████████████████████████████▉                  | 120/145 [00:46<00:09,  2.60it/s, loss=11.9, v_num=0][A
Epoch 5:  83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 121/145 [00:46<00:09,  2.62it/s, loss=11.9, v_num=0][A
Epoch 5:  84%|██████████████████████████████████████████████████████████████████████

Metric val_loss improved by 0.104 >= min_delta = 0.0001. New best score: 3.787


Epoch 6:   0%|                                                                                                                   | 0/145 [00:00<?, ?it/s, loss=11.9, v_num=0]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x11b500040>
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/Kaggle.BikeSharingDemand/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()
  File "/opt/homebrew/Caskroom/miniconda/base/envs/Kaggle.BikeSharingDemand/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1445, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/opt/homebrew/Caskroom/miniconda/base/envs/Kaggle.BikeSharingDemand/lib/python3.9/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/opt/homebrew/Caskroom/miniconda/base/envs/Kaggle.BikeSharingDemand/lib/python3.9/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/Kaggle.BikeSharingDemand/lib/python3.9/multiprocessing/connecti