In [3]:
import os

import numpy as np
import pandas as pd
import seaborn as sb
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import LabelEncoder
# Show the CUDA version string reported by this PyTorch build.
# NOTE(review): the recorded output is None, which presumably means a CPU-only
# build — confirm before expecting GPU training.
print(torch.version.cuda)

None


In [8]:
# Read the merged dataset from its CSV file; the frame is left as the last
# expression so the notebook renders it as this cell's output.
csv_path = 'data/merged_data.csv'
merged_data = pd.read_csv(csv_path)
merged_data

Unnamed: 0,Year,Month,town,flat_type,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price_adjusted,average_storey,Set
0,2000,2,18,4,138.0,8,1996,74.655628,763180.858672,5.0,1
1,2000,2,18,4,125.0,17,1977,74.655628,681929.664593,5.0,1
2,2000,2,18,4,117.0,17,1978,74.655628,631147.668294,2.0,0
3,2000,2,18,4,117.0,17,1978,74.655628,681929.664593,14.0,1
4,2000,2,18,4,123.0,17,1976,74.655628,631147.668294,20.0,1
...,...,...,...,...,...,...,...,...,...,...,...
628402,2024,1,11,3,90.0,8,2003,78.416667,495000.000000,2.0,1
628403,2024,1,11,3,94.0,8,2019,94.916667,630000.000000,8.0,1
628404,2024,1,11,3,93.0,8,2018,93.833333,528000.000000,2.0,2
628405,2024,1,11,3,93.0,8,2019,94.666667,630000.000000,8.0,1


In [9]:

# Work on a copy so the in-place edits made by later cells never alter the
# frame that was loaded from disk (keeps this cell safe to re-run).
train = merged_data.copy()
target = 'resale_price_adjusted'

# Fixed seed so a freshly generated split is identical on every run.
SPLIT_SEED = 0

# If the 'Set' column does not exist, create it to randomly assign rows to the
# train, valid, and test sets with probabilities 0.8 / 0.1 / 0.1.
if "Set" not in train.columns:
    split_rng = np.random.default_rng(SPLIT_SEED)
    train["Set"] = split_rng.choice(
        ["train", "valid", "test"], p=[.8, .1, .1], size=(train.shape[0],)
    )

# The membership tests below compare against string labels. If 'Set' holds
# integer codes instead (e.g. the CSV was written after label-encoding), those
# comparisons would silently select zero rows, so decode the codes first.
# NOTE(review): decoding assumes LabelEncoder's sorted class order
# (0="test", 1="train", 2="valid") — confirm against whatever produced the CSV.
set_labels = train["Set"]
if pd.api.types.is_numeric_dtype(set_labels):
    code_to_label = dict(enumerate(sorted(["train", "valid", "test"])))
    set_labels = set_labels.map(code_to_label)

# Get the index labels of the rows belonging to each set.
train_indices = train.index[(set_labels == "train").to_numpy()]
valid_indices = train.index[(set_labels == "valid").to_numpy()]
test_indices = train.index[(set_labels == "test").to_numpy()]

# Fail here with a clear message instead of deep inside model fitting.
if len(train_indices) == 0:
    raise ValueError(
        "No rows were assigned to the 'train' split; check the values of the 'Set' column."
    )

In [10]:
# Names of the label-encoded columns and, per column, how many distinct
# classes the encoder found.
categorical_columns = []
categorical_dims = {}

# --- Categorical (object-dtype) columns ------------------------------------
object_columns = train.columns[train.dtypes == object]
for name in object_columns:
    print(name, train[name].nunique())
    # Missing entries become the sentinel category 'VV_likely' before encoding.
    filled = train[name].fillna("VV_likely")
    encoder = LabelEncoder()
    train[name] = encoder.fit_transform(filled.values)
    categorical_columns.append(name)
    categorical_dims[name] = len(encoder.classes_)

# --- Numerical (float64 / int64) columns -----------------------------------
# Means are taken over the training rows only, so the imputed values carry no
# information from the validation or test rows.
numeric_mask = (train.dtypes == 'float64') | (train.dtypes == 'int64')
train_means = {
    name: train.loc[train_indices, name].mean()
    for name in train.columns[numeric_mask]
}
train.fillna(train_means, inplace=True)

# 'remaining_lease' is imputed explicitly with its training-set mean as well.
# When the column is float64/int64 the step above has already filled it, so
# this only has an effect for other dtypes.
if 'remaining_lease' in train.columns:
    lease_train_mean = train.loc[train_indices, 'remaining_lease'].mean()
    train.fillna({'remaining_lease': lease_train_mean}, inplace=True)

train


Unnamed: 0,Year,Month,town,flat_type,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price_adjusted,average_storey,Set
0,2000,2,18,4,138.0,8,1996,74.655628,763180.858672,5.0,1
1,2000,2,18,4,125.0,17,1977,74.655628,681929.664593,5.0,1
2,2000,2,18,4,117.0,17,1978,74.655628,631147.668294,2.0,0
3,2000,2,18,4,117.0,17,1978,74.655628,681929.664593,14.0,1
4,2000,2,18,4,123.0,17,1976,74.655628,631147.668294,20.0,1
...,...,...,...,...,...,...,...,...,...,...,...
628402,2024,1,11,3,90.0,8,2003,78.416667,495000.000000,2.0,1
628403,2024,1,11,3,94.0,8,2019,94.916667,630000.000000,8.0,1
628404,2024,1,11,3,93.0,8,2018,93.833333,528000.000000,2.0,2
628405,2024,1,11,3,93.0,8,2019,94.666667,630000.000000,8.0,1


In [11]:
# Print the column dtypes and non-null counts of the preprocessed frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 628407 entries, 0 to 628406
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Year                   628407 non-null  int64  
 1   Month                  628407 non-null  int64  
 2   town                   628407 non-null  int64  
 3   flat_type              628407 non-null  int64  
 4   floor_area_sqm         628407 non-null  float64
 5   flat_model             628407 non-null  int64  
 6   lease_commence_date    628407 non-null  int64  
 7   remaining_lease        628407 non-null  float64
 8   resale_price_adjusted  628407 non-null  float64
 9   average_storey         628407 non-null  float64
 10  Set                    628407 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 52.7 MB


# Define categorical features for categorical embeddings

In [None]:
# Columns that are never fed to the model as inputs.
unused_feat = ['Set']

# Model inputs: every column except the unused ones and the target.
excluded_columns = set(unused_feat + [target])
features = [col for col in train.columns if col not in excluded_columns]

# (position, name) of each model input that was label-encoded as categorical.
categorical_positions = [
    (position, name)
    for position, name in enumerate(features)
    if name in categorical_columns
]

cat_idxs = [position for position, _ in categorical_positions]

cat_dims = [categorical_dims[name] for _, name in categorical_positions]

# Embedding width per categorical feature, derived from its cardinality with
# the heuristic min(50, (category_count + 1) // 2).
cat_emb_dim = [min(50, (dim + 1) // 2) for dim in cat_dims]


# Network parameters

In [None]:
# Build the regressor, telling it which input positions are categorical, how
# many classes each one has, and how wide each embedding should be.
clf = TabNetRegressor(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dim,
)



# Training

In [None]:
# Build the model inputs (X_*) and single-column targets (y_*) for each split.
#
# Rows are selected by index label with .loc. The split indices are index
# labels, so using them as positions into `.values` is only correct while the
# frame has a default 0..n-1 index; .loc stays correct if the index changes,
# and it avoids materialising the full feature array once per split.
X_train = train.loc[train_indices, features].to_numpy()
y_train = train.loc[train_indices, target].to_numpy().reshape(-1, 1)

X_valid = train.loc[valid_indices, features].to_numpy()
y_valid = train.loc[valid_indices, target].to_numpy().reshape(-1, 1)

X_test = train.loc[test_indices, features].to_numpy()
y_test = train.loc[test_indices, target].to_numpy().reshape(-1, 1)

In [None]:
# Epoch budget: 100 epochs normally, cut to a 2-epoch smoke run whenever the
# CI environment variable is set to any non-empty value.
# Needs `os`, which must be imported in the notebook's import cell.
max_epochs = 2 if os.getenv("CI", False) else 100

In [None]:
# Augmentation object handed to clf.fit() through its `augmentations` argument.
# NOTE(review): p=0.2 is presumably the probability of applying the
# augmentation — confirm in the pytorch_tabnet documentation.
from pytorch_tabnet.augmentations import RegressionSMOTE
aug = RegressionSMOTE(p=0.2)

In [None]:
# Datasets scored after each epoch, the names they are reported under, and the
# metrics computed on each of them.
# NOTE(review): pytorch_tabnet presumably early-stops on the last metric of the
# last evaluation set — confirm in its documentation.
eval_sets = [(X_train, y_train), (X_valid, y_valid)]
eval_names = ['train', 'valid']
eval_metrics = ['rmsle', 'mae', 'rmse', 'mse']

clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=eval_sets,
    eval_name=eval_names,
    eval_metric=eval_metrics,
    max_epochs=max_epochs,
    patience=50,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=5,
    drop_last=False,
    augmentations=aug,
)

epoch 0  | loss: 202848720701.6719| train_rmsle: 4.46145 | train_mae: 418039.24635| train_rmse: 445566.64813| train_mse: 198529637924.56708| valid_rmsle: 4.46859 | valid_mae: 418445.8496| valid_rmse: 446314.12438| valid_mse: 199196297625.12225|  0:01:51s
epoch 1  | loss: 194968676052.14584| train_rmsle: 5.89317 | train_mae: 412645.96506| train_rmse: 439306.53051| train_mse: 192990227745.8736| valid_rmsle: 5.91356 | valid_mae: 413050.24672| valid_rmse: 440043.68394| valid_mse: 193638443775.4849|  0:03:41s
epoch 2  | loss: 182340561140.1829| train_rmsle: 5.05658 | train_mae: 396580.79377| train_rmse: 420984.64837| train_mse: 177228074163.43314| valid_rmsle: 5.06592 | valid_mae: 396980.2576| valid_rmse: 421695.36351| valid_mse: 177826979604.91562|  0:05:32s
epoch 3  | loss: 166711522272.3561| train_rmsle: 4.97722 | train_mae: 383589.73349| train_rmse: 406856.65159| train_mse: 165532334941.91928| valid_rmsle: 5.00194 | valid_mae: 384008.07361| valid_rmse: 407558.25954| valid_mse: 166103734

KeyboardInterrupt: 

# Global explainability: feature importances (they sum to 1)

In [None]:
# Display the fitted model's global feature-importance array.
# NOTE(review): entries are presumably ordered like `features` — confirm.
clf.feature_importances_

array([9.77791451e-02, 6.13724158e-03, 1.26214851e-02, 1.62031754e-02,
       1.11614628e-01, 4.39594988e-01, 7.00270865e-03, 2.21551328e-02,
       2.42098249e-02, 4.74765422e-02, 7.87972700e-02, 6.09876806e-02,
       7.53701552e-02, 5.00226303e-05])