In [1]:
import os
import sys
import pandas as pd
import torch
from run import simple_classification, train_lstm, predict_lstm, create_submission, unique_values
from preprocessing import TargetPreprocessor
from sklearn.model_selection import train_test_split

base_path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification"
data_path = r"\data\building-instinct-starter-notebook\Starter notebook"
sys.path.append(base_path+data_path)
# Raw string: "\k" is not a valid escape sequence and triggers a SyntaxWarning on recent Python versions.
sys.path.append(base_path + r"\kai")
df_features = pd.read_parquet(base_path + '/preprocessed_data/standard_data.parquet', engine='pyarrow')
df_features.sort_index(inplace=True)

load_filepath_labels = os.path.join(base_path + data_path,'building-instinct-train-label', 'train_label.parquet')#path to the train label file
df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
# Sort the labels the same way as the features so that rows pair up positionally
# once the frames are filtered with isin() and converted to arrays/tensors.
df_targets.sort_index(inplace=True)

# RandomForest classifier
X, y = df_features, df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})
df_test = pd.read_parquet(base_path + '/preprocessed_data/data_test.parquet', engine='pyarrow')
df_test.sort_index(inplace=True)

y_pred = simple_classification(X, y, df_test, print_performance=True)
# The predicted stock type (0 = residential, 1 = commercial) routes test rows to the matching LSTM later on.
df_test["building_stock_type"] = y_pred

# The 2 LSTM models
# 1. Preprocessing
df_targets_res = df_targets[df_targets.building_stock_type == "residential"].filter(like='_res').copy()
df_targets_com = df_targets[df_targets.building_stock_type == "commercial"].filter(like='_com').copy()
target_preprocessor = TargetPreprocessor()
df_targets_res, association_dict_res, encoder_res = target_preprocessor.preprocess_res(df_targets_res)
df_targets_com, association_dict_com, encoder_com = target_preprocessor.preprocess_com(df_targets_com)
unique_values_res = unique_values(df_targets_res)
unique_values_com = unique_values(df_targets_com)


def _split_to_tensors(features, targets):
    """Split feature/target frames 80/20 (seed 42) and convert each part to a float32 tensor.

    Returns:
        tuple: (X_train, X_val, y_train, y_val), in that order.
    """
    parts = train_test_split(features, targets, test_size=0.2, random_state=42)
    return tuple(torch.tensor(part.values, dtype=torch.float32) for part in parts)


# Commercial buildings: keep only the ids present in both features and targets.
common_indices = df_features.index.intersection(df_targets_com.index)
X_com = df_features[df_features.index.isin(common_indices)]
y_com = df_targets_com[df_targets_com.index.isin(common_indices)]
X_train_com, X_val_com, y_train_com, y_val_com = _split_to_tensors(X_com, y_com)

# Residential buildings: same filtering, expressed with mutual isin() checks.
X_res = df_features[df_features.index.isin(df_targets_res.index)]
y_res = df_targets_res[df_targets_res.index.isin(df_features.index)]
X_train_res, X_val_res, y_train_res, y_val_res = _split_to_tensors(X_res, y_res)

0.992323346090601
[[765   2]
 [  9 664]]


In [5]:
# 2. Training
# Settings handed to train_lstm for the commercial model. The keyword form
# produces exactly the same string keys as the literal-dict spelling.
parameters = dict(
    batch_size=16,
    hidden_size=32,
    num_epochs=10,
    weight_numerical=10e-4,  # i.e. 0.001
    weight_categorical=1.0,
    association_dict=association_dict_com,
    unique_values=unique_values_com,
    device="cuda:0",
    lr=0.01,
    num_classes_categorical=y_com.shape[1],
)
# train_lstm returns the trained model together with a filename (presumably the checkpoint path).
model_com, filename_com = train_lstm(X_com, y_com, parameters)

Epoch [1/10], Total Loss: 2058.0098, Numerical Loss: 2052.6116, Categorical Loss: 5.3981, Combined Loss: 7.4507, weights: 0.0010, 1.0000
Epoch [2/10], Total Loss: 1963.4072, Numerical Loss: 1956.9991, Categorical Loss: 6.4080, Combined Loss: 8.3650, weights: 0.0010, 1.0000
Epoch [3/10], Total Loss: 2829.2285, Numerical Loss: 2822.4009, Categorical Loss: 6.8275, Combined Loss: 9.6499, weights: 0.0010, 1.0000
Epoch [4/10], Total Loss: 3277.5774, Numerical Loss: 3272.9563, Categorical Loss: 4.6212, Combined Loss: 7.8942, weights: 0.0010, 1.0000
Epoch [5/10], Total Loss: 2792.4116, Numerical Loss: 2786.3699, Categorical Loss: 6.0418, Combined Loss: 8.8282, weights: 0.0010, 1.0000
Epoch [6/10], Total Loss: 2913.7444, Numerical Loss: 2909.1914, Categorical Loss: 4.5529, Combined Loss: 7.4621, weights: 0.0010, 1.0000
Epoch [7/10], Total Loss: 2467.3235, Numerical Loss: 2459.0903, Categorical Loss: 8.2333, Combined Loss: 10.6923, weights: 0.0010, 1.0000
Epoch [8/10], Total Loss: 2952.6052, Num

In [7]:
# Settings handed to train_lstm for the residential model.
parameters_res = {
    "batch_size" : 16,
    "hidden_size" : 32,
    "num_epochs" : 10,
    "weight_numerical" : 10e-5,  # i.e. 0.0001
    "weight_categorical" : 1.0,
    "association_dict": association_dict_res,
    "unique_values": unique_values_res,
    "device": "cuda",
    "lr": 0.01,
    "num_classes_categorical":y_res.shape[1],}
# Pass the residential settings: the previous call handed over `parameters`,
# i.e. the commercial association dict / unique values / loss weight.
model_res, filename_res = train_lstm(X_res, y_res, parameters_res)

Epoch [1/10], Total Loss: 57729.2969, Numerical Loss: 57722.6641, Categorical Loss: 6.6339, Combined Loss: 12.4061, weights: 0.0001, 1.0000
Epoch [2/10], Total Loss: 98415.9531, Numerical Loss: 98408.3438, Categorical Loss: 7.6079, Combined Loss: 17.4487, weights: 0.0001, 1.0000
Epoch [3/10], Total Loss: 76382.7188, Numerical Loss: 76375.7656, Categorical Loss: 6.9552, Combined Loss: 14.5928, weights: 0.0001, 1.0000
Epoch [4/10], Total Loss: 104351.8984, Numerical Loss: 104345.6016, Categorical Loss: 6.3005, Combined Loss: 16.7350, weights: 0.0001, 1.0000
Epoch [5/10], Total Loss: 84287.1484, Numerical Loss: 84280.3516, Categorical Loss: 6.7943, Combined Loss: 15.2223, weights: 0.0001, 1.0000
Epoch [6/10], Total Loss: 96448.3359, Numerical Loss: 96441.8359, Categorical Loss: 6.5008, Combined Loss: 16.1450, weights: 0.0001, 1.0000
Epoch [7/10], Total Loss: 105557.9766, Numerical Loss: 105551.2188, Categorical Loss: 6.7565, Combined Loss: 17.3116, weights: 0.0001, 1.0000
Epoch [8/10], To

In [2]:
# 3. Predictions
# Commercial model settings (same keys as used for training).
parameters_com = dict(
    batch_size=16,
    hidden_size=32,
    num_epochs=10,
    weight_numerical=10e-4,
    weight_categorical=1.0,
    association_dict=association_dict_com,
    unique_values=unique_values_com,
    device="cuda:0",
    lr=0.01,
    num_classes_categorical=y_com.shape[1],
)
# Test rows the classifier assigned to class 1 (commercial).
df_test_com = df_test.loc[df_test["building_stock_type"] == 1]
arr_com = predict_lstm(
    df_test_com,
    parameters_com,
    encoder_com,
    filename=r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification\kai\checkpoints\com_model_checkpoint_08_15_00_06.pth.tar",
)

# Residential model settings.
parameters_res = dict(
    batch_size=16,
    hidden_size=32,
    num_epochs=10,
    weight_numerical=10e-5,
    weight_categorical=1.0,
    association_dict=association_dict_res,
    unique_values=unique_values_res,
    device="cuda",
    lr=0.01,
    num_classes_categorical=y_res.shape[1],
)
# Test rows the classifier assigned to class 0 (residential).
df_test_res = df_test.loc[df_test["building_stock_type"] == 0]
arr_res = predict_lstm(
    df_test_res,
    parameters_res,
    encoder_res,
    filename=r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification\kai\checkpoints\res_model_checkpoint_08_15_00_15.pth.tar",
)

# 4. Submission
df = create_submission(arr_com, arr_res, df_test, save_filepath="submission.parquet")
# Sanity check: list the distinct predicted values of every submission column.
for col in df:
    print(col, df[col].unique())

  return t.to(
  checkpoint = torch.load(checkpoint_filename)


Allocated memory: 0.00 GB
Cached memory: 0.00 GB
Total memory: 12.00 GB
Unused memory: 12.00 GB


100%|██████████| 41/41 [00:03<00:00, 10.72it/s]
  checkpoint = torch.load(checkpoint_filename)


Allocated memory: 0.01 GB
Cached memory: 0.02 GB
Total memory: 12.00 GB
Unused memory: 11.99 GB


100%|██████████| 50/50 [00:04<00:00, 11.81it/s]


building_stock_type ['residential' 'commercial']
in.comstock_building_type_group_com ['nan' 'Mercantile' 'Office']
in.heating_fuel_com ['nan' 'NaturalGas']
in.hvac_category_com ['nan' 'Small Packaged Unit' 'Multizone CAV/VAV']
in.number_of_stories_com ['nan' '2']
in.ownership_type_com ['nan' 'owner_occupied' 'leased']
in.vintage_com ['nan' 'Before 1946']
in.wall_construction_type_com ['nan' 'WoodFramed']
in.tstat_clg_sp_f..f_com ['nan' '73' '72' '71' '74' '75' '70' '76' '69']
in.tstat_htg_sp_f..f_com ['nan' '69' '68' '67' '66' '70' '65' '64' '61' '63']
in.weekday_opening_time..hr_com ['nan' '10.25' '10.0' '10.5' '9.75' '9.5' '9.0' '9.25']
in.weekday_operating_hours..hr_com ['nan' '10.5' '10.25' '11.0' '9.75' '9.25' '9.0' '10.0' '10.75' '9.5'
 '8.75' '8.5']
in.bedrooms_res ['2' '3' 'nan' '4']
in.cooling_setpoint_res ['62F' '72F' 'nan' '70F' '60F' '68F' '67F' '65F' '76F' '75F']
in.heating_setpoint_res ['60F' '68F' 'nan' '67F' '55F' '70F' '65F' '72F' '62F' '75F']
in.geometry_building_type

# old code

In [1]:
import torch
from utils import print_gpu_memory
print_gpu_memory()
import os
import sys
base_path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification"
data_path = r"\data\building-instinct-starter-notebook\Starter notebook"
# Make the starter-notebook helpers and the project's own modules importable.
sys.path.append(base_path+data_path)
# Raw string: "\k" is not a valid escape sequence and triggers a SyntaxWarning on recent Python versions.
sys.path.append(base_path + r"\kai")
import pandas as pd
from tqdm import tqdm
from preprocessing import Preprocessor
df_features = pd.read_parquet(base_path + '/preprocessed_data/standard_data.parquet', engine='pyarrow')
df_features.sort_index(inplace=True)

load_filepath_labels = os.path.join(base_path + data_path,'building-instinct-train-label', 'train_label.parquet')#path to the train label file
df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
# Features and labels are both index-sorted so that later isin() filters keep their rows in matching order.
df_targets.sort_index(inplace=True)

Allocated memory: 0.00 GB
Cached memory: 0.00 GB
Total memory: 12.00 GB
Unused memory: 12.00 GB


## classifier

In [2]:
from run import simple_classification

# Binary target for the stock-type classifier: residential -> 0, commercial -> 1.
X = df_features
y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})

# Load the preprocessed test features, ordered by index.
df_test = pd.read_parquet(
    base_path + '/preprocessed_data/data_test.parquet', engine='pyarrow'
).sort_index()

# Store the predicted stock type next to the test features.
y_pred = simple_classification(X, y, df_test, print_performance=True)
df_test["building_stock_type"] = y_pred

0.992323346090601
[[765   2]
 [  9 664]]


## Preprocessing for commercial and residential predictions

In [3]:
from preprocessing import TargetPreprocessor

# Per stock type, keep only that type's rows and only the label columns carrying its suffix.
df_targets_res = (
    df_targets.loc[df_targets["building_stock_type"] == "residential"]
    .filter(like='_res')
    .copy()
)
df_targets_com = (
    df_targets.loc[df_targets["building_stock_type"] == "commercial"]
    .filter(like='_com')
    .copy()
)

# Each preprocess_* call returns the processed targets, an association dict and an encoder.
target_preprocessor = TargetPreprocessor()
df_targets_res, association_dict_res, encoder_res = target_preprocessor.preprocess_res(df_targets_res)
df_targets_com, association_dict_com, encoder_com = target_preprocessor.preprocess_com(df_targets_com)

In [4]:
def unique_values(df):
    """Return the distinct values of every column that has more than two of them.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        dict mapping column name -> array of that column's unique values, for
        the columns where ``nunique() > 2`` (other columns are left out).
    """
    return {
        name: df[name].unique()
        for name in df.columns
        if df[name].nunique() > 2
    }
# Distinct values of each preprocessed target column that has more than two of them, per stock type.
unique_values_res = unique_values(df_targets_res)
unique_values_com = unique_values(df_targets_com)

In [5]:
# Commercial: restrict features and targets to the ids they share, then convert to float32 tensors.
common_indices = df_features.index.intersection(df_targets_com.index)
X_com = torch.tensor(
    df_features.loc[df_features.index.isin(common_indices)].values,
    dtype=torch.float32,
)
y_com = torch.tensor(
    df_targets_com.loc[df_targets_com.index.isin(common_indices)].values,
    dtype=torch.float32,
)

# Residential: same restriction, written as mutual isin() checks.
X_res = torch.tensor(
    df_features.loc[df_features.index.isin(df_targets_res.index)].values,
    dtype=torch.float32,
)
y_res = torch.tensor(
    df_targets_res.loc[df_targets_res.index.isin(df_features.index)].values,
    dtype=torch.float32,
)

## com model

In [6]:
from run import train_lstm
# Settings handed to train_lstm for the commercial model.
parameters = {
    "batch_size" : 16,
    "hidden_size" : 82,
    "num_epochs" : 25,
    "weight_numerical" : 10e-4,
    "weight_categorical" : 1.0,
    "association_dict": association_dict_com,
    "unique_values": unique_values_com,
    "device": "cuda",
    "lr": 0.1,
    # Key spelled like in every other parameter dict of this notebook (was "num_classes_categorial").
    "num_classes_categorical":y_com.shape[1],}

# train_lstm returns (model, filename) in the other training cells, so unpack both here as well.
model_com, filename_com = train_lstm(X_com, y_com, parameters)

Epoch [1/25], Total Loss: 2989.7385, Numerical Loss: 2981.7322, Categorical Loss: 8.0063, Combined Loss: 10.9880, weights: 0.0010, 1.0000


KeyboardInterrupt: 

In [None]:
from run import predict_lstm
# Test rows the classifier assigned to class 1 (commercial).
df_test_com = df_test.loc[df_test["building_stock_type"] == 1]

# Feature tensor = every column except the predicted stock type.
# NOTE(review): Index.difference may return the columns in sorted order - confirm this matches the training column order.
X_test_com = torch.tensor(
    df_test_com[df_test_com.columns.difference(["building_stock_type"])].values,
    dtype=torch.float32,
)
y_pred_com = df_test_com["building_stock_type"]

# NOTE(review): this checkpoint path has no base_path prefix, unlike the other cells - confirm predict_lstm resolves it.
arr_com = predict_lstm(X_test_com, parameters, encoder_com, filename='/kai/checkpoints/com_model_checkpoint_08_13_23_14.pth.tar')

# Show the distinct predicted values per output column.
for col in arr_com:
    print(arr_com[col].unique())

In [7]:
# from torch.utils.data import DataLoader, Dataset
# from models import MultiTaskLSTM, CustomLoss, TimeSeriesDataset
# from datetime import datetime

# import os
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# torch.cuda.empty_cache()

# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# batch_size, input_size, sequence_length, hidden_size, num_classes_categorical = 16, 1, X_com.shape[1], 64, y_com.shape[1]
# # checkpoint_filename = base_path+ f'/kai/checkpoints/com_model_checkpoint_08_06_14_49.pth.tar'
# checkpoint_filename = base_path+ f'/kai/checkpoints/com_model_checkpoint_08_13_23_14.pth.tar'
# dataloader_com = DataLoader(TimeSeriesDataset(X_com, y_com), batch_size=batch_size, shuffle=True)
# model = MultiTaskLSTM(input_size, hidden_size, num_classes_categorical)
# model = model.to(device)

# checkpoint = torch.load(checkpoint_filename)
# model.load_state_dict(checkpoint['model_state_dict'])
# for X_batch, y_categorical_batch in dataloader_com:
#     for i in [0, 1]:
#         _X_batch = X_batch[:, :].view(X_batch.shape[0], sequence_length, input_size)
#         _X_batch = _X_batch.to(device)
#         categorical_pred = model.predict(_X_batch, association_dict_com)
#         print(categorical_pred)
#         print(y_categorical_batch[i, :])
#         break
#     break

  checkpoint = torch.load(checkpoint_filename)


tensor([[1.5293e+00, 1.9764e+03, 7.2813e+01, 6.8851e+01, 7.6482e+00, 1.0340e+01,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.5634e+00, 1.9740e+03, 7.2779e+01, 6.8623e+01, 7.6589e+00, 9.7448e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.5879e+00, 1.9668e+03, 7.2692e+01, 6.8938e+01, 7.4934e+00, 1.0317e+01,
         0.0000e+00, 0.0000e+00, 0.0000e+

### create the original com df

In [8]:
# from postprocessing import inverse_process
# com_arr_df = inverse_process(categorical_pred, encoder_com) # todo: standardize the target numerical columns in preprocessing
# display(com_arr_df)

Unnamed: 0,in.number_of_stories_com,in.vintage_com,in.tstat_clg_sp_f..f_com,in.tstat_htg_sp_f..f_com,in.weekday_opening_time..hr_com,in.weekday_operating_hours..hr_com,in.comstock_building_type_group_com,in.heating_fuel_com,in.hvac_category_com,in.ownership_type_com,in.wall_construction_type_com
0,2,1970 to 1979,73,69,7.75,10.25,Mercantile,NaturalGas,Small Packaged Unit,owner_occupied,Mass
1,2,1970 to 1979,73,69,7.75,9.75,Mercantile,NaturalGas,Small Packaged Unit,owner_occupied,Mass
2,2,1960 to 1969,73,69,7.5,10.25,Mercantile,NaturalGas,Multizone CAV/VAV,owner_occupied,Mass
3,2,1970 to 1979,73,69,7.75,10.0,Mercantile,NaturalGas,Small Packaged Unit,owner_occupied,Mass
4,2,1970 to 1979,73,69,7.75,9.5,Office,NaturalGas,Small Packaged Unit,owner_occupied,Mass
5,2,1970 to 1979,73,69,7.75,9.75,Office,NaturalGas,Small Packaged Unit,owner_occupied,Mass
6,2,1960 to 1969,73,69,7.5,10.25,Mercantile,NaturalGas,Multizone CAV/VAV,owner_occupied,Mass
7,2,1970 to 1979,73,69,7.5,10.5,Mercantile,NaturalGas,Small Packaged Unit,owner_occupied,Mass
8,2,1960 to 1969,73,69,7.5,9.5,Office,NaturalGas,Small Packaged Unit,owner_occupied,Mass
9,2,1970 to 1979,73,69,7.75,9.5,Office,NaturalGas,Small Packaged Unit,owner_occupied,Mass


In [12]:
# # file_path = base_path + data_path + r"\building-instinct-test-data"
# # df_features = Preprocessor.load_standard_df(file_path)
# # df_features.sort_index(inplace=True)
# # df_features.to_parquet(base_path + '/preprocessed_data/data_test.parquet', engine='pyarrow')

# # df_test = pd.read_parquet(base_path + '/preprocessed_data/data_test.parquet', engine='pyarrow')
# # df_test.sort_index(inplace=True)

# # TODO: clf that predicts com or res
# # then filter by com & res
# df_test_com = df_test[df_test.building_stock_type == 1]
# # df_test = df_test[df_test.index.isin(df_features.index)]
# # X_test_com = torch.tensor(df_test.values, dtype=torch.float32)
# # X_test_com = X_test_com.to(device)
# print(df_test_com[df_test_com.columns.difference(["building_stock_type"])].shape)
# X_test_com, y_pred_com = torch.tensor(df_test_com[df_test_com.columns.difference(["building_stock_type"])].values, dtype=torch.float32), df_test_com["building_stock_type"]

(654, 35040)


In [16]:
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# def free_gpu_memory():
#     torch.cuda.empty_cache()
#     torch.cuda.synchronize()
# free_gpu_memory()
# model = MultiTaskLSTM(input_size, hidden_size, num_classes_categorical)
# model = model.to(device)
# # criterion = CustomLoss(association_dict_com)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# # checkpoint_filename = base_path+ f'/kai/checkpoints/com_model_checkpoint_08_06_14_49.pth.tar'
# checkpoint_filename = base_path+ f'/kai/checkpoints/com_model_checkpoint_08_13_23_14.pth.tar'
# checkpoint = torch.load(checkpoint_filename)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# best_loss = checkpoint['loss']

# predictions = []
# dataloader = DataLoader(TimeSeriesDataset(X_test_com, torch.zeros(X_test_com.shape[0], y_com.shape[1])), batch_size=16, shuffle=False)

# print_gpu_memory()
# for X_batch, y_categorical_batch in tqdm(dataloader):
#     model.eval()
#     _X_batch = X_batch.view(X_batch.shape[0], sequence_length, input_size)
#     _X_batch = _X_batch.to(device)
#     with torch.no_grad():
#         categorical_pred = model.predict(_X_batch, association_dict_com)
#     predictions.append(categorical_pred.cpu())
#     del _X_batch, categorical_pred
#     free_gpu_memory()
# predictions = torch.cat(predictions, dim=0)
# arr_df_com = inverse_process(predictions, encoder_com)
# arr_df_com.index = df_test_com.index

  checkpoint = torch.load(checkpoint_filename)


Allocated memory: 0.02 GB
Cached memory: 0.03 GB
Total memory: 12.00 GB
Unused memory: 11.98 GB


100%|██████████| 41/41 [00:11<00:00,  3.61it/s]


In [17]:
# for col in arr_df_com.columns:
#     print(arr_df_com[col].unique())

[2]
['1970 to 1979' '1960 to 1969']
[73]
[69 68]
[7.5 7.75]
[10.25 10.5 10.0 9.75 9.5]
['Mercantile' 'Office']
['NaturalGas']
['Small Packaged Unit' 'Multizone CAV/VAV']
['owner_occupied']
['Mass']


## res

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# Define custom loss function
class CustomLoss_res(nn.Module):
    """Two-part loss: a closest-valid-label penalty plus per-group cross-entropy.

    Args:
        column_groups: dict mapping an attribute name to the column indices of
            its block in the prediction/target tensors; each block is scored
            with cross-entropy (targets are reduced with argmax).
        valid_labels: dict mapping an attribute name to its admissible values.
            The i-th entry of this dict is applied to column ``i`` of the
            prediction/target tensors, so the dict order must follow the
            column order.
    """

    def __init__(self, column_groups, valid_labels):
        super().__init__()
        self.mse_loss = nn.MSELoss()
        self.column_groups = column_groups  # attribute name -> column indices
        self.valid_labels = {k: torch.tensor(v).float() for k, v in valid_labels.items()}
        # Smallest column index used by any group.
        self.end_numerical = int(min([min(v) for v in self.column_groups.values()]))

    def custom_closest_loss(self, predicted, true, valid_labels):
        """Mean absolute error over the samples whose nearest valid label is wrong.

        Args:
            predicted: 1-D tensor of predictions for one column.
            true: 1-D tensor of targets for the same column.
            valid_labels: 1-D tensor of admissible values for that column.

        Returns:
            Scalar tensor. It is zero (but still connected to ``predicted`` in
            the autograd graph) when every prediction already snaps to its
            true label.
        """
        # The label tensors are plain attributes, not buffers, so they stay on
        # the CPU when the model moves; bring them to the prediction's device.
        valid_labels = valid_labels.to(predicted.device)

        # (batch, 1) - (1, num_labels) -> (batch, num_labels) absolute distances.
        distances = torch.abs(predicted.unsqueeze(1) - valid_labels.unsqueeze(0))
        _, min_indices = torch.min(distances, dim=1)
        closest_labels = valid_labels[min_indices]

        # Only samples that snap to a wrong label contribute to the loss.
        mask = closest_labels != true
        if not mask.any():
            # mean() of an empty selection is NaN and would poison the whole
            # training step; return a graph-connected zero instead.
            return predicted.sum() * 0.0
        return torch.abs(predicted[mask] - true[mask]).mean()

    def forward(self, categorical_pred, categorical_true):
        """Return ``(total_loss, loss_numerical, loss_categorical)`` as scalar tensors."""
        # Start from tensors so the results support .item() even when a dict is empty.
        loss_numerical = categorical_pred.new_zeros(())
        # TODO: maybe speed it up by inserting all labels as a matrix and performing matrix operations
        for i, labels in enumerate(self.valid_labels.values()):
            loss_numerical = loss_numerical + self.custom_closest_loss(
                categorical_pred[:, i], categorical_true[:, i], labels
            )

        loss_categorical = categorical_pred.new_zeros(())
        for indices in self.column_groups.values():
            logits = categorical_pred[:, indices]
            # Targets are reduced to class indices by taking the argmax of each block.
            true_labels = torch.argmax(categorical_true[:, indices], dim=1)
            loss_categorical = loss_categorical + F.cross_entropy(logits, true_labels)

        total_loss = loss_numerical + loss_categorical
        return total_loss, loss_numerical, loss_categorical

In [18]:
from torch.utils.data import DataLoader, Dataset
from models import MultiTaskLSTM, CustomLoss, TimeSeriesDataset
from datetime import datetime

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()

# model parameters
batch_size = 16
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Every feature row is fed to the LSTM as a sequence of scalar steps.
input_size, sequence_length = 1, X_res.shape[1]
num_classes_categorical = y_res.shape[1]
hidden_size = 64
num_epochs = 20

# Weights used to combine the two loss terms for back-propagation.
weight_numerical = 10e-4
weight_categorical = 1.0

best_combined_loss = float('inf')
checkpoint_filename = base_path+ f'/kai/checkpoints/res_model_checkpoint_{datetime.now().strftime("%m_%d_%H_%M")}.pth.tar'

# create dataloaders
dataloader_res = DataLoader(TimeSeriesDataset(X_res, y_res), batch_size=batch_size, shuffle=True)

model = MultiTaskLSTM(input_size, hidden_size, num_classes_categorical)
model = model.to(device)
# The loss class defined in this notebook is CustomLoss_res; `CustomLoss_1` does not exist anywhere.
criterion = CustomLoss_res(association_dict_res, unique_values_res)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for X_batch, y_categorical_batch in dataloader_res:
        # (batch, features) -> (batch, sequence_length, input_size)
        X_batch = X_batch.view(X_batch.shape[0], sequence_length, input_size)
        X_batch = X_batch.to(device)
        y_categorical_batch = y_categorical_batch.to(device)
        optimizer.zero_grad()
        categorical_pred = model(X_batch)
        loss, loss_numerical, loss_categorical = criterion(categorical_pred, y_categorical_batch)

        # Combine losses with adjusted weights
        combined_loss = weight_numerical * loss_numerical + weight_categorical * loss_categorical

        combined_loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # NOTE: the values printed (and compared below) come from the last mini-batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Total Loss: {loss.item():.4f}, '
                  f'Numerical Loss: {loss_numerical.item():.4f}, '
                  f'Categorical Loss: {loss_categorical.item():.4f}, '
                  f'Combined Loss: {combined_loss.item():.4f}, '
                  f'weights: {weight_numerical:.4f}, {weight_categorical:.4f}')

    if combined_loss.item() < best_combined_loss:
        # Remember the new best value; without this every epoch beats the
        # initial `inf` and overwrites the checkpoint, so the "best" model
        # would simply be the last one.
        best_combined_loss = combined_loss.item()
        torch.save({'epoch': epoch + 1,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': combined_loss.item(),}, checkpoint_filename)

Epoch [1/20], Total Loss: 72215.5859, Numerical Loss: 72208.4219, Categorical Loss: 7.1626, Combined Loss: 79.3710, weights: 0.0010, 1.0000
Epoch [2/20], Total Loss: 70440.7812, Numerical Loss: 70434.2188, Categorical Loss: 6.5609, Combined Loss: 76.9951, weights: 0.0010, 1.0000
Epoch [3/20], Total Loss: 99618.4062, Numerical Loss: 99611.2344, Categorical Loss: 7.1681, Combined Loss: 106.7793, weights: 0.0010, 1.0000
Epoch [4/20], Total Loss: 101878.4922, Numerical Loss: 101872.8750, Categorical Loss: 5.6162, Combined Loss: 107.4891, weights: 0.0010, 1.0000
Epoch [5/20], Total Loss: 68431.3125, Numerical Loss: 68425.3828, Categorical Loss: 5.9283, Combined Loss: 74.3537, weights: 0.0010, 1.0000
Epoch [6/20], Total Loss: 62581.7617, Numerical Loss: 62576.2070, Categorical Loss: 5.5551, Combined Loss: 68.1314, weights: 0.0010, 1.0000
Epoch [7/20], Total Loss: 81158.4453, Numerical Loss: 81151.3750, Categorical Loss: 7.0724, Combined Loss: 88.2238, weights: 0.0010, 1.0000
Epoch [8/20], To

### TODO: postprocessing and a general analysis of the res and com labels

In [19]:
# numpy is used by both helpers below but is not imported anywhere else in this notebook.
import numpy as np


def map_to_closest_values(predictions, possible_values):
    """
    Map predicted values to the closest possible values.

    Args:
    - predictions (iterable of rows): The predicted values; the i-th entry of a
      row is matched against the i-th entry of ``possible_values``.
    - possible_values (dict): A dictionary where keys are column names and values are lists of possible values.

    Returns:
    - mapped_predictions (list of lists): The predictions mapped to the closest possible values
      (squared distance; the first candidate wins a tie).
    """
    # Convert each candidate list to an array once instead of once per row.
    candidates = [(values, np.asarray(values)) for values in possible_values.values()]

    mapped_predictions = []
    for prediction in predictions:
        mapped_predictions.append([
            values[np.argmin((value_array - pred_value) ** 2)]
            for (values, value_array), pred_value in zip(candidates, prediction)
        ])
    return mapped_predictions


def inverse_process_res(prediction, encoder):
    """Turn raw residential model outputs back into labelled columns.

    Args:
        prediction: 2-D torch tensor; the leading columns are the numeric
            targets (one per ``possible_values`` entry, in that order) and the
            remaining columns are handed to ``encoder.inverse_transform``.
        encoder: object with an ``inverse_transform`` method for the
            categorical block.

    Returns:
        pandas DataFrame with one column per residential target, numeric
        targets snapped to their closest admissible value and rendered with
        the label strings defined below.
    """
    prediction = prediction.cpu().detach().numpy()

    # Possible values for each column
    possible_values = {
        'in.bedrooms_res': [3, 2, 1, 4, 5],
        'in.cooling_setpoint_res': [68, 75, 72, 70, 78, 80, 60, 65, 76, 67, 62],
        'in.heating_setpoint_res': [75, 72, 68, 70, 62, 55, 65, 78, 76, 67, 60, 80],
        'in.geometry_floor_area_res': [1749,  874, 1249,  624, 2249, 4000, 2749, 3499,  249],
        'in.income_res': [109999,  12499,  64999,  89999,  42499,  22499,  47499, 169999, 129999,
                            200000,  37499,  54999,  32499,  10000, 189999,  74999,  17499,  27499,
                            149999],
        'in.vintage_res': [1940, 1975, 1985, 1945, 1995, 1955, 1965, 2005, 2015]
    }
    # The numeric block is as wide as the table above (was a hard-coded 6).
    n_numeric = len(possible_values)

    # Object arrays keep the snapped numbers as numbers when they are joined
    # with the decoded string labels, so the integer-keyed lookups below keep
    # matching whatever dtype the encoder returns.
    numeric_part = np.empty((len(prediction), n_numeric), dtype=object)
    numeric_part[:] = map_to_closest_values(prediction[:, :n_numeric], possible_values)
    categorical_part = np.asarray(encoder.inverse_transform(prediction[:, n_numeric:]), dtype=object)

    mapped_predictions = np.concatenate((numeric_part, categorical_part), axis=1)
    cols = ['in.bedrooms_res', 'in.cooling_setpoint_res', 'in.heating_setpoint_res',
            'in.geometry_floor_area_res', 'in.income_res', 'in.vintage_res',
            'in.geometry_building_type_recs_res', 'in.geometry_foundation_type_res', 
            'in.geometry_wall_type_res', 'in.heating_fuel_res', 'in.roof_material_res',
            'in.tenure_res', 'in.vacancy_status_res']
    
    mapped_df = pd.DataFrame(mapped_predictions, columns=cols)

    vintage_mapping = {1940:'<1940', 1945:'1940s', 1955:'1950s', 1965:'1960s',
                           1975:'1970s', 1985:'1980s', 1995:'1990s', 2005:'2000s', 2015:'2010s'}
    mapped_df['in.vintage_res'] = mapped_df['in.vintage_res'].map(vintage_mapping)
    # Setpoints are rendered as strings with an "F" suffix, e.g. 72 -> "72F".
    mapped_df['in.cooling_setpoint_res'] = mapped_df['in.cooling_setpoint_res'].apply(lambda x: str(x)+"F").astype(str)
    mapped_df['in.heating_setpoint_res'] = mapped_df['in.heating_setpoint_res'].apply(lambda x: str(x)+"F").astype(str)
    mapped_df['in.bedrooms_res'] = mapped_df['in.bedrooms_res'].astype(str)

    income_mapping = {109999: '100000-119999', 12499: '10000-14999', 64999: '60000-69999', 89999: '80000-99999', 
                      42499: '40000-44999', 22499: '20000-24999', 47499: '45000-49999', 169999: '160000-179999', 
                      129999: '120000-139999', 200000: '200000+', 37499: '35000-39999', 54999: '50000-59999', 
                      32499: '30000-34999', 10000: '<10000', 189999: '180000-199999', 74999: '70000-79999', 
                      17499: '15000-19999', 27499: '25000-29999', 149999: '140000-159999'}
    mapped_df['in.income_res'] = mapped_df['in.income_res'].map(income_mapping)

    geometry_mapping = {1749: '1500-1999', 874: '750-999', 1249: '1000-1499', 624: '500-749', 2249: '2000-2499',
                        4000: '4000+', 2749: '2500-2999', 3499: '3000-3999', 249: '0-499'}
    mapped_df['in.geometry_floor_area_res'] = mapped_df['in.geometry_floor_area_res'].map(geometry_mapping)

    return mapped_df

In [21]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
def free_gpu_memory():
    """Release cached CUDA memory and wait for pending kernels; does nothing without CUDA."""
    # Guarded so the CPU fallback chosen for `device` below keeps working.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
free_gpu_memory()

batch_size = 16
device = "cuda:0" if torch.cuda.is_available() else "cpu"
input_size, sequence_length = 1, X_res.shape[1]
num_classes_categorical = y_res.shape[1]
hidden_size = 64
num_epochs = 20
model = MultiTaskLSTM(input_size, hidden_size, num_classes_categorical)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
checkpoint_filename = base_path+ f'/kai/checkpoints/res_model_checkpoint_08_13_23_37.pth.tar'
# map_location lets a checkpoint saved on the GPU be restored on whichever device was selected above.
checkpoint = torch.load(checkpoint_filename, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
best_loss = checkpoint['loss']

predictions = []

# Test rows the classifier assigned to class 0 (residential).
df_test_res = df_test[df_test.building_stock_type == 0]
# NOTE(review): Index.difference may return the columns in sorted order - confirm this matches the training column order.
X_test_res, y_pred_res = torch.tensor(df_test_res[df_test_res.columns.difference(["building_stock_type"])].values, dtype=torch.float32), df_test_res["building_stock_type"]
# The dataset needs targets, so zero-filled placeholders of the right width are supplied; they are not used for prediction.
dataloader = DataLoader(TimeSeriesDataset(X_test_res, torch.zeros(X_test_res.shape[0], y_res.shape[1])), batch_size=16, shuffle=False)

print_gpu_memory()
# Inference mode only needs to be switched on once, not per batch.
model.eval()
for X_batch, y_categorical_batch in tqdm(dataloader):
    # (batch, features) -> (batch, sequence_length, input_size)
    _X_batch = X_batch.view(X_batch.shape[0], sequence_length, input_size)
    _X_batch = _X_batch.to(device)
    with torch.no_grad():
        categorical_pred = model.predict(_X_batch, association_dict_res)
    predictions.append(categorical_pred.cpu())
    # Drop the device tensors before the next batch to keep GPU memory flat.
    del _X_batch, categorical_pred
    free_gpu_memory()
predictions = torch.cat(predictions, dim=0)
arr_df_res = inverse_process_res(predictions, encoder_res)
# shuffle=False above keeps the prediction rows in the same order as df_test_res.
arr_df_res.index = df_test_res.index

  checkpoint = torch.load(checkpoint_filename)


Allocated memory: 0.02 GB
Cached memory: 0.03 GB
Total memory: 12.00 GB
Unused memory: 11.98 GB


100%|██████████| 50/50 [00:13<00:00,  3.64it/s]


In [22]:
# Print each residential output column name followed by its distinct predicted values.
for col in arr_df_res:
    distinct_values = arr_df_res[col].unique()
    print(col)
    print(distinct_values)

in.bedrooms_res
['2' '3']
in.cooling_setpoint_res
['72F']
in.heating_setpoint_res
['70F' '72F' '65F' '68F' '55F' '60F' '62F' '67F']
in.geometry_floor_area_res
['1500-1999' '1000-1499']
in.income_res
['<10000']
in.vintage_res
['1990s' '1980s' '1950s' '1960s' '1970s']
in.geometry_building_type_recs_res
['Single-Family Detached' 'Multi-Family with 5+ Units']
in.geometry_foundation_type_res
['Slab']
in.geometry_wall_type_res
['Wood Frame']
in.heating_fuel_res
['Natural Gas' 'Electricity']
in.roof_material_res
['Composition Shingles' 'Asphalt Shingles, Medium']
in.tenure_res
['Renter' 'Owner']
in.vacancy_status_res
['Occupied' 'Vacant']


## create the submission

In [23]:
from run import create_submission

# Build the submission frame from the commercial and residential predictions plus the test frame;
# save_filepath is presumably where create_submission writes the result - confirm in run.py.
# NOTE(review): arr_df_com is only assigned inside a commented-out cell above (the live cell
# produces arr_com) - confirm it is defined before running this cell on a fresh kernel.
df = create_submission(arr_df_com, arr_df_res, df_test, save_filepath="submission.parquet")
print(df)

        building_stock_type in.comstock_building_type_group_com  \
bldg_id                                                           
1               residential                                 nan   
2               residential                                 nan   
3                commercial                          Mercantile   
4                commercial                          Mercantile   
5               residential                                 nan   
...                     ...                                 ...   
1436             commercial                          Mercantile   
1437            residential                                 nan   
1438            residential                                 nan   
1439             commercial                          Mercantile   
1440            residential                                 nan   

        in.heating_fuel_com in.hvac_category_com in.number_of_stories_com  \
bldg_id                                            