In [1]:
import torch
print("Cuda available: ", torch.cuda.is_available())
print("Cuda device count: ", torch.cuda.device_count())
print("Cuda current device: ", torch.cuda.current_device())
print("Cuda device name: ", torch.cuda.get_device_name(0))
print("Cuda device capability: ", torch.cuda.get_device_capability(0))
print("Cuda device memory: ", torch.cuda.get_device_properties(0).total_memory)
print("Cuda device memory: ", torch.cuda.get_device_properties(0).total_memory/1024**3, "GB")
print("Cuda device memory: ", torch.cuda.get_device_properties(0).total_memory/1024**2, "MB")
print("Cuda device memory: ", torch.cuda.get_device_properties(0).total_memory/1024, "KB")
# version
print("Cuda version: ", torch.version.cuda)
print("Cuda version: ", torch.version.cuda.split("."))
print("Cuda version: ", torch.version.cuda.split(".")[0])
print("Cuda version: ", torch.version.cuda.split(".")[1])

Cuda available:  True
Cuda device count:  1
Cuda current device:  0
Cuda device name:  NVIDIA GeForce RTX 3060
Cuda device capability:  (8, 6)
Cuda device memory:  12884246528
Cuda device memory:  11.9993896484375 GB
Cuda device memory:  12287.375 MB
Cuda device memory:  12582272.0 KB
Cuda version:  11.8
Cuda version:  ['11', '8']
Cuda version:  11
Cuda version:  8


In [2]:
# importaing the required libraries
import os
import sys
base_path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification"
data_path = r"\data\building-instinct-starter-notebook\Starter notebook"
sys.path.append(base_path+data_path)
path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification\kai"
sys.path.append(base_path+path)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm

from utils import (calculate_average_hourly_energy_consumption, train_model, get_pred, calculate_hierarchical_f1_score,
sample_submission_generator)

In [3]:
def load_standard_df(folder_path):
    """
    Process multiple parquet files in a folder and return a pandas DataFrame with each row corresponding to one file in the folder.

    Parameters:
    - folder_path (str): Path to the folder containing parquet files.
    
    Returns:
    - df (pd.DataFrame): A pandas DataFrame with each row corresponding to one file in the folder (i.e. one building).
    """
    # Initialize an empty list to store individual DataFrames for each file
    result_dfs = []

    # Iterate through all files in the folder_path
    for file_name in tqdm(os.listdir(folder_path)[:1000]):
        if file_name.endswith(".parquet"):
            # Extract the bldg_id from the file name
            bldg_id = int(file_name.split('.')[0])

            # Construct the full file path
            file_path = os.path.join(folder_path, file_name)

            # Read the original parquet file
            df = pd.read_parquet(file_path)

            # Convert 'timestamp' column to datetime
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            result_df = df.pivot_table(values='out.electricity.total.energy_consumption', index='bldg_id', columns=['timestamp'])

            # Add 'bldg_id' index with values corresponding to the names of the parquet files
            result_df['bldg_id'] = bldg_id
            result_df.set_index('bldg_id', inplace=True)

            # Append the result_df to the list
            result_dfs.append(result_df)

    # Concatenate all individual DataFrames into a single DataFrame
    output_df = pd.concat(result_dfs, ignore_index=False)

    return output_df

In [4]:
file_path = base_path + data_path + r"\building-instinct-train-data"
df_features = load_standard_df(file_path)

100%|██████████| 1000/1000 [02:01<00:00,  8.25it/s]


In [6]:
df_features.sort_index(inplace=True)
print(df_features.head())

timestamp  2018-01-01 00:15:00  2018-01-01 00:30:00  2018-01-01 00:45:00  \
bldg_id                                                                    
1                     2.288000             2.190000             2.101000   
10                    0.188000             0.181000             0.184000   
11                    0.336000             0.336000             0.216000   
12                   15.851305            15.081237            14.091116   
13                    1.871387             1.883041             1.893783   

timestamp  2018-01-01 01:00:00  2018-01-01 01:15:00  2018-01-01 01:30:00  \
bldg_id                                                                    
1                     2.016000             2.027000             2.050000   
10                    0.303000             0.311000             0.331000   
11                    0.216000             0.212000             0.212000   
12                   13.586164            12.635340            12.166746   
13         

In [10]:
data = torch.tensor(df_features.values, dtype=torch.float32)
print(data)
print(data.shape[1])

tensor([[ 2.2880,  2.1900,  2.1010,  ...,  0.2040,  0.3400,  0.3750],
        [ 0.1880,  0.1810,  0.1840,  ...,  0.2130,  0.2080,  0.2040],
        [ 0.3360,  0.3360,  0.2160,  ...,  0.3160,  0.3160,  0.3890],
        ...,
        [56.1725, 58.3277, 59.9776,  ..., 51.5117, 53.8454, 53.7773],
        [ 2.9919,  2.8974,  2.8502,  ...,  3.1664,  3.1336,  3.0391],
        [ 0.5830,  0.5340,  0.4900,  ...,  0.1070,  0.2210,  0.2570]])
35040


In [80]:
def preprocess_com(df_com):
    if "residential" in df_com.building_stock_type:
        print("Residential still present")

    com_exception_cols = ["in.number_of_stories_com", "in.vintage_com", "in.tstat_clg_sp_f..f_com", "in.tstat_htg_sp_f..f_com", 
                        "in.weekday_opening_time..hr_com", "in.weekday_operating_hours..hr_com"]

    # columns that are simply transformed to numeric
    for col in com_exception_cols:
        if col == "in.vintage_com":
            period_dict = {'Before 1946': 1940,  # Using an estimated middle year
                            '1946 to 1959': 1952,
                            '1960 to 1969': 1965,
                            '1970 to 1979': 1975,
                            '1980 to 1989': 1985,
                            '1990 to 1999': 1995,
                            '2000 to 2012': 2006,
                            '2013 to 2018': 2016
                            }
            df_com['in.vintage_com'] = df_com['in.vintage_com'].map(period_dict)
        else:
            df_com[col] = pd.to_numeric(df_com[col])

    # One-hot encode categorical features
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    categorical_encoded = one_hot_encoder.fit_transform(df_com[df_com.columns.difference(com_exception_cols)])
    categorical_encoded = pd.DataFrame(categorical_encoded)
    categorical_encoded.index = df_com.index
    df_com = pd.concat([df_com[com_exception_cols], pd.DataFrame(categorical_encoded)], axis=1)
    return df_com.fillna(0)

def convert_income_range_to_mean(income):
    if '+' in income:
        return float(income[:-1])  # Take the numeric part of '200000+'
    if '<' in income:
        return float(income[1:])  # Take the numeric part of '<10000'
    
    lower, upper = map(int, income.split('-'))
    return int((lower + upper) / 2)

def convert_geometry_to_mean(area):
    if '+' in area:
        return float(area[:-1])  # Take the numeric part of '4000+'
    lower, upper = map(int, area.split('-'))
    return int((lower + upper) / 2)

def preprocess_res(df_res):
    if "commercial" in df_res.building_stock_type:
        print("Commercial still present")

    res_exception_cols = ["in.bedrooms_res", "in.cooling_setpoint_res", "in.heating_setpoint_res", "in.geometry_floor_area_res", 
                        "in.income_res", "in.vintage_res"]
    
    df_res["in.bedrooms_res"] = pd.to_numeric(df_res['in.bedrooms_res'])
    
    # handling the temperature columns f.e. going from 78F to 78
    df_res['in.cooling_setpoint_res'] = df_res['in.cooling_setpoint_res'].apply(lambda temp: float(temp[:-1]))
    df_res['in.heating_setpoint_res'] = df_res['in.heating_setpoint_res'].apply(lambda temp: float(temp[:-1]))

    df_res['in.geometry_floor_area_res'] = df_res['in.geometry_floor_area_res'].apply(convert_geometry_to_mean)
    df_res['in.income_res'] = df_res['in.income_res'].apply(convert_income_range_to_mean)

    vintage_mapping = {
        '<1940': 1940,
        '1940s': 1945,
        '1950s': 1955,
        '1960s': 1965,
        '1970s': 1975,
        '1980s': 1985,
        '1990s': 1995,
        '2000s': 2005,
        '2010s': 2015
    }

    df_res['in.vintage_res'] = df_res['in.vintage_res'].map(vintage_mapping)
    # One-hot encode categorical features
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    categorical_encoded = one_hot_encoder.fit_transform(df_res[df_res.columns.difference(res_exception_cols)])
    categorical_encoded = pd.DataFrame(categorical_encoded)
    categorical_encoded.index = df_res.index
    df_res = pd.concat([df_res[res_exception_cols], pd.DataFrame(categorical_encoded)], axis=1)
    return df_res.fillna(0)

In [81]:
from sklearn.preprocessing import LabelEncoder
load_filepath_labels = os.path.join(base_path + data_path,'building-instinct-train-label', 'train_label.parquet')#path to the train label file
df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
df_targets = df_targets[:1000]

df_targets_res = df_targets[df_targets.building_stock_type == "residential"].copy()
df_targets_com = df_targets[df_targets.building_stock_type == "commercial"].copy()
df_targets_res = preprocess_res(df_targets_res)
df_targets_com = preprocess_com(df_targets_com)

# print(df_targets.columns)
# for col in df_targets.columns:
#     print(f"{col} - {df_targets[col].unique()}")
# print(df_targets.nunique())
# Fill NaNs with a placeholder
# df_categorical = df_targets.fillna('Unknown')
# print(df_targets.isna().sum())

# print(categorical_encoded.shape)
# combined_data = np.hstack([time_series_data, categorical_encoded])

In [82]:
print(df_targets_com)

         in.number_of_stories_com  in.vintage_com  in.tstat_clg_sp_f..f_com  \
bldg_id                                                                       
3                               2            1995                       999   
5                               1            2006                       999   
6                               1            1965                        72   
9                               1            1985                       999   
12                              1            1940                        72   
...                           ...             ...                       ...   
992                             1            1995                        73   
993                             1            1995                        74   
994                             2            1952                        71   
995                             1            2006                        76   
996                             1            1975   

In [None]:
from torch.utils.data import DataLoader, Dataset
# Define custom dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, data, y_time_series, y_household_type, y_categorical):
        self.data = data
        self.y_time_series = y_time_series
        self.y_household_type = y_household_type
        self.y_categorical = y_categorical
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return (self.data[idx], 
                self.y_time_series[idx], 
                self.y_household_type[idx], 
                self.y_categorical[idx])

# Assuming y_time_series is same as data for simplicity
dataset = TimeSeriesDataset(data, data, y_household_type, y_categorical)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Sample dataset (replace this with actual dataset loading)
# X is the time series data with shape (num_samples, sequence_length, num_features)
# y_time_series is the target time series with shape (num_samples, sequence_length, num_features)
# y_household_type is the household category with shape (num_samples,)
# y_categorical is the additional categorical attribute with shape (num_samples, num_attributes)

X = torch.randn(100, 96, 1)  # Example shape for 100 samples, 96 timesteps, 1 feature
y_time_series = torch.randn(100, 96, 1)
y_household_type = torch.randint(0, 2, (100,))
y_categorical = torch.randint(0, 3, (100, 2))

dataset = TensorDataset(X, y_time_series, y_household_type, y_categorical)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
import torch
import torch.nn as nn

class MultiTaskLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes_household, num_classes_categorical):
        super(MultiTaskLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        
        # Output layers for different tasks
        self.time_series_head = nn.Linear(hidden_size, input_size)
        self.household_type_head = nn.Linear(hidden_size, num_classes_household)
        self.categorical_head = nn.Linear(hidden_size, num_classes_categorical)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take the last output for classification tasks
        
        time_series_pred = self.time_series_head(lstm_out)
        household_type_pred = self.household_type_head(lstm_out)
        categorical_pred = self.categorical_head(lstm_out)
        
        return time_series_pred, household_type_pred, categorical_pred

input_size = 1  # Number of features in time series
hidden_size = 128
num_classes_household = 2
num_classes_categorical = 3

model = MultiTaskLSTM(input_size, hidden_size, num_classes_household, num_classes_categorical)


In [None]:
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()
        self.mse_loss = nn.MSELoss()
        self.cross_entropy_loss = nn.CrossEntropyLoss()
    
    def forward(self, time_series_pred, time_series_true, 
                household_type_pred, household_type_true, 
                categorical_pred, categorical_true):
        loss_time_series = self.mse_loss(time_series_pred, time_series_true)
        loss_household_type = self.cross_entropy_loss(household_type_pred, household_type_true)
        
        # If multiple categorical attributes, apply CE loss for each and sum them
        loss_categorical = sum([self.cross_entropy_loss(categorical_pred[:, i], categorical_true[:, i]) 
                                for i in range(categorical_true.size(1))])
        
        total_loss = loss_time_series + loss_household_type + loss_categorical
        return total_loss

criterion = CustomLoss()


In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for X_batch, y_time_series_batch, y_household_type_batch, y_categorical_batch in dataloader:
        optimizer.zero_grad()
        
        time_series_pred, household_type_pred, categorical_pred = model(X_batch)
        
        loss = criterion(time_series_pred, y_time_series_batch, 
                         household_type_pred, y_household_type_batch, 
                         categorical_pred, y_categorical_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader)}")
