In [2]:
import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [4]:
device = 'cpu'

In [101]:
columns_config = {
    'numerical': ['square_feet', 'year_built', 'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 
                 'sea_level_pressure', 'wind_speed', 'mean_target'],
    'categorical': [
#         'site_id', 'building_id', 'primary_use', 
        'wind_direction_cat', 'month', 'hour', 'season', 'daytime']
}

In [6]:
class Net(nn.Module):
    def __init__(self, d_in=10, k=2, n_hidden=1, batch_norm=False, dropout=False):
        super(Net, self).__init__()
        d_cur = d_in
        self.layers = []
        for i in range(n_hidden):
            self.layers.append(nn.Linear(d_cur, d_cur // k))
            if batch_norm:
                self.layers.append(nn.BatchNorm1d(d_cur // k))
            self.layers.append(nn.ReLU())
            if dropout:
                self.layers.append(nn.Dropout())
            d_cur //= k
        self.layers.append(nn.Linear(d_cur, 1))
        self.layers = nn.ModuleList(self.layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [119]:
# Utils

# Prepare data

def select_meter(df, meter=1):
    df = df[df.meter == meter]
    df.drop(columns='meter', inplace=True)
    return df

def filter_wind(weather_df):
    weather_df.loc[weather_df.wind_direction + weather_df.wind_speed == 0, ['wind_direction', 'wind_speed']] = np.NaN
    return weather_df

def merge(data, weather, meta):
    df = meta.merge(data, on='building_id')
    df = df.merge(weather, on=['site_id', 'timestamp'])
    return df

def filter_zero_targets(df):
    df = df[df.meter_reading != 0]
    return df

def create_new_features(df):
    df['month'] = df.timestamp.apply(lambda x: time.strptime(x ,"%Y-%m-%d %H:%M:%S").tm_mon)
    df['hour'] = df.timestamp.apply(lambda x: time.strptime(x ,"%Y-%m-%d %H:%M:%S").tm_hour)
    
    df['season'] = df['month'] % 12 // 3
    df['daytime'] = df['hour'] // 5    
    return df

def prepare_data(meter=1, fast_debug=False):
    meta = pd.read_csv('data/building_metadata.csv')
    train = pd.read_csv('data/train.csv')
    weather = pd.read_csv('data/weather_train.csv')
    
    train = select_meter(train, meter)
    weather = filter_wind(weather)
    df = merge(train, weather, meta)
    
    if fast_debug:
        # building_ids = [1109, 1130, 1363, 1377]
        building_ids = np.random.choice(df.building_id.unique(), 100, replace=False)
        df = df[df.building_id.isin(building_ids)]

    df = filter_zero_targets(df)
    df = create_new_features(df)
    return df

def prepare_test_data(train_df, meter=1):
    meta = pd.read_csv('data/building_metadata.csv')
    test = pd.read_csv('data/test.csv')
    weather = pd.read_csv('data/weather_test.csv')
    
    test = select_meter(test, meter)
    weather = filter_wind(weather)
    df = merge(test, weather, meta)
    
    building_ids = train_df.building_id.unique()
    df = df[df.building_id.isin(building_ids)]

    df['meter_reading'] = np.NaN
    df = create_new_features(df)
    return df

# def filter_building(building_id):
#     return df[df.building_id == building_id]
    
def save_results(submission, name):
    submission.to_csv('results/%s' % name)
    
    
# Preprocess data
class Preprocessor:
    def __init__(self, df):
        self.df = df
        
        self.train_idx = self.train_test_split()
        self.df = self.create_mean_target()
        self.df = self.fill_nans()
        self.df = self.create_wind_cat()
        
    def train_test_split(self):
        if 'row_id' in self.df.columns:
            train_idx = self.df[~pd.isna(self.df.meter_reading)].index
            
        else:
            train_idx = np.random.choice(self.df.index, len(self.df) * 7 // 10, replace=False)
        return train_idx
        
    def create_mean_target(self):
        mean_targets = pd.DataFrame(data=self.df[self.df.index.isin(self.train_idx)].groupby('building_id').meter_reading.mean())
        mean_targets.columns = ['mean_target']
        
        self.df['tmp_index'] = self.df.index
        self.df.index = self.df.building_id
        self.df['mean_target'] = mean_targets
        self.df.index = self.df.tmp_index
        
        return self.df
    
    def fill_nans(self):
        for col in ['square_feet', 'year_built', 'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature', 
                    'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']:
            self.df[col] = self.df[col].fillna(self.df.loc[self.train_idx, col].mean())
        return self.df
    
    def create_wind_cat(self):
        self.df['wind_direction_cat'] = self.df['wind_direction'] // 45
        return self.df

# Scale data
class Scaler:
    def __init__(self, preprocessor, batch_size=512, prod=False):
        self.df = preprocessor.df
        self.train_idx = preprocessor.train_idx
        
        self.scaler_labels = None
        self.scaler_features = None
        self.encoders = {}
        
        self.create_scalers()
        
        cat_train, num_train, labels_train = self.transform(self.df.loc[self.train_idx])
        cat_test, num_test, labels_test = self.transform(self.df[~self.df.index.isin(self.train_idx)])
        self.d_in = cat_train.shape[1] + num_train.shape[1]
        
        if prod:
            self.testloader = self.create_dataloader(cat_test, num_test, labels_test, batch_size * 100, shuffle=False, add_row_ids=True,
                                             row_ids=self.df[~self.df.index.isin(self.train_idx)].row_id.values.reshape(-1, 1))
        else:
            self.testloader = self.create_dataloader(cat_test, num_test, labels_test, batch_size)
        self.trainloader = self.create_dataloader(cat_train, num_train, labels_train, batch_size)
  
    def create_scalers(self):
        self.scaler_features = StandardScaler()
        self.scaler_labels = StandardScaler()

        self.scaler_features.fit(self.df.loc[self.train_idx, columns_config['numerical']])
        self.scaler_labels.fit(self.df.loc[self.train_idx, 'meter_reading'].values.reshape(-1, 1))
        for col in columns_config['categorical']:
            self.encoders[col] = OneHotEncoder(handle_unknown='ignore', sparse=False)
            self.encoders[col].fit(self.df.loc[self.train_idx, col].values.reshape(-1, 1))
        
    def transform(self, data):
        num_features = self.scaler_features.transform(data.loc[:, columns_config['numerical']])
        labels = self.scaler_labels.transform(data.loc[:, 'meter_reading'].values.reshape(-1, 1))
        cat_features = []
        for col in columns_config['categorical']:
            cat_features.append(self.encoders[col].transform(data[col].values.reshape(-1, 1)))
        cat_features = np.concatenate(cat_features, axis=1)
        return cat_features, num_features, labels
    
    @staticmethod
    def create_dataloader(cat, num, labels, batch_size, shuffle=True, add_row_ids=False, row_ids=None):
        data = [cat, num]
        if add_row_ids:
            data.append(row_ids)
        dataset = TensorDataset(torch.Tensor(np.concatenate(data, 1)).to(device), torch.Tensor(labels).to(device))
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
        return dataloader
    
# Train
class Trainer:
    def __init__(self, scaler, net_config, lr=0.001):
        self.trainloader = scaler.trainloader
        self.testloader = scaler.testloader
        self.scaler_labels = scaler.scaler_labels

        self.optimizer = None
        self.criterion = None
        self.net = None
        
        net_config['d_in'] = scaler.d_in
        self.create_models(net_config, lr)
        
        self.train_losses = []
        self.test_losses = []
        self.metrics = []
        
    def create_models(self, net_config, lr):
        self.net = Net(**net_config).to(device)
        print('Net architecture:')
        print(self.net)
        
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)
    
    def metric(self, pred, labels):
        pred_raw = self.scaler_labels.inverse_transform(pred.detach().cpu().numpy())
        labels_raw = self.scaler_labels.inverse_transform(labels.detach().cpu().numpy())
        pred_raw[pred_raw < 0] = 0
        loss =  np.mean((np.log(pred_raw + 1) - np.log(labels_raw + 1)) ** 2) ** 0.5
        return loss
    
    def train(self, n_epochs, verbose=True, do_val=True):
        for epoch in range(n_epochs):
            self.net.train()
            losses = []
            for i, data in enumerate(self.trainloader, 0):
                inputs, labels = data
                self.optimizer.zero_grad()
                outputs = self.net(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())
            if verbose:
                print('[%d] Train loss: %.3f' % (epoch + 1, np.mean(losses)))
            self.train_losses.append(np.mean(losses))

            if do_val:
                self.net.eval()
                losses = []
                metrics = []
                for i, data in enumerate(self.testloader, 0):
                    inputs, labels = data
                    with torch.no_grad():
                        outputs = self.net(inputs)
                    loss = self.criterion(outputs, labels)
                    losses.append(loss.item())
                    metrics.append(self.metric(outputs, labels))
                if verbose:
                    print('[%d] Test loss: %.3f' % (epoch + 1, np.mean(losses)))
                    print('[%d] Test metric: %.3f' % (epoch + 1, np.mean(metrics)))
                self.test_losses.append(np.mean(losses))
                self.metrics.append(np.mean(metrics))
                
    def predict(self, submission):
        self.net.eval()
        for data in self.testloader:
            inputs, labels = data
            inputs, row_ids = inputs[:, :-1], inputs[:, -1]
            print(row_ids)
            with torch.no_grad():
                outputs = self.net(inputs)
            pred_raw = self.scaler_labels.inverse_transform(outputs.detach().cpu().numpy())
            pred_raw[pred_raw < 0] = 0
            submission.loc[row_ids, 'meter_reading'] = pred_raw
        return submission

    def plot(self):
        f, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 10))
        ax1.plot(self.train_losses, color='b')
        ax1.plot(self.test_losses, color='y')
        ax2.plot(self.metrics, color='y')
        plt.show()
        
    def save_model(self, name):
        torch.save(self.net, 'models/' + name)
        

In [9]:
df = prepare_data(meter=0, fast_debug=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [19]:
test_df = prepare_test_data(df, meter=0)

In [61]:
df['row_id'] = np.NaN
df_all = pd.concat([df, test_df], axis=0)
df_all = df_all.reset_index()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [86]:
df_all.building_id.unique()

array([ 103,   78,   53,   69,   30,   44,   51,   67,   72,   80,   45,
        123,  192,  203,  208,  220,  231,  233,  237,  239,  265,  270,
        289,  308,  318,  341,  355,  365,  369,  376,  379,  418,  430,
        450,  467,  485,  488,  496,  536,  542,  546,  556,  300,  584,
        587,  603,  642,  644,  669,  701,  711,  712,  727,  769,  776,
        785,  803,  809,  810,  813,  823,  831,  832,  835,  842,  862,
        865,  903,  928,  963,  965,  992,  892, 1006, 1014, 1019, 1039,
       1043, 1047, 1056, 1065, 1070, 1098, 1100, 1106, 1124, 1125, 1146,
       1230, 1231, 1254, 1260, 1266, 1279, 1325, 1348, 1378, 1381, 1389,
       1440], dtype=int64)

In [120]:
df_b = df_all[df_all.building_id == 103]
preprocessor = Preprocessor(df_b)
scaler = Scaler(preprocessor, prod=True)
trainer = Trainer(scaler, net_config, lr=0.001)
trainer.train(10, verbose=True, do_val=False)
submission = trainer.predict(submission)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return self.pa

Net architecture:
Net(
  (layers): ModuleList(
    (0): Linear(in_features=64, out_features=21, bias=True)
    (1): BatchNorm1d(21, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5)
    (4): Linear(in_features=21, out_features=1, bias=True)
  )
)
[1] Train loss: 0.983
[2] Train loss: 0.851
[3] Train loss: 0.746
[4] Train loss: 0.651
[5] Train loss: 0.555
[6] Train loss: 0.467
[7] Train loss: 0.405
[8] Train loss: 0.359
[9] Train loss: 0.328
[10] Train loss: 0.309
tensor([1.2600e+02, 2.5500e+02, 3.8400e+02,  ..., 2.2598e+06, 2.2599e+06,
        2.2601e+06])


In [122]:
submission.loc[126]

row_id           126.000000
meter_reading     31.236124
Name: 126, dtype: float64

In [None]:
test = pd.read_csv('data/)

In [63]:
preprocessor = Preprocessor(df_all)

In [77]:
scaler = Scaler(preprocessor, prod=True)

  return self.partial_fit(X, y)


In [78]:
net_config = {
    'n_hidden': 1,
    'batch_norm': True,
    'dropout': True,
    'k': 3
}

trainer = Trainer(scaler, net_config, lr=0.001)

Net architecture:
Net(
  (layers): ModuleList(
    (0): Linear(in_features=63, out_features=21, bias=True)
    (1): BatchNorm1d(21, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5)
    (4): Linear(in_features=21, out_features=1, bias=True)
  )
)


In [79]:
trainer.train(150, verbose=True, do_val=False)

[1] Train loss: 0.161
[2] Train loss: 0.112
[3] Train loss: 0.109
[4] Train loss: 0.108
[5] Train loss: 0.106
[6] Train loss: 0.107
[7] Train loss: 0.107
[8] Train loss: 0.108
[9] Train loss: 0.106
[10] Train loss: 0.105
[11] Train loss: 0.105
[12] Train loss: 0.105
[13] Train loss: 0.107
[14] Train loss: 0.106
[15] Train loss: 0.106
[16] Train loss: 0.103
[17] Train loss: 0.104
[18] Train loss: 0.104
[19] Train loss: 0.107
[20] Train loss: 0.105


KeyboardInterrupt: 

In [88]:
submission = pd.read_csv('data/sample_submission.csv')

In [None]:
net_config = {
    'n_hidden': 1,
    'batch_norm': True,
    'dropout': True,
    'k': 3
}
    
for building in buildings:
    for meter in meters:
        df = prepare_data(meter=meter, building_id=building)
        preprocessor = Preprocessor(df)
        scaler = Scaler(preprocessor)
        trainer = Trainer(scaler, net_config, lr=0.001)
        trainer.train(150, verbose=False)
        print('Building %d, meter %d, last test loss %.4f, last test metric %.4f' % (building, meter, trainer.test_losses[-1],
                                                                                    trainer.metrics[-1]))
        trainer.save_model('model_%d_%d.pkl' % (building, meter))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
