In [68]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tqdm import tqdm

In [69]:
# NOTE(review): pickle files can execute arbitrary code when loaded — only read
# train.pkl / test.pkl from a trusted source.
# Load each split, rename the target column to "y", then replace spaces in every
# column name with underscores.
df = (
    pd.read_pickle("train.pkl")
    .rename(columns={"PM2.5_target": "y"})
    .rename(columns=lambda name: name.replace(" ", "_"))
)
df_test = (
    pd.read_pickle("test.pkl")
    .rename(columns={"PM2.5_target": "y"})
    .rename(columns=lambda name: name.replace(" ", "_"))
)

# Restrict the training split to observations dated 2018-01-01 or later
train_dates = df.index.get_level_values("date")
df = df.loc[train_dates >= "2018-01-01"]

In [70]:
def _add_calendar_columns(frame):
    """Add year/month/day/hour columns taken from the "date" index level.

    The frame is modified in place and also returned so the call can be chained.
    """
    dates = frame.index.get_level_values("date")
    frame["year"] = dates.year
    frame["month"] = dates.month
    frame["day"] = dates.day
    frame["hour"] = dates.hour
    return frame


# Calendar columns that get one-hot encoded ("year" is kept as a plain numeric column)
CALENDAR_DUMMIES = ["month", "day", "hour"]

# --- Training split ---------------------------------------------------------
df = _add_calendar_columns(df)
cols_before_dummies = set(df.columns)
df = pd.get_dummies(df, columns=CALENDAR_DUMMIES, drop_first=True)
# Remember exactly which dummy columns the training split produced so the test
# split can be forced onto the same encoding below.
train_dummy_cols = [c for c in df.columns if c not in cols_before_dummies]

# Drop columns with only NaNs
df = df.dropna(axis=1, how="all")

# If the target is missing at the very first timestamp, use the mean over the
# cities observed at that timestamp so the forward fill below has a start value.
train_dates = df.index.get_level_values("date")
day1 = train_dates == train_dates.min()
df.loc[day1, "y"] = df.loc[day1, "y"].fillna(df.loc[day1, "y"].mean())

# Fill any remaining missing targets with the previous value (by city)
df["y"] = df.groupby("city")["y"].ffill()

# --- Test split -------------------------------------------------------------
# Encode WITHOUT drop_first: which level is "first" depends on the dates present
# in the test split, so dropping it here could remove a different baseline than
# in training. Keeping only the columns that training has removes the training
# baseline instead.
df_test = _add_calendar_columns(df_test)
df_test = pd.get_dummies(df_test, columns=CALENDAR_DUMMIES, drop_first=False)

# Keep the columns shared with the training split, in a deterministic order
shared_cols = [c for c in df_test.columns if c in df.columns]
df_test = df_test[shared_cols].copy()

# Calendar levels that never occur in the test split still need a (constant
# zero) column, otherwise selecting the training features from df_test fails.
for col in train_dummy_cols:
    if col not in df_test.columns:
        df_test[col] = np.zeros(len(df_test), dtype=df[col].dtype)

In [71]:
# Store the natural log of the target next to it.
# NOTE(review): np.log gives -inf for 0 and NaN for negative input — assumes y > 0; confirm.
df["logy"] = df["y"].pipe(np.log)

## Feature Imputation

In [72]:
# Fraction of missing values in each column, most incomplete columns first
na_share = df.isna().mean()
na_share.sort_values(ascending=False)

THC       0.965903
HCHO      0.940675
Hg        0.939259
MH        0.935458
CH4       0.818199
            ...   
day_5     0.000000
day_4     0.000000
day_3     0.000000
day_2     0.000000
day_10    0.000000
Length: 97, dtype: float64

In [73]:
from lightgbm import LGBMRegressor

In [74]:
# Rank the candidate features (everything except the target columns) by their
# share of missing values, then split them into fully observed predictors and
# features that still need imputation (least incomplete first).
features = df.drop(columns=["y", "PM2.5", "logy"]).isna().mean().sort_values()
is_complete = features.eq(0)
features_full = features.index[is_complete].tolist()
features_to_impute = features.index[features.gt(0)].tolist()

In [75]:
# Impute one feature at a time with LightGBM, using every feature that is already
# complete as a predictor. Features are visited in the order of
# `features_to_impute`; each imputed feature becomes a predictor for the next ones.
for feature in tqdm(features_to_impute, total=len(features_to_impute)):
    # Make LGBM data
    X = df[features_full]
    X_test = df_test[features_full]
    y = df[feature]

    # Only rows where the feature is actually observed are valid training
    # labels; the missing rows are exactly what we want to predict.
    observed = y.notna()

    # Train LGBM on the observed rows, then predict for every row
    lgbm = LGBMRegressor(verbose=0, n_jobs=-1).fit(X[observed], y[observed])
    y_pred = lgbm.predict(X)
    y_pred_test = lgbm.predict(X_test)

    # Fill NaNs. Assign the result back instead of calling fillna(inplace=True)
    # on a column selection, which is chained assignment and does not update
    # the parent frame under pandas copy-on-write.
    df[feature] = df[feature].fillna(pd.Series(y_pred, index=df.index))
    df_test[feature] = df_test[feature].fillna(pd.Series(y_pred_test, index=df_test.index))

    # We can now use this feature for imputing other features
    features_full.append(feature)

100%|██████████| 27/27 [00:29<00:00,  1.11s/it]


In [76]:
# Everything except the target columns is a model input
excluded = {"y", "PM2.5", "logy"}
features = [col for col in df.columns if col not in excluded]

# Target column selectors: raw target and log target
out1, out2 = ["y"], ["logy"]

In [77]:
# Design matrices for both splits, plus the log target as a one-column frame
X_train, X_test = df[features], df_test[features]
y = df[out2]

## LGBM Predictions

### 1. Naïve

In [78]:
# Fit LightGBM on all features against the log target, then map the test
# predictions back to the original scale with exp.
lgbm = LGBMRegressor(verbose=0)
lgbm.fit(X_train, y)
log_pred_test = lgbm.predict(X_test)
y_pred = np.exp(log_pred_test)

In [79]:
# Wrap the predictions in a frame that shares the test index (single column named 0)
preds = pd.DataFrame(data=y_pred, index=df_test.index)

In [80]:
# Persist the naïve LGBM prediction
preds.to_pickle("00.pkl")

### 2. Wide Format

In [81]:
# Idea: let each city's model also see the contemporaneous values of all other
# cities. For that, reshape the long frames to one row per date.
def _pivot_by_city(long_frame, value_cols):
    """Pivot to one row per "date" with float columns named "<column>_<city>"."""
    wide = long_frame.reset_index().pivot(index="date", columns="city", values=value_cols)
    # Flatten the (column, city) header into a single string per column
    wide.columns = wide.columns.map('_'.join).str.strip('_')
    return wide.astype(float)


df_wide = _pivot_by_city(df, features + out2)
df_wide_test = _pivot_by_city(df_test, features)

# Model inputs are all wide columns except the per-city target columns
features = [c for c in df_wide.columns if not c.startswith(("y_", "logy_"))]

In [82]:
# Alphabetical list of cities, the matching wide target column names, and an
# empty result frame with one "y" cell per test row.
city_labels = df.index.get_level_values("city").unique()
cities = sorted(city_labels.to_list())
out2 = [f"logy_{city}" for city in cities]
pred = pd.DataFrame(columns=["y"], index=df_test.index)

In [83]:
# One LightGBM model per city, each trained on the wide (all-city) feature matrix
for city in tqdm(cities, total=len(cities), unit="city", desc="Fitting models"):
    # Train LGBM on the dates where this city's log target is available; the
    # pivot leaves NaN for any date the city has no row for.
    target = df_wide[f"logy_{city}"]
    labelled = target.notna()
    lgbm = LGBMRegressor(verbose=0).fit(df_wide.loc[labelled, features], target[labelled])

    # Predictions come back in the row order of df_wide_test (one per date)
    y_pred = lgbm.predict(df_wide_test[features])
    city_pred = pd.Series(np.exp(y_pred), index=df_wide_test.index)

    # Write them back by date rather than by position, so the result does not
    # depend on df_test being sorted the same way as the pivoted frame.
    city_rows = pred.index.get_level_values("city") == city
    city_dates = pred.index[city_rows].get_level_values("date")
    pred.loc[city_rows, "y"] = city_pred.reindex(city_dates).to_numpy()

Fitting models: 100%|██████████| 12/12 [00:17<00:00,  1.44s/city]


In [85]:
# Persist the LGBM prediction that uses contemporaneous values of other cities
pred.to_pickle("00_wide.pkl")

## LSTM Predictions

In [89]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler

In [88]:
class PMDataset(Dataset):
    """Map-style dataset serving consecutive, non-overlapping windows of rows.

    Item ``i`` is the pair ``(X[i*seq_len:(i+1)*seq_len], y[i*seq_len:(i+1)*seq_len])``.

    Parameters
    ----------
    X, y : array-like
        Features and targets with the same number of rows; converted with
        ``torch.Tensor``.
    seq_len : int
        Number of consecutive rows per window (must be positive).
    enforce_shape : bool, default False
        If True and the number of rows is not a multiple of ``seq_len``, zero
        rows are appended to X and y so that no row is left out. The number of
        appended rows is stored in ``self.added`` (0 when nothing was appended).
        If False, trailing rows that do not fill a whole window are never served.

    Raises
    ------
    ValueError
        If ``seq_len`` is not positive or X and y differ in their number of rows.
    """

    def __init__(self, X, y, seq_len, enforce_shape=False):
        if seq_len <= 0:
            raise ValueError("seq_len must be a positive integer")

        self.X = torch.Tensor(X)
        self.y = torch.Tensor(y)
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        self.seq_len = seq_len
        self.added = 0

        # We need the Tensors to be of a length divisible by the sequence length
        remainder = self.X.shape[0] % seq_len
        if enforce_shape and remainder != 0:
            # Pad with zeros
            self.added = seq_len - remainder
            self.X = torch.cat([self.X, torch.zeros(self.added, *self.X.shape[1:])])
            self.y = torch.cat([self.y, torch.zeros(self.added, *self.y.shape[1:])])

    def __len__(self):
        # Number of complete windows
        return self.y.shape[0] // self.seq_len

    def __getitem__(self, idx):
        n_windows = len(self)
        if idx < 0:
            # Support Python-style negative indexing
            idx += n_windows
        if not 0 <= idx < n_windows:
            # Raising IndexError is also what makes plain iteration terminate
            raise IndexError("PMDataset index out of range")
        start = idx * self.seq_len
        stop = start + self.seq_len
        return self.X[start:stop], self.y[start:stop]

In [96]:
class SimpleLSTM(nn.Module):
    """Stacked LSTM followed by a three-layer dense head.

    The head is applied to the LSTM output at every time step, so the network
    emits ``n_outputs`` values per input step.
    """

    def __init__(self, n_inputs, n_hidden, n_outputs=1, num_lstm_layers=2):
        super().__init__()
        self.n_inputs, self.n_hidden, self.n_outputs = n_inputs, n_hidden, n_outputs

        # Recurrent part; batch_first=True means the batch is the first input dimension
        self.lstm = nn.LSTM(
            input_size=n_inputs,
            hidden_size=n_hidden,
            num_layers=num_lstm_layers,
            batch_first=True,
        )

        # Dense head: dropout, two hidden ReLU layers, linear output layer
        head_layers = [
            nn.Dropout(p=0.2),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_outputs),
        ]
        self.dense = nn.Sequential(*head_layers)

    def forward(self, x):
        # Only the per-step outputs are used; the final LSTM state is discarded
        step_outputs, _final_state = self.lstm(x)
        return self.dense(step_outputs)

In [97]:
def make_lstm_preds(df_train, df_test, batch_size=128, seq_len=96, n_epochs=50):
    """Train one LSTM per city on the log target and predict the test split.

    Iterates over the module-level ``cities`` list. For every city the features
    are standardised with statistics from that city's training rows, a
    ``SimpleLSTM`` is trained on non-overlapping windows of ``seq_len`` rows,
    and predictions for the test rows are mapped back with ``exp``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with a "city" index level and the columns "y", "PM2.5" and
        "logy"; every other column is used as a feature.
    df_test : pandas.DataFrame
        Frame with a "city" index level that contains all feature columns.
    batch_size : int
        Number of training windows per optimisation step.
    seq_len : int
        Number of consecutive rows per window.
    n_epochs : int
        Number of passes over the training windows.

    Returns
    -------
    dict
        Maps each city to a 1-D NumPy array with one prediction per test row of
        that city, in the row order of ``df_test.xs(city, level="city")``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Use mean-square error as a loss
    loss_fn = nn.MSELoss()

    preds = {}

    for city in tqdm(cities, total=len(cities), unit="city", desc="Fitting models"):
        dfx = df_train.xs(city, level="city").copy()
        dfy = df_test.xs(city, level="city").copy()

        X_train = dfx.drop(columns=["y", "PM2.5", "logy"])
        y_train = dfx["logy"].values.reshape(-1, 1)

        features = X_train.columns

        X_test = dfy[features]
        n_test = X_test.shape[0]

        # Standardise with training statistics only
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        dsx = PMDataset(X=X_train, y=y_train, seq_len=seq_len)
        # The test targets are dummies; enforce_shape pads the last window with
        # zeros so that every test row receives a prediction.
        dsy = PMDataset(X=X_test, y=np.zeros(n_test), seq_len=seq_len, enforce_shape=True)
        dlx = DataLoader(dsx, batch_size=batch_size, shuffle=True)
        dly = DataLoader(dsy, batch_size=1, shuffle=False)

        nnet = SimpleLSTM(X_train.shape[1], 128, 1).to(device)

        opt = optim.Adam(nnet.parameters(), lr=1e-4)
        sched = lr_scheduler.OneCycleLR(opt, max_lr=0.01,
                                        steps_per_epoch=len(dlx),
                                        epochs=n_epochs)

        for epoch in range(n_epochs):
            nnet.train()

            for X, y in dlx:
                # Pass the batch to the GPU
                X = X.to(device)
                y = y.to(device)

                # Reset the gradients
                opt.zero_grad()

                # Compute the forward pass
                y_pred = nnet(X)

                # Compute the loss (conventional argument order: input, target)
                loss = loss_fn(y_pred, y)

                # Compute the gradients
                loss.backward()

                # Update the parameters; the scheduler is stepped once per batch
                opt.step()
                sched.step()

        # Make predictions
        nnet.eval()
        y_pred = []
        with torch.no_grad():
            for X, _ in dly:
                X = X.to(device)
                y_pred.append(nnet(X).cpu().numpy())

        # Drop the predictions for the zero-padded rows. Slicing to the true
        # length is used instead of [:-added], because [:-0] would return an
        # empty array whenever no padding was needed.
        y_pred = np.concatenate(y_pred).reshape(-1)[:n_test]

        preds[city] = np.exp(y_pred)

    return preds

In [98]:
# Train the per-city LSTMs, then reshape the {city: predictions} mapping into a
# long frame indexed by (date, city) with a single "y" column.
lstm_by_city = make_lstm_preds(df, df_test)
test_dates = df_test.index.get_level_values("date").unique()
preds = (
    pd.DataFrame(lstm_by_city, index=test_dates)
    .melt(ignore_index=False)
    .rename(columns={"variable": "city", "value": "y"})
)
preds["date"] = preds.index
preds = preds.set_index(["date", "city"]).sort_index(level=[0, 1])

Fitting models: 100%|██████████| 12/12 [00:14<00:00,  1.20s/city]


In [99]:
# Persist the LSTM prediction
preds.to_pickle("00_lstm.pkl")