In [None]:
import pandas as pd
import gc
import numpy as np
import datetime
import math
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

from sklearn.preprocessing import LabelEncoder

import platform
# Pick a font family able to render Korean (Hangul) text in charts.
your_os = platform.system()
if your_os == 'Windows':
    # Resolve the family name from the Malgun Gothic font file.
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()

    rc('font', family=font_name)
elif your_os in ('Linux', 'Darwin'):
    rc('font', family={'Linux': 'NanumGothic', 'Darwin': 'AppleGothic'}[your_os])
# Use the ASCII hyphen for negative tick labels instead of the Unicode minus sign.
rc('axes', unicode_minus=False)


import torch
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더

gc.collect()



0

In [None]:
df_tr = pd.read_csv('train_04.csv')
test = pd.read_csv('test_04.csv')
# Keep only the test rows that have an observed PM2.5 value and renumber them from 0.
test = test[test['PM2.5'].notna()].reset_index(drop=True)

sub = pd.read_csv('answer_sample.csv')

In [None]:
df_tr.shape, test.shape

((595680, 20), (52224, 20))

## Feature Extraction

In [None]:
df_tr

Unnamed: 0,연도,일시,측정소,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),hour,day,month,week,day_hour_mean,hour_mean,hour_std,sin_time,cos_time,THI,CDH,PM2.5
0,0,01-01 00:00,0,0.173776,0.201944,0.023018,0.0,0.828,0,1,1,52,0.094667,0.092518,0.072062,0.000000,1.000000,45.681459,-25.826224,0.056
1,0,01-01 01:00,0,0.176935,0.168611,0.030691,0.0,0.831,1,1,1,52,0.097167,0.093540,0.073062,0.258819,0.965926,45.683797,-51.649289,0.060
2,0,01-01 02:00,0,0.180095,0.087222,0.033248,0.0,0.784,2,1,1,52,0.098750,0.093592,0.073949,0.500000,0.866025,45.741984,-77.469194,0.068
3,0,01-01 03:00,0,0.178515,0.087222,0.025575,0.0,0.745,3,1,1,52,0.097500,0.093602,0.073547,0.707107,0.707107,45.782737,-103.290679,0.060
4,0,01-01 04:00,0,0.164297,0.113889,0.020460,0.0,0.750,4,1,1,52,0.097417,0.093316,0.073317,0.866025,0.500000,45.751553,-129.126382,0.068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595675,3,12-31 19:00,16,0.273302,0.832222,0.086957,0.0,0.671,19,31,12,53,0.096429,0.104161,0.084752,-0.965926,0.258819,46.036158,-308.241706,0.060
595676,3,12-31 20:00,16,0.271722,0.831667,0.043478,0.0,0.692,20,31,12,53,0.099571,0.107241,0.084810,-0.866025,0.500000,46.009804,-308.295419,0.052
595677,3,12-31 21:00,16,0.268562,0.832500,0.066496,0.0,0.706,21,31,12,53,0.094000,0.107902,0.083740,-0.707107,0.707107,45.988449,-308.360190,0.044
595678,3,12-31 22:00,16,0.262243,0.866944,0.043478,0.0,0.725,22,31,12,53,0.100000,0.107062,0.084508,-0.500000,0.866025,45.955817,-308.447077,0.052


In [None]:
df_tr.head()

Unnamed: 0,연도,일시,측정소,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),hour,day,month,week,day_hour_mean,hour_mean,hour_std,sin_time,cos_time,THI,CDH,PM2.5
0,0,01-01 00:00,0,0.173776,0.201944,0.023018,0.0,0.828,0,1,1,52,0.094667,0.092518,0.072062,0.0,1.0,45.681459,-25.826224,0.056
1,0,01-01 01:00,0,0.176935,0.168611,0.030691,0.0,0.831,1,1,1,52,0.097167,0.09354,0.073062,0.258819,0.965926,45.683797,-51.649289,0.06
2,0,01-01 02:00,0,0.180095,0.087222,0.033248,0.0,0.784,2,1,1,52,0.09875,0.093592,0.073949,0.5,0.866025,45.741984,-77.469194,0.068
3,0,01-01 03:00,0,0.178515,0.087222,0.025575,0.0,0.745,3,1,1,52,0.0975,0.093602,0.073547,0.707107,0.707107,45.782737,-103.290679,0.06
4,0,01-01 04:00,0,0.164297,0.113889,0.02046,0.0,0.75,4,1,1,52,0.097417,0.093316,0.073317,0.866025,0.5,45.751553,-129.126382,0.068


## Data Processing

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
min_max_scaler = MinMaxScaler()
# Fit on the training years only (연도 0-2, the same split used for `train` below) so that
# statistics of the validation year (연도 3) do not leak into the scaling.
# NOTE: this cell overwrites df_tr["PM2.5"] / test["PM2.5"] in place, so re-running it
# without reloading the CSVs scales the already-scaled values a second time.
_train_years = df_tr['연도'].isin([0, 1, 2])
min_max_scaler.fit(df_tr.loc[_train_years, "PM2.5"].to_numpy().reshape(-1,1))
df_tr["PM2.5"] = min_max_scaler.transform(df_tr["PM2.5"].to_numpy().reshape(-1,1)).ravel()
test["PM2.5"] = min_max_scaler.transform(test["PM2.5"].to_numpy().reshape(-1,1)).ravel()

## DataLoader

* Split train/val

In [None]:
# Years 0-2 form the training split; year 3 is held out for validation.
train = df_tr.loc[df_tr['연도'].isin((0, 1, 2))]
data_train = train['PM2.5'].to_numpy()

val = df_tr.loc[df_tr['연도'].eq(3)]
data_val = val['PM2.5'].to_numpy()

In [None]:
# Row labels of df_tr belonging to the training years (0-2) and the validation year (3).
train_index = df_tr.index[df_tr['연도'].isin([0, 1, 2])].tolist()
val_index = df_tr.index[df_tr['연도'].isin([3])].tolist()

## Dataset

In [None]:
from torch.utils.data import DataLoader, Dataset
class windowDataset(Dataset):
    """Sliding-window dataset over a 1-D series.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` holds ``input_window``
    consecutive values starting at ``stride * i`` and ``y`` holds the
    ``output_window`` values that immediately follow. Both are float64 arrays
    with a trailing feature axis of size 1, i.e. shapes ``(input_window, 1)``
    and ``(output_window, 1)``.

    Args:
        y: 1-D array-like of numeric values.
        input_window: number of steps fed to the model.
        output_window: number of steps to predict.
        stride: offset between the starts of consecutive samples.

    Raises:
        ValueError: if ``y`` is not 1-D, ``stride < 1``, or ``y`` is shorter
            than ``input_window + output_window``.
    """
    def __init__(self, y, input_window=48, output_window=72, stride=1):
        series = np.asarray(y, dtype=np.float64)
        if series.ndim != 1:
            raise ValueError("windowDataset expects a 1-D series, got shape {}".format(series.shape))
        if stride < 1:
            raise ValueError("stride must be >= 1, got {}".format(stride))

        # Total number of points consumed by one (input, output) sample.
        total = input_window + output_window
        if series.shape[0] < total:
            raise ValueError(
                "series of length {} is shorter than input_window + output_window = {}".format(
                    series.shape[0], total))

        # All windows of length `total`, one every `stride` steps. This yields
        # (L - total) // stride + 1 rows without a Python-level loop.
        windows = np.lib.stride_tricks.sliding_window_view(series, total)[::stride]

        # Copy out of the read-only strided view and add the feature axis.
        self.x = windows[:, :input_window, None].copy()
        self.y = windows[:, input_window:, None].copy()

        self.len = len(self.x)
    def __getitem__(self, i):
        return self.x[i], self.y[i]
    def __len__(self):
        return self.len

In [None]:
iw = 24*2
ow = 24*3

# `train` stacks the series of every station (측정소) one after another, so windowing the
# concatenated array would create samples whose input comes from one station and whose
# target comes from the next. Build the windows station by station and concatenate the
# resulting datasets instead.
train_dataset = torch.utils.data.ConcatDataset([
    windowDataset(station_rows['PM2.5'].to_numpy(), input_window=iw, output_window=ow, stride=1)
    for _, station_rows in train.groupby('측정소', sort=False)
])
train_loader = DataLoader(train_dataset, batch_size=128)

## Modeling

In [None]:
class TFModel(nn.Module):
    """Transformer-encoder forecaster mapping `iw` past values to `ow` future values.

    Pipeline (see `forward`): a per-step MLP lifts each scalar to `d_model`
    features, sinusoidal positional encoding is added, a stack of `nlayers`
    transformer encoder layers processes the sequence, a per-step MLP maps each
    step back to one scalar, and a final MLP over the time axis turns the `iw`
    values into `ow` outputs.

    Args:
        iw: input sequence length.
        ow: output (forecast) length.
        d_model: transformer feature width.
        nhead: number of attention heads.
        nlayers: number of encoder layers.
        dropout: dropout used by the encoder layers and the positional encoding.
    """
    def __init__(self,iw, ow, d_model, nhead, nlayers, dropout=0.5):
        super(TFModel, self).__init__()
        # Kept as an attribute so the parameter names in state_dict() stay unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=nlayers)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Per-step embedding: scalar -> d_model.
        self.encoder = nn.Sequential(
            nn.Linear(1, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, d_model)
        )

        # Per-step read-out: d_model -> scalar.
        self.linear =  nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, 1)
        )

        # Projection along the time axis: iw steps -> ow steps.
        self.linear2 = nn.Sequential(
            nn.Linear(iw, (iw+ow)//2),
            nn.ReLU(),
            nn.Linear((iw+ow)//2, ow)
        )

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0 on/below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, src, srcmask):
        """Forecast `ow` values from a batch of input windows.

        Args:
            src: tensor of shape (batch, iw, 1).
            srcmask: (iw, iw) attention mask, e.g. from
                `generate_square_subsequent_mask`.

        Returns:
            Tensor of shape (batch, ow).
        """
        src = self.encoder(src)                                   # (batch, iw, d_model)
        # The positional encoder indexes its table by dimension 0, so it must see the
        # sequence-first layout; otherwise positions would be assigned per batch element.
        src = self.pos_encoder(src.transpose(0,1))                # (iw, batch, d_model)
        output = self.transformer_encoder(src, srcmask).transpose(0,1)  # (batch, iw, d_model)
        output = self.linear(output)[:,:,0]                       # (batch, iw)
        output = self.linear2(output)                             # (batch, ow)
        return output

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position codes to a tensor, then apply dropout.

    The table `pe` has shape (max_len, 1, d_model); `forward` slices it with
    `x.size(0)`, so the first dimension of `x` is treated as the position axis.

    Args:
        d_model: feature width of the tensors passed to `forward`.
        dropout: dropout probability applied after the addition.
        max_len: number of positions stored in the table.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # angles[p, k] = p * 10000^(-2k / d_model)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        # Even feature columns carry the sine, odd columns the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Stored as a buffer: saved with the module and moved by .to(), but not trained.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        n_positions = x.size(0)
        return self.dropout(x + self.pe[:n_positions, :])

def gen_attention_mask(x):
    """Return a boolean tensor that is True wherever `x` equals 0."""
    return x.eq(0)

## Training

In [None]:
# Fall back to the CPU when no CUDA device is present instead of failing at `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 0.01
# TFModel(iw=48, ow=72, d_model=256, nhead=4, nlayers=2, dropout=0.1)
model = TFModel(24*2, 24*3, 256, 4, 2, 0.1).to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Run Python's garbage collector, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Report whether the model's parameters live on a CUDA device.
model_on_cuda = next(model.parameters()).is_cuda
print("모델이 CUDA에 올라갔습니다." if model_on_cuda else "모델이 CUDA에 올라가지 않았습니다.")

모델이 CUDA에 올라갔습니다.


In [None]:
epoch = 30
model.train()
progress = tqdm(range(epoch))
for i in progress:
    # Plain Python float: accumulating `loss.item()` instead of the loss tensor avoids
    # keeping every batch's autograd history alive for the whole epoch.
    batchloss = 0.0
    for (inputs, outputs) in train_loader:
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(inputs.shape[1]).to(device)
        result = model(inputs.float().to(device),  src_mask)
        # Targets carry a trailing feature axis of size 1; drop it to match `result`.
        loss = criterion(result, outputs[:,:,0].float().to(device))
        loss.backward()
        optimizer.step()
        batchloss += loss.item()
    # Mean batch loss of the epoch; max(..., 1) guards against an empty loader.
    progress.set_description("loss: {:0.6f}".format(batchloss / max(len(train_loader), 1)))

loss: 0.047493: 100% 30/30 [27:37<00:00, 55.24s/it]


In [None]:
# Distinct timestamp strings (일시) that appear in the sample submission.
sub_list = sub['일시'].unique().tolist()

In [None]:
# Validation rows whose timestamp also occurs in the sample submission.
in_submission = val['일시'].isin(sub['일시'].unique().tolist())
val_x = val.loc[in_submission]

In [None]:
# Validation rows whose timestamp is NOT in the sample submission, minus the last 1080 rows.
not_in_submission = ~val['일시'].isin(sub['일시'].unique().tolist())
val_y = val.loc[not_in_submission].iloc[:-1080]

In [None]:
val_x['PM2.5'].to_numpy()

array([0.224, 0.2  , 0.208, ..., 0.196, 0.192, 0.18 ])

In [None]:
val_y['PM2.5']

26280     0.136
26281     0.144
26282     0.152
26283     0.144
26284     0.140
          ...  
594523    0.076
594524    0.108
594525    0.116
594526    0.120
594527    0.132
Name: PM2.5, Length: 69504, dtype: float64

In [None]:
# One 48-step input window (positions 48..95 of val_y) as a column vector.
a = val_y['PM2.5'].iloc[48:96].to_numpy().reshape(-1,1)

In [None]:
len(a)

48

In [None]:
def evaluate(window=None):
    """Run the global `model` on a single input window and return its forecast.

    Args:
        window: array-like of input values; it is reshaped to (1, -1, 1).
            Defaults to the notebook-level variable `a`.

    Returns:
        NumPy array of shape (1, output_length) holding the model output.

    Side effects:
        Leaves `model` in eval mode.
    """
    if window is None:
        window = a
    src = torch.as_tensor(window, dtype=torch.float32).reshape(1,-1,1).to(device)
    model.eval()

    src_mask = model.generate_square_subsequent_mask(src.shape[1]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        predictions = model(src, src_mask)
    return predictions.cpu().numpy()

In [None]:
# Forecast for the sample window, mapped back from the MinMaxScaler space.
scaled_forecast = evaluate()
result = min_max_scaler.inverse_transform(scaled_forecast)[0]
# The whole PM2.5 column of df_tr, also mapped back from the scaler space.
real = min_max_scaler.inverse_transform(df_tr[["PM2.5"]].to_numpy())[:,0]

In [None]:
result

array([0.11196984, 0.1122987 , 0.11194911, 0.11231173, 0.11211448,
       0.11208609, 0.11221403, 0.11224626, 0.11257584, 0.1132174 ,
       0.11368193, 0.11349264, 0.11271667, 0.11408363, 0.11280753,
       0.11285288, 0.11315247, 0.11300595, 0.11318571, 0.11332428,
       0.11343425, 0.11320131, 0.11401094, 0.11390433, 0.11383286,
       0.11402749, 0.11357152, 0.11394957, 0.11387671, 0.11476504,
       0.1142515 , 0.113648  , 0.1148149 , 0.11424537, 0.11432274,
       0.11495753, 0.11438397, 0.11542334, 0.11508318, 0.11486713,
       0.11535779, 0.11467499, 0.11472994, 0.11481643, 0.11496844,
       0.11493858, 0.11453047, 0.11498888, 0.115853  , 0.11567188,
       0.11579599, 0.11607789, 0.11540916, 0.11551205, 0.11584012,
       0.11562633, 0.11585685, 0.11590303, 0.11525034, 0.11552141,
       0.11588207, 0.1157468 , 0.11538744, 0.11583242, 0.11607194,
       0.11594656, 0.1159285 , 0.11538888, 0.11590203, 0.11527197,
       0.11549899, 0.11577772], dtype=float32)

In [None]:
def MAE(y_pred, y_true):
    """Mean absolute error between predictions and targets.

    Args:
        y_pred: array-like of predicted values.
        y_true: array-like of reference values, broadcastable against `y_pred`.

    Returns:
        The mean of ``|y_true - y_pred|`` over all elements.
    """
    # Convert to float arrays so plain lists work and unsigned integer inputs cannot
    # wrap around on subtraction.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

In [None]:
result

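* Sliding Window: per-station, non-overlapping windows of `seq_length` input days followed by `output_dim` target days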

In [None]:
# Input history length in days (build_train_dataset multiplies it by 24).
seq_length = 2
# Forecast horizon in days (likewise multiplied by 24).
output_dim = 3
# Mini-batch size used by the DataLoaders.
batch_size = 32

In [None]:
def build_train_dataset(time_series, seq_length, output_dim):
    """Cut each station's series into non-overlapping (input, target) windows.

    For every station (column `측정소`) the values of the LAST column of
    `time_series` are split into consecutive chunks of
    ``(seq_length + output_dim) * 24`` rows; the first ``seq_length * 24`` rows
    of a chunk are the input and the remaining ``output_dim * 24`` rows the
    target. Trailing rows that do not fill a whole chunk are dropped.

    Args:
        time_series: DataFrame with a `측정소` column and the target as its last column.
        seq_length: input length in days.
        output_dim: target length in days.

    Returns:
        Tuple ``(X, Y)`` of float arrays with shapes
        ``(n, seq_length * 24, 1)`` and ``(n, output_dim * 24, 1)``.
    """
    in_len = seq_length * 24
    out_len = output_dim * 24
    chunk = in_len + out_len

    dataX = []
    dataY = []

    for r in tqdm(time_series.측정소.unique().tolist()):
        # Extract only the target column as floats. Calling `.values` on the whole frame
        # yields an object array when the frame also holds non-numeric columns, and such
        # arrays cannot be converted to torch tensors.
        target = (time_series.loc[time_series.측정소 == r]
                  .iloc[:, -1]
                  .to_numpy(dtype=np.float64)
                  .reshape(-1, 1))
        for k in range(target.shape[0] // chunk):
            i = k * chunk
            dataX.append(target[i:i+in_len])
            dataY.append(target[i+in_len:i+chunk])

    if not dataX:
        # No station had a full chunk: return empty arrays that still have the right rank.
        return np.empty((0, in_len, 1)), np.empty((0, out_len, 1))
    return np.stack(dataX), np.stack(dataY)


def build_test_dataset(time_series, window=48):
    """Cut each station's series into consecutive, non-overlapping input windows.

    For every station (column `측정소`) the values of the LAST column of
    `time_series` are split into chunks of `window` rows; trailing rows that do
    not fill a whole chunk are dropped.

    Args:
        time_series: DataFrame with a `측정소` column and the target as its last column.
        window: number of rows per input window (default 48).

    Returns:
        Float array of shape ``(n, window, 1)``.
    """
    dataX = []
    for r in tqdm(time_series.측정소.unique().tolist()):
        # Extract only the target column as floats. Calling `.values` on the whole frame
        # yields an object array when the frame also holds non-numeric columns, and such
        # arrays cannot be converted to torch tensors.
        target = (time_series.loc[time_series.측정소 == r]
                  .iloc[:, -1]
                  .to_numpy(dtype=np.float64)
                  .reshape(-1, 1))
        for i in range(0, target.shape[0] // window):
            dataX.append(target[i*window:(i*window)+window])

    if not dataX:
        # No station had a full window: return an empty array that still has the right rank.
        return np.empty((0, window, 1))
    return np.stack(dataX)

In [None]:
# Window the training years, the validation year and the test inputs.
trainX, trainY = build_train_dataset(train, seq_length, output_dim)
valX, valY = build_train_dataset(val, seq_length, output_dim)
testX = build_test_dataset(test)

In [None]:
trainX.shape, trainY.shape, valX.shape, valY.shape, testX.shape

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
## Train / validation: wrap the windowed arrays as float32 tensors placed on `device`.
train_dataset = TensorDataset(
    torch.tensor(trainX, dtype=torch.float32, device=device),
    torch.tensor(trainY, dtype=torch.float32, device=device),
)

val_dataset = TensorDataset(
    torch.tensor(valX, dtype=torch.float32, device=device),
    torch.tensor(valY, dtype=torch.float32, device=device),
)

## Test: float32 tensor kept on the CPU.
testX_tensor = torch.tensor(testX, dtype=torch.float32)

In [None]:
train_dataloader = DataLoader(train_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        drop_last=True)

# Keep the final partial batch: dropping it would silently exclude validation samples
# from the reported validation error.
val_dataloader = DataLoader(val_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        drop_last=False)

In [None]:
class TFModel(nn.Module):
    """Transformer-encoder forecaster mapping `iw` past values to `ow` future values.

    Pipeline (see `forward`): a per-step MLP lifts each scalar to `d_model`
    features, sinusoidal positional encoding is added, a stack of `nlayers`
    transformer encoder layers processes the sequence, a per-step MLP maps each
    step back to one scalar, and a final MLP over the time axis turns the `iw`
    values into `ow` outputs.

    Args:
        iw: input sequence length.
        ow: output (forecast) length.
        d_model: transformer feature width.
        nhead: number of attention heads.
        nlayers: number of encoder layers.
        dropout: dropout used by the encoder layers and the positional encoding.
    """
    def __init__(self,iw, ow, d_model, nhead, nlayers, dropout=0.5):
        super(TFModel, self).__init__()
        # Kept as an attribute so the parameter names in state_dict() stay unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=nlayers)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Per-step embedding: scalar -> d_model.
        self.encoder = nn.Sequential(
            nn.Linear(1, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, d_model)
        )

        # Per-step read-out: d_model -> scalar.
        self.linear =  nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, 1)
        )

        # Projection along the time axis: iw steps -> ow steps.
        self.linear2 = nn.Sequential(
            nn.Linear(iw, (iw+ow)//2),
            nn.ReLU(),
            nn.Linear((iw+ow)//2, ow)
        )

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0 on/below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, src, srcmask):
        """Forecast `ow` values from a batch of input windows.

        Args:
            src: tensor of shape (batch, iw, 1).
            srcmask: (iw, iw) attention mask, e.g. from
                `generate_square_subsequent_mask`.

        Returns:
            Tensor of shape (batch, ow).
        """
        src = self.encoder(src)                                   # (batch, iw, d_model)
        # The positional encoder indexes its table by dimension 0, so it must see the
        # sequence-first layout; otherwise positions would be assigned per batch element.
        src = self.pos_encoder(src.transpose(0,1))                # (iw, batch, d_model)
        output = self.transformer_encoder(src, srcmask).transpose(0,1)  # (batch, iw, d_model)
        output = self.linear(output)[:,:,0]                       # (batch, iw)
        output = self.linear2(output)                             # (batch, ow)

        return output

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position codes to a tensor, then apply dropout.

    The table `pe` has shape (max_len, 1, d_model); `forward` slices it with
    `x.size(0)`, so the first dimension of `x` is treated as the position axis.

    Args:
        d_model: feature width of the tensors passed to `forward`.
        dropout: dropout probability applied after the addition.
        max_len: number of positions stored in the table.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # angles[p, k] = p * 10000^(-2k / d_model)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        # Even feature columns carry the sine, odd columns the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Stored as a buffer: saved with the module and moved by .to(), but not trained.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        n_positions = x.size(0)
        return self.dropout(x + self.pe[:n_positions, :])

def gen_attention_mask(x):
    """Return a boolean tensor that is True wherever `x` equals 0."""
    return x.eq(0)

In [None]:
# Fall back to the CPU when no CUDA device is present instead of failing at `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# TFModel(iw=48, ow=72, d_model=512, nhead=8, nlayers=4, dropout=0.1)
model = TFModel(24*2, 24*3, 512, 8, 4, 0.1).to(device)

criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

epochs = 100

In [None]:
# Run Python's garbage collector, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
result.shape

In [None]:
model.train()
train_hist = np.zeros(epochs)
for epoch in range(epochs):
    # Sum of per-batch losses as a plain float. Accumulating `loss.item()` keeps no
    # autograd history and lets the value be stored in the NumPy history array
    # (a CUDA tensor that requires grad cannot be assigned into it).
    epoch_loss = 0.0
    for (inputs, outputs) in train_dataloader:
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(inputs.shape[1]).to(device)
        result = model(inputs.float().to(device),  src_mask)
        # Targets carry a trailing feature axis of size 1; drop it to match `result`.
        loss = criterion(result, outputs[:,:,0].float().to(device))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # L1Loss already averages within a batch, so the epoch average is the mean over
    # batches (not a division by batch_size). max(..., 1) guards an empty loader.
    avg_cost = epoch_loss / max(len(train_dataloader), 1)
    train_hist[epoch] = avg_cost
    if epoch % 5 == 0:
        print('Epoch:', '%04d' % (epoch), 'train loss :', '{:.4f}'.format(avg_cost))

In [None]:
# Training loss per epoch
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(train_hist, label="Training loss")
ax.set_title('loss')
ax.legend()
plt.show()

In [None]:
# Save the model weights.
# The timestamp uses '-' between hour and minute: ':' is not allowed in Windows file names.
# `now` keeps its 'model_' prefix because later cells strip it with now[6:].
now = datetime.datetime.now().strftime('model_%m-%d_%H-%M')


PATH = './' + now + '.pth'
torch.save(model.state_dict(), PATH)

### Validation

In [None]:
def evaluate_model(model, val_loader, criterion):
    """Compute and print the sample-weighted mean of `criterion` over a loader.

    Args:
        model: module exposing `generate_square_subsequent_mask(sz)` and
            callable as `model(inputs, mask)`.
        val_loader: iterable of `(inputs, outputs)` batches; `outputs` carries a
            trailing axis of size 1 that is dropped before the comparison.
        criterion: loss returning the batch mean (e.g. `nn.L1Loss()`).

    Returns:
        The loss averaged over the samples actually iterated, as a float.

    Raises:
        ValueError: if the loader yields no samples.

    Side effects:
        Switches `model` to eval mode during the pass and to train mode afterwards.
    """
    # Run on whatever device the model lives on rather than relying on a global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_loss = 0.0
    n_seen = 0
    with torch.no_grad():
        for (inputs, outputs) in val_loader:
            src_mask = model.generate_square_subsequent_mask(inputs.shape[1]).to(device)
            y_pred = model(inputs.float().to(device),  src_mask)
            loss = criterion(y_pred, outputs[:,:,0].float().to(device))

            # Weight the batch mean by the batch size to obtain a per-sample average.
            n_batch = inputs.shape[0]
            total_loss += loss.item() * n_batch
            n_seen += n_batch

    model.train()

    if n_seen == 0:
        raise ValueError("val_loader yielded no samples")
    # Divide by the samples actually seen: with drop_last=True the loader skips the
    # final partial batch, so len(val_loader.dataset) would understate the error.
    val_loss = total_loss / n_seen

    print('Validation MAE: {:.4f}'.format(val_loss))
    return val_loss

# Evaluate model performance on the validation loader with an L1 (MAE) criterion.
criterion = torch.nn.L1Loss().to(device)
evaluate_model(model, val_dataloader, criterion)

### Prediction

In [None]:
# Load pre-trained model
model = TFModel(24*2, 24*3, 512, 8, 4, 0.1).to(device)
# map_location makes a checkpoint saved on GPU loadable on a CPU-only machine. Loading is
# strict (the default) so a missing or unexpected parameter raises instead of silently
# leaving randomly initialised weights in place.
model.load_state_dict(torch.load(PATH, map_location=device))

model.eval()

In [None]:
# Prediction
with torch.no_grad():
    pred = []
    # Every test window has the same length, so the mask is built once.
    src_mask = model.generate_square_subsequent_mask(testX_tensor.shape[1]).to(device)
    for pr in range(len(testX_tensor)):
        result = model(torch.unsqueeze(testX_tensor[pr].float(),0).to(device),  src_mask)
        # Store the forecast on the CPU so GPU memory is not held by the result list.
        pred.append(result.cpu())

In [None]:
# One flat 72-value array per test window, then the same data as plain Python lists.
a = [np.array(window_pred.cpu()).reshape(72) for window_pred in pred]
b = [window_values.tolist() for window_values in a]

### Submission

In [None]:
# Flatten the per-window forecasts in order (row-major, same order as concatenating the
# lists in `b`) and map them back from the MinMaxScaler space the model was trained in
# to the scale of the input files.
pred_scaled = np.asarray(b, dtype=float).reshape(-1, 1)
if len(pred_scaled) != len(sub):
    raise ValueError("got {} predictions for {} submission rows".format(len(pred_scaled), len(sub)))
sub['PM2.5'] = min_max_scaler.inverse_transform(pred_scaled)[:, 0]

In [None]:
# Name the file after the timestamp part of `now` (everything after the 'model_' prefix).
submission_path = 'baseline_' + now[6:] + '.csv'
sub.to_csv(submission_path, index=False)

In [None]:
# Re-read the written submission and count missing values per column.
written = pd.read_csv('baseline_' + now[6:] + '.csv')
written.isna().sum()