In [1]:
import pandas as pd
import gc
import numpy as np
import datetime
import math
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

from sklearn.preprocessing import LabelEncoder

import platform
# Chart setup: pick a font family per operating system so Korean (Hangul) labels render.
your_os = platform.system()
if your_os == 'Windows':
    # Read the family name out of the malgun.ttf font file.
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os in ('Linux', 'Darwin'):
    rc('font', family='NanumGothic' if your_os == 'Linux' else 'AppleGothic')
# Render minus signs with the ASCII hyphen rather than the Unicode minus glyph.
rc('axes', unicode_minus=False)


import torch
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더

# Force a garbage-collection pass; the value displayed for this cell is gc.collect()'s return value.
gc.collect()



0

In [8]:
# Training data and the sample answer file (used later as the submission frame).
df_tr = pd.read_csv('train_04.csv')
sub = pd.read_csv('answer_sample.csv')

# Keep only the test rows that have a PM2.5 value, then renumber the rows from 0.
test = pd.read_csv('test_04.csv')
has_pm25 = test['PM2.5'].notna()
test = test.loc[has_pm25].reset_index(drop=True)

In [9]:
# Check the (rows, columns) shape of the training frame and the filtered test frame.
df_tr.shape, test.shape

((595680, 20), (52224, 20))

## DataLoader

* Split train/val

In [10]:
# Split on the '연도' (year) column: values 0, 1 and 2 go to training, value 3 to validation.
year_idx = df_tr['연도']
train = df_tr.loc[year_idx.isin([0, 1, 2])]
val = df_tr.loc[year_idx.eq(3)]

In [14]:
# Drop the '일시' column, renumber the train/val rows from 0, and cast every column to float.
# errors='ignore' keeps this cell re-runnable: on a second execution the column is already
# gone, and a plain drop() would raise KeyError.
train = train.drop(columns='일시', errors='ignore').reset_index(drop=True).astype(float)
val = val.drop(columns='일시', errors='ignore').reset_index(drop=True).astype(float)
test = test.drop(columns='일시', errors='ignore').astype(float)

train.shape, val.shape, test.shape

((446760, 19), (148920, 19), (52224, 19))

* Sliding Window

In [None]:
# Input window length; build_train_dataset multiplies it by 24 to get the number of rows.
seq_length = 2
# Target window length; build_train_dataset multiplies it by 24 to get the number of rows.
output_dim = 3
# Mini-batch size passed to the DataLoaders.
batch_size = 32

In [None]:
def build_train_dataset(time_series, seq_length, output_dim, steps_per_day=24):
    """Cut each station's rows into non-overlapping (input, target) windows.

    For every distinct value of the '측정소' (station) column, the station's rows
    are walked in steps of ``(seq_length + output_dim) * steps_per_day``. Within
    each step the first ``seq_length * steps_per_day`` rows become the input and
    the following ``output_dim * steps_per_day`` rows become the target. Only the
    last column of the frame is kept. Rows left over at the end of a station that
    do not fill a whole window are discarded.

    Args:
        time_series: DataFrame with a '측정소' column; the series to window must
            be its last column.
        seq_length: input length, in units of ``steps_per_day`` rows.
        output_dim: target length, in units of ``steps_per_day`` rows.
        steps_per_day: rows per unit (default 24, the previously hard-coded value).

    Returns:
        ``(X, Y)`` NumPy arrays of shape ``(n_windows, seq_length * steps_per_day, 1)``
        and ``(n_windows, output_dim * steps_per_day, 1)``.
    """
    in_len = seq_length * steps_per_day
    out_len = output_dim * steps_per_day
    window = in_len + out_len

    dataX = []
    dataY = []

    for r in tqdm(time_series.측정소.unique().tolist()):
        a = time_series.loc[time_series.측정소 == r].values
        # Only complete windows are used; the tail of the station is dropped.
        for w in range(a.shape[0] // window):
            start = w * window
            # [-1] (a list) keeps the trailing axis, so each slice is (length, 1).
            dataX.append(a[start:start + in_len, [-1]])
            dataY.append(a[start + in_len:start + window, [-1]])

    return np.array(dataX), np.array(dataY)


def build_test_dataset(time_series, seq_length=2, steps_per_day=24):
    """Cut each station's rows into consecutive, non-overlapping input windows.

    For every distinct value of the '측정소' (station) column, the station's rows
    are split into chunks of ``seq_length * steps_per_day`` rows (48 with the
    defaults, the previously hard-coded value). Only the last column of the frame
    is kept. Rows left over at the end of a station are discarded.

    Args:
        time_series: DataFrame with a '측정소' column; the series to window must
            be its last column.
        seq_length: window length, in units of ``steps_per_day`` rows.
        steps_per_day: rows per unit.

    Returns:
        NumPy array of shape ``(n_windows, seq_length * steps_per_day, 1)``.
    """
    in_len = seq_length * steps_per_day

    dataX = []
    for r in tqdm(time_series.측정소.unique().tolist()):
        a = time_series.loc[time_series.측정소 == r].values
        for i in range(a.shape[0] // in_len):
            # [-1] (a list) keeps the trailing axis, so each slice is (in_len, 1).
            dataX.append(a[i * in_len:(i + 1) * in_len, [-1]])

    return np.array(dataX)

In [None]:
# Window the train/validation frames into (input, target) arrays and the test frame into input arrays.
trainX, trainY = build_train_dataset(train, seq_length, output_dim)
valX, valY = build_train_dataset(val, seq_length, output_dim)
testX = build_test_dataset(test)

In [None]:
# Inspect the shapes of the windowed arrays.
trainX.shape, trainY.shape, valX.shape, valY.shape, testX.shape

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
def _to_device_tensor(array):
    """Copy a NumPy array into a float32 tensor placed on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)


## Train / validation: (input, target) pairs, already moved to `device`
train_dataset = TensorDataset(_to_device_tensor(trainX), _to_device_tensor(trainY))
val_dataset = TensorDataset(_to_device_tensor(valX), _to_device_tensor(valY))

## Test: inputs only, kept as a CPU float tensor
testX_tensor = torch.FloatTensor(testX)

In [None]:
# Training batches: fixed order, and the final incomplete batch is dropped.
train_dataloader = DataLoader(train_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        drop_last=True)

# Validation batches: keep the final partial batch (drop_last=False) so that every
# validation window is scored. Dropping it would skip those windows while the
# validation loss is still averaged over the full dataset.
val_dataloader = DataLoader(val_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        drop_last=False)

In [None]:
class TFModel(nn.Module):
    """Transformer-encoder forecaster mapping `iw` input steps to `ow` output steps.

    Pipeline in `forward`:
      1. `encoder`: per-step MLP lifting the single input feature to `d_model`.
      2. `pos_encoder`: positional encoding, applied on a sequence-first tensor.
      3. `transformer_encoder`: `nlayers` Transformer encoder layers.
      4. `linear`: per-step MLP projecting `d_model` back to one value.
      5. `linear2`: MLP across the time axis mapping `iw` values to `ow` values.

    Args:
        iw: number of input time steps (must equal the sequence length of `src`).
        ow: number of output time steps.
        d_model: Transformer embedding size.
        nhead: number of attention heads.
        nlayers: number of encoder layers.
        dropout: dropout rate for the encoder layers and the positional encoding.
    """

    def __init__(self,iw, ow, d_model, nhead, nlayers, dropout=0.5):
        super(TFModel, self).__init__()
        # NOTE(review): nn.TransformerEncoder works on its own copies of this layer, so the
        # parameters registered under `encoder_layer` are not used in forward(). The attribute
        # is kept so state_dict keys stay compatible with existing checkpoints.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=nlayers)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Per-step embedding: 1 feature -> d_model.
        self.encoder = nn.Sequential(
            nn.Linear(1, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, d_model)
        )

        # Per-step read-out: d_model -> 1 value.
        self.linear =  nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.ReLU(),
            nn.Linear(d_model//2, 1)
        )

        # Time-axis projection: iw input steps -> ow output steps.
        self.linear2 = nn.Sequential(
            nn.Linear(iw, (iw+ow)//2),
            nn.ReLU(),
            nn.Linear((iw+ow)//2, ow)
        )

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it.

        The mask is created on the CPU; callers move it to the model's device.
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, srcmask):
        """Forecast `ow` steps from a batch of `iw`-step inputs.

        Args:
            src: tensor of shape (batch, iw, 1).
            srcmask: (iw, iw) attention mask passed to the Transformer encoder.

        Returns:
            Tensor of shape (batch, ow).
        """
        src = self.encoder(src)  # (batch, iw, d_model)
        # Switch to sequence-first BEFORE the positional encoding. PositionalEncoding
        # indexes its table by dim 0, so a batch-first tensor would receive an offset
        # chosen by batch index and constant along the time axis.
        src = src.transpose(0, 1)  # (iw, batch, d_model)
        src = self.pos_encoder(src)  # (iw, batch, d_model)
        output = self.transformer_encoder(src, srcmask).transpose(0, 1)  # (batch, iw, d_model)
        output = self.linear(output)[:,:,0]  # (batch, iw)
        output = self.linear2(output)  # (batch, ow)

        return output

class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal positional table to the input, then apply dropout.

    A table of shape (max_len, 1, d_model) is computed once and registered as the
    buffer `pe`: even feature indices hold sines and odd ones cosines of
    position * exp(-log(10000) * k / d_model) for k = 0, 2, 4, ...

    `forward` adds the first `x.size(0)` rows of the table to `x`, so dim 0 of `x`
    is the axis treated as the position index.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Insert a singleton middle axis so the table broadcasts over dim 1 of the input.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        n_positions = x.size(0)
        return self.dropout(x + self.pe[:n_positions, :])

def gen_attention_mask(x):
    """Return a boolean tensor that is True wherever `x` equals 0."""
    return torch.eq(x, 0)

In [None]:
# Pick the device with a CPU fallback. Hard-coding "cuda" here would override the
# fallback chosen earlier and fail on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# TFModel(iw=48 input steps, ow=72 output steps, d_model=512, nhead=8, nlayers=4, dropout=0.1)
model = TFModel(24*2, 24*3, 512, 8, 4, 0.1).to(device)

# Mean absolute error, optimised with Adam.
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

epochs = 100

In [None]:
# Run a garbage-collection pass and release PyTorch's unused cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# NOTE(review): leftover debug cell. `result` is first assigned inside the training loop in a
# later cell, so evaluating it here raises NameError on a fresh kernel (Restart & Run All).
# Re-enable after training if the output shape needs inspecting:
# result.shape

In [None]:
model.train()
# train_hist[e] holds the mean batch loss of epoch e.
train_hist = np.zeros(epochs)
n_batches = len(train_dataloader)
for epoch in range(epochs):
    epoch_loss = 0.0
    for (inputs, outputs) in train_dataloader:
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(inputs.shape[1]).to(device)
        result = model(inputs.float().to(device),  src_mask)
        # Targets are stored as (batch, steps, 1); drop the trailing axis to match `result`.
        loss = criterion(result, outputs[:,:,0].float().to(device))
        loss.backward()
        optimizer.step()
        # Accumulate a plain Python float: adding the loss tensor itself would keep a
        # graph-attached (possibly CUDA) tensor alive and make the NumPy store below fragile.
        epoch_loss += loss.item()

    # Average over the number of batches (not the batch size) to get the mean batch loss.
    avg_cost = epoch_loss / max(n_batches, 1)
    train_hist[epoch] = avg_cost
    if epoch % 5 == 0:
        print('Epoch:', '%04d' % (epoch), 'train loss :', '{:.4f}'.format(avg_cost))

In [None]:
# Loss per epoch
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(train_hist, label="Training loss")
ax.set_title('loss')
ax.legend()
plt.show()

In [None]:
# Save the model
# The hour and minute are joined with '-' instead of ':' because ':' is not a valid
# filename character on Windows. `now` is also reused to name the submission CSV.
now = datetime.datetime.now().strftime('model_%m-%d_%H-%M')


PATH = './' + now + '.pth'
torch.save(model.state_dict(), PATH)

### Validation

In [None]:
def evaluate_model(model, val_loader, criterion):
    """Print the mean per-sample loss of `model` over `val_loader`.

    Each batch loss is weighted by its batch size and the total is divided by the
    number of samples actually iterated. This keeps the average correct when the
    loader drops its last batch. Batches are moved to the device of the model's
    parameters, so the function does not rely on a global `device`.

    Args:
        model: module exposing `generate_square_subsequent_mask(sz)` and
            `forward(src, srcmask)`.
        val_loader: iterable of `(inputs, outputs)` batches; `outputs` is indexed
            as `outputs[:, :, 0]` to obtain the targets.
        criterion: loss returning a scalar tensor (mean over the batch).

    Returns:
        None. The result is printed as 'Validation MAE: <value>'. If the loader
        yields no samples, nan is printed.

    Side effects:
        The model is switched to eval mode for the pass and then put back into
        the mode (train/eval) it was in when the function was called.
    """
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()
    total_loss = 0.0
    n_seen = 0
    try:
        with torch.no_grad():
            for (inputs, outputs) in val_loader:
                inputs = inputs.float()
                targets = outputs[:, :, 0].float()
                if eval_device is not None:
                    inputs = inputs.to(eval_device)
                    targets = targets.to(eval_device)
                src_mask = model.generate_square_subsequent_mask(inputs.shape[1]).to(inputs.device)
                y_pred = model(inputs, src_mask)
                loss = criterion(y_pred, targets)

                # criterion is a batch mean, so weight it by the batch size.
                total_loss += loss.item() * inputs.shape[0]
                n_seen += inputs.shape[0]
    finally:
        model.train(was_training)

    val_loss = total_loss / n_seen if n_seen else float('nan')

    print('Validation MAE: {:.4f}'.format(val_loss))

# Evaluate model performance: L1 loss on the validation loader.
criterion = torch.nn.L1Loss().to(device)
evaluate_model(model, val_dataloader, criterion)

### Prediction

In [None]:
# Load pre-trained model
model = TFModel(24*2, 24*3, 512, 8, 4, 0.1).to(device)
# map_location keeps the load working when the checkpoint was written on a different
# device (e.g. saved on a GPU, restored on a CPU-only machine).
load_report = model.load_state_dict(torch.load(PATH, map_location=device), strict=False)
# strict=False skips mismatched keys without raising, so make any mismatch visible.
if load_report.missing_keys or load_report.unexpected_keys:
    print('state_dict mismatch:', load_report)

model.eval()

In [None]:
# Prediction
pred = []
# The mask depends only on the window length, so build it once instead of per sample.
src_mask = model.generate_square_subsequent_mask(testX_tensor.shape[1]).to(device)
with torch.no_grad():
    for pr in range(len(testX_tensor)):
        # Add a leading batch axis of size 1 and run one window at a time.
        result = model(torch.unsqueeze(testX_tensor[pr].float(),0).to(device),  src_mask)
        # Store the prediction on the CPU so the list does not hold device memory.
        # (The previous `result.detach().cpu().numpy()` statement discarded its value.)
        pred.append(result.cpu())

In [None]:
# Turn every prediction tensor into a 72-element array, then into a plain Python list.
a = [np.array(p.cpu()).reshape(72) for p in pred]
b = [vec.tolist() for vec in a]

### Submission

In [None]:
# Concatenate the per-window lists, in order, into one flat list for the PM2.5 column.
sub['PM2.5'] = [value for window in b for value in window]

In [None]:
# Write the submission file; now[6:] drops the leading 'model_' from the timestamp string.
sub.to_csv('baseline_{}.csv'.format(now[6:]), index = False)

In [None]:
# Read the written file back and count missing values per column.
pd.read_csv('baseline_{}.csv'.format(now[6:])).isna().sum()