<a href="https://colab.research.google.com/github/JockWang/colab/blob/master/SeaRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import torch
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
import logging

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m/%d/%Y %H:%M:%S"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)

In [0]:
class MyDataSet(Dataset):
  def __init__(self, path, max_length=256, mode='train', valid_size=0.2):
    super(MyDataSet, self).__init__()
    self.data = []
    self.ship = []
    self.label = []
    self.mode = mode
    self.feature = pd.read_pickle(path=path+'all_feature.pkl')
    def genData(x):
      ship = list(x['ship'])[0]
      if 'type' in list(x.columns):
        label = list(x['type'])[0]
        self.label.append(label)
        x = x.drop(columns=['type'])
      x = x.drop(columns=['ship'])
      for col in list(x.columns):
        if col not in ['ship','type']:
          x[col] = (x[col]-x[col].min())/(x[col].max()-x[col].min())
      x = x.values
      if x.shape[0] < max_length:
        x = np.vstack((np.zeros((max_length-x.shape[0],x.shape[1])),x))
      else:
        x = x[-max_length:,:]
      self.data.append(x)
      self.ship.append(ship)
    if mode == 'train' or mode == 'valid' or mode == 'train_all':
      random.seed(2020)
      train_data = pd.read_pickle(path+'train.pkl')
      train_data.groupby('ship').apply(genData)
      if mode != 'train_all':
        temp_data = []
        temp_ship = []
        temp_label = []
        for i in range(len(self.ship)):
          if mode == 'train':
            if random.random() >= valid_size:
              temp_data.append(self.data[i])
              temp_ship.append(self.ship[i])
              temp_label.append(self.label[i])
          else:
            if random.random() < valid_size:
              temp_data.append(self.data[i])
              temp_ship.append(self.ship[i])
              temp_label.append(self.label[i])
        self.data = temp_data
        self.ship = temp_ship
        self.label = temp_label
    else:
      test_data = pd.read_pickle(path+'test.pkl')
      test_data.groupby('ship').apply(genData)
    self.data = np.array(self.data)
    self.data[np.isnan(self.data)] = 0
    self.ship = np.array(self.ship)
    self.label = np.array(self.label)
    self.feature[np.isnan(self.feature)] = 0
    logging.info('Generating dataset:'+mode+' Seq:'+str(self.data.shape)+' Ship:'+str(self.ship.shape)+' Feature:'+str(self.feature.shape))

  def __getitem__(self, index):
    if self.mode == 'train' or self.mode == 'valid' or self.mode == 'trian_all':
      return torch.tensor(self.data[index], dtype=torch.float), torch.tensor(self.ship[index], dtype=torch.long), torch.tensor(self.label[index], dtype=torch.long), torch.tensor(self.feature[self.ship[index],:], dtype=torch.float)
    else:
      return torch.tensor(self.data[index], dtype=torch.float), torch.tensor(self.ship[index], dtype=torch.long), torch.tensor(self.feature[self.ship[index],:], dtype=torch.float)
  
  def __len__(self):
    return len(self.ship)

In [0]:
path = '/content/drive/My Drive/Colab Notebooks/Sea/'
max_length = 512
train_dataset = MyDataSet(path=path, max_length=max_length, mode='train')
valid_dataset = MyDataSet(path=path, max_length=max_length, mode='valid')
test_dataset = MyDataSet(path=path, max_length=max_length, mode='test')

01/30/2020 15:01:34 - INFO - Generating dataset:train Seq:(5581, 512, 11) Ship:(5581,) Feature:(9000, 83)
01/30/2020 15:02:29 - INFO - Generating dataset:valid Seq:(1419, 512, 11) Ship:(1419,) Feature:(9000, 83)
01/30/2020 15:02:44 - INFO - Generating dataset:test Seq:(2000, 512, 11) Ship:(2000,) Feature:(9000, 83)


In [0]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [0]:
def valid():
  valid_y = []
  valid_pred = []
  for step, [data, ship, label, feature] in enumerate(valid_loader):
    data, ship, label, feature = data.to(device), ship.to(device), label.to(device), feature.to(device)
    valid_y += label.tolist()
    out, embed = model(ship, data.float(), feature)
    valid_pred += out.argmax(dim=1).tolist()
  logging.info('Valid f1_score: %.5f' % metrics.f1_score(valid_y, valid_pred, average='macro'))

In [0]:
class Model(nn.Module):
  def __init__(self, ship_size, ship_dim, feat_dim, rnn_hidden, hidden, ship_feature=0):
    super(Model, self).__init__()
    self.ship = nn.Embedding(ship_size, ship_dim)
    self.ship_feature = ship_feature
    self.rnn = nn.GRU(input_size=feat_dim, hidden_size=rnn_hidden, batch_first=True)
    hidden = [ship_dim+rnn_hidden+ship_feature] + hidden
    self.linears = nn.ModuleList([nn.Linear(hidden[i], hidden[i+1]) for i in range(len(hidden)-1)])
    
  def forward(self, ship, data, feature):
    ship = self.ship(ship)
    data = self.rnn(data) # (output, (h_0, c_0))
    out = torch.squeeze(data[1])
    if self.ship_feature == 0:
      out = torch.cat((ship, data[1][0]), dim=1)
    else:
      out = torch.cat((ship, data[1][0], feature), dim=1)
    for linear in self.linears:
      out = F.relu(linear(out))
    return F.log_softmax(out), torch.squeeze(data[1])

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
ship_feature = 83
ship_size = 9000
ship_dim = 32
feat_dim = 11
rnn_hidden = 8
hidden = [256, 30, 3]
model = Model(ship_size, ship_dim, feat_dim, rnn_hidden, hidden, ship_feature).to(device)
logging.info(model)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# # device = torch.device('cpu')
# ship_size = 9000
# ship_dim = 32
# feat_dim = 11
# rnn_hidden = 8
# hidden = [8, 3]
# model = Model(ship_size, ship_dim, feat_dim, rnn_hidden, hidden).to(device)
# logging.info(model)

01/30/2020 15:44:41 - INFO - Model(
  (ship): Embedding(9000, 32)
  (rnn): GRU(11, 8, batch_first=True)
  (linears): ModuleList(
    (0): Linear(in_features=123, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=30, bias=True)
    (2): Linear(in_features=30, out_features=3, bias=True)
  )
)


In [0]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
criterion = nn.NLLLoss()

In [0]:
epochs = 30
for epoch in tqdm(range(epochs)):
  running_loss = 0
  for step, [data, ship, label, feature] in enumerate(train_loader):
    data, ship, label, feature = data.to(device), ship.to(device), label.to(device), feature.to(device)
    optimizer.zero_grad()
    # model(ship, data.float(), feature)
    out, embed = model(ship, data.float(), feature)
    loss = criterion(out, label)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    if step % 10 == 9:
      logging.info('Epoch:%d Step:%d loss:%.5f' % (epoch + 1, step+1, running_loss / 10))
      running_loss = 0
  valid()    






01/30/2020 15:45:43 - INFO - Epoch:1 Step:10 loss:0.38444
01/30/2020 15:45:44 - INFO - Epoch:1 Step:20 loss:0.36509
01/30/2020 15:45:44 - INFO - Epoch:1 Step:30 loss:0.36532
01/30/2020 15:45:44 - INFO - Epoch:1 Step:40 loss:0.39454
01/30/2020 15:45:44 - INFO - Epoch:1 Step:50 loss:0.31848
01/30/2020 15:45:44 - INFO - Epoch:1 Step:60 loss:0.37497
01/30/2020 15:45:45 - INFO - Epoch:1 Step:70 loss:0.37861
01/30/2020 15:45:45 - INFO - Epoch:1 Step:80 loss:0.28424
01/30/2020 15:45:45 - INFO - Epoch:1 Step:90 loss:0.37239
01/30/2020 15:45:45 - INFO - Epoch:1 Step:100 loss:0.32258
01/30/2020 15:45:45 - INFO - Epoch:1 Step:110 loss:0.35070
01/30/2020 15:45:46 - INFO - Epoch:1 Step:120 loss:0.41746
01/30/2020 15:45:46 - INFO - Epoch:1 Step:130 loss:0.37929
01/30/2020 15:45:46 - INFO - Epoch:1 Step:140 loss:0.35202
01/30/2020 15:45:46 - INFO - Epoch:1 Step:150 loss:0.28484
01/30/2020 15:45:47 - INFO - Epoch:1 Step:160 loss:0.33952
01/30/2020 15:45:47 - INFO - Epoch:1 Step:170 loss:0.37674
0

In [0]:
train_all_dataset = MyDataSet(path=path, max_length=max_length, mode='train_all')
train_all_loader = DataLoader(train_all_dataset, batch_size=batch_size, shuffle=False)

01/30/2020 15:32:06 - INFO - Generating dataset:train_all Seq:(7000, 512, 11) Ship:(7000,) Feature:(9000, 83)


In [0]:
embedding = []
for loader in [train_all_loader, test_loader]:
  for step, dataset in enumerate(loader):
    data, ship, feature = dataset[0].to(device), dataset[1].to(device), dataset[-1].to(device)
    out, embed = model(ship, data.float(), feature)
    embedding += embed.tolist()
for i in range(len(embedding)):
  embedding[i] = [i] + embedding[i]
train_embed = embedding[:7000]
test_embed = embedding[7000:]
train_embed = pd.DataFrame(train_embed,columns=['ship','embed_0','embed_1','embed_2','embed_3','embed_4','embed_5','ebmed_6','embed_7'])
test_embed = pd.DataFrame(test_embed,columns=['ship','embed_0','embed_1','embed_2','embed_3','embed_4','embed_5','ebmed_6','embed_7'])
logging.info('train_embed:'+str(train_embed.shape)+' test_embed:'+str(test_embed.shape))
pd.to_pickle(train_embed,path+'train_embed.pkl')
pd.to_pickle(test_embed,path+'test_embed.pkl')

01/30/2020 15:42:31 - INFO - train_embed:(7000, 9) test_embed:(2000, 9)
