In [1]:
import pandas as pd
import shapely
import numpy as np
import sys
import torch
import argparse
import random
import math
import os
import warnings
from gensim.models import word2vec
from shapely.geometry import LineString, Polygon
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler# 好处在于可以保存训练集中的参数（均值、方差）
from scipy.stats import stats
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler
import tqdm
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
pd.options.display.max_columns = 999
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
import gc
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
"""
bild the model, loss and data class, including two different versions
version 1:
seq to seq model
version old:
the old seq to seq model without any paramseters
versioin 2:
seq to 1 model
"""
class TS_rnn(torch.nn.Module):
    """
    Sequence-to-sequence scorer: an LSTM followed by a small MLP head that
    emits one score for every time step.

    input:
        tensor size of (batch_size, seq_len, num_inp)
    output:
        tensor size of (batch_size, seq_len)

    args:
        num_hidden: hidden size of the LSTM (and input width of the MLP head)
        num_layers: number of stacked LSTM layers
        dropout:    dropout value handed to torch.nn.LSTM; the MLP head uses
                    torch.nn.Dropout() with its default rate regardless of it
        num_inp:    number of input features per time step (default 13, the
                    value that used to be hard-coded)
    """
    def __init__(self, num_hidden = 64, num_layers = 2, dropout = 0.5, num_inp = 13):
        super(TS_rnn, self).__init__()
        # attribute names (rnn, mlp) and layer layout are unchanged so that
        # previously saved models keep matching this class
        self.rnn = torch.nn.LSTM(input_size = num_inp, hidden_size = num_hidden, num_layers = num_layers, dropout = dropout)
        self.mlp = torch.nn.Sequential(
                torch.nn.Linear(num_hidden, 16),
                torch.nn.Dropout(),
                torch.nn.ReLU(),
                torch.nn.Linear(16, 1)
                )

    def forward(self, inp):
        # the LSTM is built without batch_first, so it expects (seq_len, batch, input_size)
        data_in = torch.transpose(inp, 0, 1)
        # the LSTM returns (output, state); only the per-step output is needed
        out_rnn, _ = self.rnn(data_in)
        out_rnn = torch.transpose(out_rnn, 0, 1) # (batch_size, seq_len, num_hidden)
        # Linear / Dropout / ReLU all act on the last dimension, so the head can
        # score every time step in a single call instead of a Python loop
        out = self.mlp(out_rnn) # (batch_size, seq_len, 1)
        # drop only the trailing singleton axis; a bare squeeze() would also
        # remove the batch axis when batch_size == 1
        return out.squeeze(-1)

class TS_rnn_old(torch.nn.Module):
    """
    Legacy sequence scorer with a fixed architecture (13 input features,
    64 hidden units, two LSTM layers) and no constructor arguments.
    input:
        tensor size of (batch_size, seq_len, num_dim)
    output:
        tensor size of (batch_size, seq_len), one score per time step
    """
    def __init__(self):
        super().__init__()
        n_features, n_hidden = 13, 64
        self.rnn = torch.nn.LSTM(input_size = n_features, hidden_size = n_hidden, num_layers = 2)
        head_layers = [
            torch.nn.Linear(n_hidden, 16),
            torch.nn.Dropout(),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 1),
        ]
        self.mlp = torch.nn.Sequential(*head_layers)

    def forward(self, inp):
        # the LSTM consumes time-major data: (seq_len, batch, input_size)
        hidden_seq, _ = self.rnn(inp.transpose(0, 1))
        # back to batch-major: (batch_size, seq_len, num_dim)
        hidden_seq = hidden_seq.transpose(0, 1)
        _, n_steps, _ = hidden_seq.shape
        # score each time step on its own -> a list of (batch_size, 1) columns
        per_step = [self.mlp(hidden_seq[:, step, :]) for step in range(n_steps)]
        # concatenate the columns into (batch_size, seq_len); no squeeze here,
        # otherwise a batch of one would lose its batch axis
        return torch.cat(per_step, 1)

class TS_rnn2(torch.nn.Module):
    """
    Sequence-to-one scorer: an LSTM whose last time step is fed to an MLP head
    to produce a single score for the whole sequence.
    input:
        tensor size of (batch_size, seq_len, num_dim) with num_dim == 8
    output:
        tensor size of (batch_size), also when batch_size == 1
    """
    def __init__(self):
        super(TS_rnn2, self).__init__()
        #change the structure of the network
        num_inp = 8
        num_hidden = 64
        self.rnn = torch.nn.LSTM(input_size = num_inp, hidden_size = num_hidden, num_layers = 2)
        self.mlp = torch.nn.Sequential(
                torch.nn.Linear(num_hidden, 64),
                torch.nn.Dropout(),
                torch.nn.ReLU(),
                torch.nn.Linear(64, 1)
                )

    def forward(self, inp):
        # the LSTM is built without batch_first, so it expects (seq_len, batch, input_size)
        data_in = torch.transpose(inp, 0, 1)
        # the LSTM returns (output, state); output is (seq_len, batch_size, num_hidden)
        out_rnn, _ = self.rnn(data_in)
        # only use the last time step: (batch_size, num_hidden).
        # No squeeze() here: it used to drop the batch axis for batch_size == 1.
        last_step = out_rnn[-1]
        out = self.mlp(last_step) # (batch_size, 1)
        # remove only the trailing singleton so the result is always (batch_size,)
        return out.squeeze(-1)
    
class PDLoss(torch.nn.Module):
    """
    Loss built on torch.nn.PairwiseDistance: the distance between each
    output row and its target row, averaged into one scalar.
    """
    def __init__(self, p = 2):
        super().__init__()
        # p is forwarded unchanged to PairwiseDistance
        self.pd = torch.nn.PairwiseDistance(p)

    def forward(self, o, t):
        # one distance per (output, target) pair, reduced to a scalar mean
        return torch.mean(self.pd(o, t))

class Data:
    """
    data class for TS_rnn

    Loads a feature array and a target array from two .npy files and serves
    (features, targets) pairs by index, so it can be wrapped in a DataLoader.

    args:
        x: path of the .npy file holding the features
        y: path of the .npy file holding the targets; it is indexed as a
           3-d array and only the last entry of the final axis is kept
    """
    def __init__(self, x, y):
        self.data = {}
        self.data['train_x'] = self.add_file(x).float()
        # keep the LAST metric (index -1 of the final axis) as the target
        self.data['train_y'] = self.add_file(y)[:, :, -1].float()
        assert len(self.data['train_x']) == len(self.data['train_y']), \
            'features and targets must contain the same number of samples'
        self.len = len(self.data['train_x'])

    def add_file(self, path):
        """Load a .npy file and return it as a tensor (dtype follows the file)."""
        return torch.from_numpy(np.load(path))

    def add_scores(self, path):
        """
        Read one float per line from a text file into a FloatTensor.
        Blank lines are skipped; the file is closed before returning.
        """
        with open(path) as handle:
            return torch.FloatTensor([float(li) for li in handle if li.strip()])

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return (self.data['train_x'][index],
                self.data['train_y'][index])

class Data2:
    """
    data class for TS_rnn2

    Loads a feature array and a target array from two .npy files and serves
    (features, target) pairs by index. The target of a sample is the mean,
    over axis 1, of the last entry of the final axis of the target array.

    args:
        x: path of the .npy file holding the features
        y: path of the .npy file holding the targets (indexed as a 3-d array)
    """
    def __init__(self, x, y):
        self.data = {}
        # cast to float32 like Data does: arrays saved as float64 or integers
        # would otherwise not match float32 model weights, and torch.mean
        # rejects integer tensors
        self.data['train_x'] = self.add_file(x).float()
        # keep the LAST metric (index -1 of the final axis) ...
        self.data['train_y'] = self.add_file(y)[:, :, -1].float()
        # ... and average it over axis 1 to get one value per sample
        self.data['train_y'] = torch.mean(self.data['train_y'], 1)
        assert len(self.data['train_x']) == len(self.data['train_y']), \
            'features and targets must contain the same number of samples'
        self.len = len(self.data['train_x'])

    def add_file(self, path):
        """Load a .npy file and return it as a tensor (dtype follows the file)."""
        return torch.from_numpy(np.load(path))

    def add_scores(self, path):
        """
        Read one float per line from a text file into a FloatTensor.
        Blank lines are skipped; the file is closed before returning.
        """
        with open(path) as handle:
            return torch.FloatTensor([float(li) for li in handle if li.strip()])

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return (self.data['train_x'][index],
                self.data['train_y'][index])

'\nbild the model, loss and data class, including two different versions\nversion 1:\nseq to seq model\nversion old:\nthe old seq to seq model without any paramseters\nversioin 2:\nseq to 1 model\n'

In [3]:
# write the test function
def test_model(dl_test, model, loss):
    model.eval()
    test_loss = 0
    counter = 0
    for batch_idx, dat in enumerate(dl_test):
        counter += 1
        # codes to be changed
        inp, target = dat
        out = model(inp)
        lo = loss(out, target)
        test_loss += lo.data
    return test_loss/counter

def significant_test(dl_test, model, loss, top_k = 5):
    """
    Top-k hit rate of the model's ranking, computed batch by batch.

    For every batch holding more than `top_k` samples, targets and predictions
    are averaged over dim 1. The sample with the lowest mean target is the
    "true best"; the batch counts as a hit when that sample is among the
    `top_k` samples with the lowest mean prediction.

    args:
        dl_test: iterable of (input, target) batches, e.g. a DataLoader
        model:   module to evaluate; it is switched to eval mode
        loss:    unused, kept so the signature matches test_model
        top_k:   size of the predicted short-list (default 5); batches with
                 `top_k` samples or fewer are skipped
    returns:
        hits / evaluated batches as a float, or nan when no batch was large
        enough to be evaluated (this used to raise ZeroDivisionError)
    """
    model.eval()
    hit = 0
    miss = 0
    # pure inference: no autograd graph is needed
    with torch.no_grad():
        for inp, target in dl_test:
            # with top_k samples or fewer every sample would be short-listed
            if len(inp) <= top_k:
                continue
            target_score = target.mean(dim = 1)
            out_score = model(inp).mean(dim = 1)
            # lower is better, hence largest=False
            best_true = torch.topk(target_score, 1, largest = False)[1].item()
            short_list = torch.topk(out_score, top_k, largest = False)[1].tolist()
            if best_true in short_list:
                hit += 1
            else:
                miss += 1
    evaluated = hit + miss
    if evaluated == 0:
        return float('nan')
    return hit * 1.0/evaluated

def metric2(dl_test, model, loss):
    """
    Histogram of the position the truly best sample gets in the predicted ranking.

    For each batch with more than 5 samples, targets and predictions are
    averaged over dim 1; the sample with the lowest mean target is looked up
    in the ascending ordering of the mean predictions. The returned dict maps
    that 0-based position to the number of batches in which it occurred.
    `loss` is not used.
    """
    model.eval()
    rank_histogram = {}
    for inp, target in dl_test:
        prediction = model(inp)
        true_score = target[:, :].mean(dim = 1)
        predicted_score = prediction[:, :].mean(dim = 1)
        # small batches are ignored
        if len(inp) <= 5:
            continue
        # index of the sample with the smallest mean target
        best_true = torch.topk(true_score, 1, largest = False)[1]
        # all sample indices, ordered from lowest to highest mean prediction
        predicted_order = torch.topk(predicted_score, len(true_score), largest = False)[1].tolist()
        position = predicted_order.index(best_true)
        rank_histogram[position] = rank_histogram.get(position, 0) + 1
    return rank_histogram

def metric3(dl_test, model, loss):
    """
    Histogram of the true rank of the sample the model likes best.

    For each batch with more than 5 samples, targets and predictions are
    averaged over dim 1; the sample with the lowest mean prediction is looked
    up in the ascending ordering of the mean targets. The returned dict maps
    that 0-based position to the number of batches in which it occurred.
    `loss` is not used.
    """
    model.eval()
    hits_by_rank = {}
    for batch in dl_test:
        inp, target = batch
        out = model(inp)
        mean_target = target[:, :].mean(dim = 1)
        mean_out = out[:, :].mean(dim = 1)
        if len(inp) > 5:
            # sample the model ranks first (lowest mean prediction)
            best_predicted = torch.topk(mean_out, 1, largest = False)[1]
            # all sample indices, ordered from lowest to highest mean target
            true_order = torch.topk(mean_target, len(mean_target), largest = False)[1]
            position = true_order.tolist().index(best_predicted)
            hits_by_rank.setdefault(position, 0)
            hits_by_rank[position] += 1
    return hits_by_rank

In [26]:
def evaluate(pm, testdata, seq_len = 50, num_features = 13, batch_size = 100, n_repeats = 5):
    """
    Evaluate a saved model on a pickled test DataFrame.

    The first seq_len * num_features columns of the frame are reshaped to
    (n_samples, seq_len, -1) as features, the remaining columns likewise as
    targets. Both arrays are written to ../data/rnn_test_x.npy and
    ../data/rnn_test_y.npy (overwriting earlier files) because Data loads
    from .npy paths.

    args:
        pm:           path of the model file handed to torch.load
        testdata:     path of the pickled DataFrame
        seq_len:      time steps per sample (default 50)
        num_features: feature columns per time step (default 13, i.e. the
                      650 feature columns that used to be hard-coded)
        batch_size:   DataLoader batch size (default 100)
        n_repeats:    how many times significant_test is run (default 5)
    returns:
        (me, out): `me` is the SUM (not the mean) of the n_repeats
        significant_test results; `out` is the metric3 histogram as a list of
        (rank, count) pairs sorted by rank.

    The DataLoader shuffles and no seed is set here, so the numbers vary
    between runs.
    """
    # read extra test set
    frame = pd.read_pickle(testdata)
    n_x_cols = seq_len * num_features
    test_x = frame.iloc[:, :n_x_cols].values.reshape(len(frame), seq_len, -1)
    test_y = frame.iloc[:, n_x_cols:].values.reshape(len(frame), seq_len, -1)
    # np.save appends the .npy suffix to these paths
    np.save('../data/rnn_test_x', test_x)
    np.save('../data/rnn_test_y', test_y)
    loss = torch.nn.L1Loss()
    test = Data('../data/rnn_test_x.npy', '../data/rnn_test_y.npy')
    dl_test = DataLoader(test, batch_size = batch_size, shuffle = True)
    # NOTE(security): torch.load unpickles the stored object, which can run
    # arbitrary code; only load model files from a trusted source.
    # NOTE(review): newer PyTorch releases reportedly restrict torch.load to
    # plain weights by default; confirm full-model files still load.
    model = torch.load(pm)
    me = 0
    for i in range(n_repeats):
        lo = significant_test(dl_test, model, loss)
        me += lo
        print('test ' + str(i)+': ' + str(lo))
    hit_count = metric3(dl_test, model, loss)
    out = sorted(hit_count.items(), key = lambda x: x[0])
    return (me, out)

In [38]:
# One-off run of the "mean-44" configuration; these are the same two paths
# that the td dictionary stores under the key "mean-44":
# "mean-44": ('../models/TS_rnn_mean/rnn_15.pkl', '../results/sample_test_6_mean.pkl'),
tmp = evaluate('../models/TS_rnn_mean/rnn_15.pkl', '../results/sample_test_6_mean.pkl')

test 0: 0.8941176470588236
test 1: 0.8352941176470589
test 2: 0.8941176470588236
test 3: 0.8470588235294118
test 4: 0.8823529411764706


In [42]:
# Show the stored result for 'mean-44'.
# NOTE: rs is only created by the "separate metrics" loop further down in this
# file, so that cell has to be executed before this one.
rs['mean-44']

(4.352941176470589,
 [(0, 37),
  (1, 16),
  (2, 9),
  (3, 7),
  (4, 2),
  (5, 1),
  (6, 3),
  (7, 2),
  (8, 4),
  (9, 1),
  (10, 1),
  (13, 1),
  (22, 1)])

In [31]:
# Configurations for the "combined metrics" run (iterated when rs2 is filled):
# label -> (model file passed to torch.load, pickled test DataFrame).
# NOTE(review): 'v3-44' and 'v3-54' reference the same test file
# (Dataframe_feature_test_7.pkl) while every other label has its own file;
# this looks like a copy-paste slip. Confirm which file 'v3-54' should use.
td2 = {
    'v2-44': ('../models/TS_rnn_v2/rnn_30.pkl', '../results/Dataframe_feature_test_6.pkl'),
    'v2-54': ('../models/TS_rnn_v2/rnn_30.pkl', '../results/Dataframe_feature_test_4.pkl'),
    'v2-10': ('../models/TS_rnn_v2/rnn_30.pkl', '../results/Dataframe_feature_test_10.pkl'),
    'v2-20': ('../models/TS_rnn_v2/rnn_30.pkl', '../results/Dataframe_feature_test_11.pkl'),
    'v2-30': ('../models/TS_rnn_v2/rnn_30.pkl', '../results/Dataframe_feature_test_14.pkl'),
    'v2-40': ('../models/TS_rnn_v2/rnn_30.pkl', '../results/Dataframe_feature_test_16.pkl'),
    "v3-44": ('../models/TS_rnn_v3/rnn_32.pkl', '../results/Dataframe_feature_test_7.pkl'),
    'v3-54': ('../models/TS_rnn_v3/rnn_32.pkl', '../results/Dataframe_feature_test_7.pkl'),
    'v3-10': ('../models/TS_rnn_v3/rnn_32.pkl', '../results/Dataframe_feature_test_9.pkl'),
    'v3-20': ('../models/TS_rnn_v3/rnn_32.pkl', '../results/Dataframe_feature_test_12.pkl'),
    'v3-30': ('../models/TS_rnn_v3/rnn_32.pkl', '../results/Dataframe_feature_test_13.pkl'),
    'v3-40': ('../models/TS_rnn_v3/rnn_32.pkl', '../results/Dataframe_feature_test_15.pkl'),
}

In [32]:
# Configurations for the "separate metrics" run:
# label -> (model file passed to torch.load, pickled test DataFrame).
# The three families (area / mean / norm) use the same six test-set numbers;
# only the model directory and the file-name suffix differ.
td = {
    '{}-{}'.format(family, tag): (
        '../models/TS_rnn_{}/rnn_15.pkl'.format(family),
        '../results/sample_test_{}_{}.pkl'.format(test_id, suffix),
    )
    for family, suffix in (('area', 'area_quote'), ('mean', 'mean'), ('norm', 'norm_max_x'))
    for tag, test_id in (('44', 6), ('54', 4), ('10', 10), ('20', 11), ('30', 14), ('40', 16))
}

In [33]:
# separate metrics: evaluate every configuration in td and keep the results by label
rs = {}
for name, value in td.items():
    # three-line banner so each configuration is easy to find in the output
    print('###########', '#' + name + '#', '###########', sep = '\n')
    # value is the (model path, test data path) pair
    rs[name] = evaluate(*value)

###########
#area-44#
###########
test 0: 0.9764705882352941
test 1: 0.9647058823529412
test 2: 0.9176470588235294
test 3: 0.9764705882352941
test 4: 0.9529411764705882
###########
#area-54#
###########
test 0: 0.9342105263157895
test 1: 0.9407894736842105
test 2: 0.9342105263157895
test 3: 0.9342105263157895
test 4: 0.9144736842105263
###########
#area-10#
###########
test 0: 0.6120218579234973
test 1: 0.5901639344262295
test 2: 0.6284153005464481
test 3: 0.6120218579234973
test 4: 0.6229508196721312
###########
#area-20#
###########
test 0: 0.9202898550724637
test 1: 0.8478260869565217
test 2: 0.9057971014492754
test 3: 0.9130434782608695
test 4: 0.9057971014492754
###########
#area-30#
###########
test 0: 0.9251700680272109
test 1: 0.9183673469387755
test 2: 0.8775510204081632
test 3: 0.9115646258503401
test 4: 0.9115646258503401
###########
#area-40#
###########
test 0: 0.9133333333333333
test 1: 0.9
test 2: 0.9266666666666666
test 3: 0.9
test 4: 0.96
###########
#mean-44#
########

In [34]:
# combined metrics: evaluate every configuration in td2 and keep the results by label
rs2 = {}
for name, value in td2.items():
    # three-line banner so each configuration is easy to find in the output
    print('###########', '#' + name + '#', '###########', sep = '\n')
    # value is the (model path, test data path) pair
    rs2[name] = evaluate(*value)

###########
#v2-44#
###########
test 0: 0.8375
test 1: 0.9
test 2: 0.925
test 3: 0.8875
test 4: 0.8875
###########
#v2-54#
###########
test 0: 0.6907894736842105
test 1: 0.6973684210526315
test 2: 0.7236842105263158
test 3: 0.6776315789473685
test 4: 0.6907894736842105
###########
#v2-10#
###########
test 0: 0.1366120218579235
test 1: 0.12568306010928962
test 2: 0.1912568306010929
test 3: 0.16393442622950818
test 4: 0.14754098360655737
###########
#v2-20#
###########
test 0: 0.4855072463768116
test 1: 0.4927536231884058
test 2: 0.47101449275362317
test 3: 0.5
test 4: 0.43478260869565216
###########
#v2-30#
###########
test 0: 0.5782312925170068
test 1: 0.5782312925170068
test 2: 0.5578231292517006
test 3: 0.5170068027210885
test 4: 0.5918367346938775
###########
#v2-40#
###########
test 0: 0.58
test 1: 0.6733333333333333
test 2: 0.6333333333333333
test 3: 0.6066666666666667
test 4: 0.5866666666666667
###########
#v3-44#
###########
test 0: 0.7662337662337663
test 1: 0.8181818181818182


In [35]:
import pickle

In [43]:
# Persist both result dictionaries as pickle files in the current working
# directory: rs holds the "separate metrics" results, rs2 the "combined
# metrics" results. Existing files with these names are overwritten.
with open('result_separate_metric_new.pickle', 'wb') as handle:
    pickle.dump(rs, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('result_combined_metric_new.pickle', 'wb') as handle:
    pickle.dump(rs2, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
# dump every "separate metrics" result: separator line, label, then the
# (summed hit rate, rank histogram) tuple returned by evaluate
for n, v in rs.items():
    print('--------------', n, v, sep = '\n')

--------------
area-44
(4.788235294117647, [(0, 49), (1, 20), (2, 7), (3, 3), (4, 3), (7, 1), (8, 1), (11, 1)])
--------------
area-54
(4.657894736842105, [(0, 86), (1, 26), (2, 21), (3, 5), (4, 2), (5, 1), (6, 6), (7, 3), (8, 1), (30, 1)])
--------------
area-10
(3.0655737704918034, [(0, 56), (1, 30), (2, 8), (3, 14), (4, 9), (5, 8), (6, 4), (7, 5), (8, 4), (9, 3), (10, 1), (11, 1), (12, 3), (13, 2), (14, 1), (15, 5), (16, 2), (17, 1), (19, 1), (21, 2), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (30, 1), (33, 1), (37, 3), (38, 1), (39, 2), (43, 1), (44, 1), (47, 2), (48, 1), (54, 1), (71, 1), (89, 1), (98, 1)])
--------------
area-20
(4.492753623188406, [(0, 71), (1, 24), (2, 15), (3, 4), (4, 7), (5, 5), (6, 3), (8, 1), (9, 1), (10, 2), (11, 2), (15, 1), (17, 1), (48, 1)])
--------------
area-30
(4.54421768707483, [(0, 85), (1, 23), (2, 12), (3, 6), (4, 4), (5, 4), (6, 4), (7, 2), (8, 2), (11, 1), (13, 1), (16, 1), (30, 1), (50, 1)])
--------------
area-40
(4.6, [(0, 82), (1, 29), (