This file is used to do many-to-one prediction

In [1]:
import pandas as pd
import os
import time
import fnmatch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
fixlen_path = '/home/jxin05/BBMAS_Data/fixlen_data/' # data with length 2mins is in this folder, each sample is a .csv file

def train_test_split(path): #split data into 80/20 as training and testing set
    file_list = os.listdir(path)
    idx = np.random.permutation(len(file_list))
    train_idx = idx[:int(len(file_list)*0.8)]
    test_idx = idx[int(len(file_list)*0.8):]
    trainfile = []
    testfile = []
    for i in train_idx:
        trainfile.append(file_list[i])
    for i in test_idx:
        testfile.append(file_list[i])
    return trainfile,testfile

def load_original_data2(path,filelist, fix_len): 
    X = np.zeros((len(filelist), 3, fix_len))
    Y = np.zeros((len(filelist),1))    
    for i in range(len(filelist)):
        x = remove_col(pd.read_csv(path+filelist[i])).values
        X[i,:,:] = x.T
    return X,Y # X in shape of (num_sample, 3, 12000), Y is a vector if 0s

def label_fromList(Xlist):
    num_sample = len(Xlist)
    Y = np.ones((num_sample,1))
    X = np.zeros((num_sample, Xlist[0].shape[1], Xlist[0].shape[0]))
    for i in range(num_sample):
        X[i, :, :] = Xlist[i].T
    return X,Y

def remove_col(df):
    return df.drop(['EID','time','time_in_ms'], axis =1)

# filelist can be list of train files or test files
# fake_number is the number of synthetic samples to be generated
# replace_time is the number of parts to be replaced in a pure sample
def mix_data(path, filelist, fake_number, replace_time, replace_size = 200):
    pick_dict = {}
    mixed_data = []
    count = 0
    while(count < fake_number):
        pick = np.random.randint(0, len(filelist)) # index of pure data to be processed
        pick_dict[count] = []
        #print("creating fake sample from " + filelist[pick])
        target = pd.read_csv(path + filelist[pick])
        mix = np.asarray(remove_col(target).values)
        for j in range(replace_time):
            start = np.random.randint(3,len(target)/replace_size-1)
            while(start in pick_dict[count]):
                start = np.random.randint(3,len(target)/replace_size-1)
            pick_dict[count].append(start)
            replace_clip = take_from_others(path, filelist,replace_size, pick)
            mix[start*replace_size:(start+1)*replace_size, :] = replace_clip
        mixed_data.append(mix)
        #print("clips replaced: {}".format(np.sort(pick_dict[count])))
        count+=1
        #print("**********")
    return mixed_data, pick_dict 

# randomly choose a part of signal whose length is replace_size from a file rather than the pick file
def take_from_others(path, filelist, replace_size, pick):
    takefrom = np.random.randint(0,len(filelist))
    while(takefrom == pick):
        takefrom = np.random.randint(0,len(filelist))
    target_df = pd.read_csv(path+filelist[takefrom])
    #print(filelist[takefrom])
    #print(len(target_df))
    start_idx = np.random.randint(0,len(target_df)-replace_size)
    replace_clip = target_df.iloc[start_idx:start_idx+replace_size]
    replace_clip = remove_col(replace_clip)
    return np.asarray(replace_clip.values)

# apply exponential moving average on the boundary between pure signal and replaced signal 
def smooth(mixed_data, pick_dict, window = 3):
    res = mixed_data
    for i in range(len(mixed_data)):
        for v in pick_dict[i]:
            start = v*200-10
            end = v*200+10
            data_x = mixed_data[i][start:end,0]
            data_y = mixed_data[i][start:end,1]
            data_z = mixed_data[i][start:end,2]
            smoothed_x = ExpMovingAverage(data_x, window)
            smoothed_y = ExpMovingAverage(data_y, window)
            smoothed_z = ExpMovingAverage(data_z, window)
            res[i][start+window+1:end-window,0] = smoothed_x[window+1:-window]
            res[i][start+window+1:end-window,1] = smoothed_y[window+1:-window]
            res[i][start+window+1:end-window,2] = smoothed_z[window+1:-window]
    return res

def ExpMovingAverage(array, window):
    weights = np.exp(np.linspace(-1., 0., window))
    weights /= weights.sum()
    
    a = np.convolve(array, weights, mode='full')[:len(array)]
    a[:window] = a[window]
    return a

# transformate input X to fit the neural network 
def transformation(x):
    if(len(x.shape) == 3): # train with batch
        inputslist = []
        for i in range(x.shape[0]):
            inputs_x = torch.from_numpy(x[i,0,:]).float()
            inputs_x = inputs_x.view(60,-1)
            inputs_y = torch.from_numpy(x[i,1,:]).float()
            inputs_y = inputs_y.view(60,-1)
            inputs_z = torch.from_numpy(x[i,2,:]).float()
            inputs_z = inputs_z.view(60,-1)
            inputs = torch.stack([inputs_x, inputs_y, inputs_z])
            inputs = inputs.transpose(0,1)
            inputslist.append(inputs)
        res = torch.stack(inputslist)
        res = res.view(x.shape[0], 60, 1, 3, 200)
        return res
    else: # train with one sample
        inputs_x = torch.from_numpy(x[0,:]).float()
        inputs_x = inputs_x.view(60,-1)
        inputs_y = torch.from_numpy(x[1,:]).float()
        inputs_y = inputs_y.view(60,-1)
        inputs_z = torch.from_numpy(x[2,:]).float()
        inputs_z = inputs_z.view(60,-1)
        inputs = torch.stack([inputs_x, inputs_y, inputs_z])
        inputs = inputs.transpose(0,1)
        inputs = inputs.view(-1, 60, 1, 3, 200)
        return inputs

#generate mini-batches
def take_batch(batch_size, X, Y):
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    for i in range(0,X.shape[0]-batch_size+1, batch_size):
        excerpt = indices[i:i + batch_size]
        yield X[excerpt], Y[excerpt]

def train_batch_m2one(net, X,Y, batch_size = 20, epochs = 25, lr = 0.01):
    opt = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss(weight= torch.tensor([1-(125/900), 125/900]))
    for e in range(epochs):
        h = net.init_hidden(batch_size)
        train_loss = []
        net.train()
        for x,y in take_batch(batch_size, X, Y):
            targets = torch.from_numpy(y).view(batch_size, -1)
            targets = targets.squeeze()
            #print(targets.size())
            h = tuple([each.data for each in h])
            opt.zero_grad()  
            output, h = net(x, h)
            output = output[:,:,-1]
            #print(output.size())
            loss = criterion(output, targets.long())
            train_loss.append(loss.item())
            loss.backward()
            opt.step()
        print("Epoch: {}/{}...".format(e+1, epochs),
        "Train Loss: {:.4f}...".format(np.mean(train_loss)))
        
def predict(net, x):
    with torch.no_grad():
        h = net.init_hidden(1)
        output, h = net(x,h)
    if(output[0,-1,0] >= output[0, -1, 1]):
        return 0
    else:
        return 1
    
def check_acc(net, X, Y):
    correct = 0
    wrong = 0
    for i in range(X.shape[0]):
        res = predict(net,X[i])
        if(Y[i] == res):
            correct += 1
        else:
            wrong += 1
    return correct/X.shape[0]
        
def onerun2(net, path):
    print("creating synthetic data")
    trainfile,testfile = train_test_split(path)
    train_data1, train_dict1 = mix_data(path, trainfile, 225, 30) 
    train_data2, train_dict2 = mix_data(path, trainfile, 225, 20)
    train_data3, train_dict3 = mix_data(path, trainfile, 225, 15) 
    train_data4, train_dict4 = mix_data(path, trainfile, 225, 10) 
    test_data1, test_dict1 = mix_data(path, testfile, 75, 30) 
    test_data2, test_dict2 = mix_data(path, testfile, 75, 20)
    test_data3, test_dict3 = mix_data(path, testfile, 75, 15) 
    test_data4, test_dict4 = mix_data(path, testfile, 75, 10) 
    print("smoothing data")
    train_data1 = smooth(train_data1, train_dict1)
    train_data2 = smooth(train_data2, train_dict2)
    train_data3 = smooth(train_data3, train_dict3)
    train_data4 = smooth(train_data4, train_dict4)
    test_data1 = smooth(test_data1, test_dict1)
    test_data2 = smooth(test_data2, test_dict2)
    test_data3 = smooth(test_data3, test_dict3)
    test_data4 = smooth(test_data4, test_dict4)
    X1_train, Y1_train = label_fromList(train_data1)
    X2_train, Y2_train = label_fromList(train_data2)
    X3_train, Y3_train = label_fromList(train_data3)
    X4_train, Y4_train = label_fromList(train_data4)
    print("split datasets")
    original_trainX, original_trainY = load_original_data2(path,trainfile, 12000)
    X1_test, Y1_test = label_fromList(test_data1)
    X2_test, Y2_test = label_fromList(test_data2)
    X3_test, Y3_test = label_fromList(test_data3)
    X4_test, Y4_test = label_fromList(test_data4)
    original_testX, original_testY = load_original_data2(path,testfile, 12000)
    train_X = np.vstack([X1_train, X2_train, X3_train, original_trainX, X4_train])
    train_Y = np.vstack([Y1_train, Y2_train, Y3_train, original_trainY, Y4_train])
    test_X = np.vstack([X1_test, X2_test, X3_test, original_testX, X4_test])
    test_Y = np.vstack([Y1_test, Y2_test, Y3_test, original_testY, Y4_test])
    
    train_batch_m2one(net, train_X, train_Y, epochs=20)
    acc1 = check_acc(net, X1_test, Y1_test)
    acc2 = check_acc(net, X2_test, Y2_test)
    acc3 = check_acc(net, X3_test, Y3_test)
    acc4 = check_acc(net, X1_test, Y4_test)
    acc5 = check_acc(net, original_testX, original_testY)
    return acc1, acc2, acc3, acc4, acc5

In [18]:
class cnn_lstm2(nn.Module):
    
    def __init__(self,drop_prob = 0.3, n_class = 2, n_layer = 1):
        super(cnn_lstm2, self).__init__()
        self.n_class = n_class
        
        self.conv1 = nn.Conv2d(1,32,(2,8), stride = (1,4))
        self.conv2 = nn.Conv2d(32,64, (2,4), stride = (1,2))
        self.conv3 = nn.Conv1d(64,32, 4, stride = 2)
        self.lstm = nn.LSTM(320, 128, num_layers = 1, batch_first = True)
        self.fc1 = nn.Linear(128, 32)
        self.fc2 = nn.Linear(32,n_class)
        self.dropout = nn.Dropout(drop_prob)
        self.outlayer = nn.Softmax(dim = 2)
        
    def forward(self, x, hidden):
        inputs = transformation(x)
        batch_size, timesteps, C, H, W = inputs.size()
        inputs = inputs.view(batch_size*timesteps,C,H,W)
        inputs = self.conv1(inputs)
        inputs = F.relu(inputs)
        inputs = self.conv2(inputs)
        inputs = F.relu(inputs)
        inputs = inputs.squeeze()
        inputs = self.conv3(inputs)
        inputs = F.relu(inputs)
        
        inputs = inputs.view(batch_size, timesteps, -1)
        inputs, hidden = self.lstm(inputs,hidden)
        inputs = self.dropout(inputs)
        inputs = self.fc1(inputs)
        inputs = self.dropout(inputs)
        inputs = self.fc2(inputs)
        out = self.outlayer(inputs)
        
        if(batch_size != 1):
            #print("doing transpose")
            out = out.transpose(1,2)
        return out, hidden
    
    def init_hidden(self,batch_size):
        weight = next(self.parameters()).data
        
        hidden = (weight.new(1, batch_size, 128).zero_(),
                      weight.new(1, batch_size, 128).zero_())
        return hidden

def init_weights1(m):
    if type(m) == nn.LSTM:
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)
    elif type(m) == nn.Conv2d or type(m) == nn.Conv1d or type(m) == nn.Linear:
        torch.nn.init.orthogonal_(m.weight)
        m.bias.data.fill_(0)

In [37]:
net2 = cnn_lstm2()
net2.apply(init_weights1)
acc1, acc2, acc3, acc4, acc5 = onerun2(net2,fixlen_path)


creating synthetic data
smoothing data
split datasets
Epoch: 1/20... Train Loss: 0.6860...
Epoch: 2/20... Train Loss: 0.6371...
Epoch: 3/20... Train Loss: 0.6043...
Epoch: 4/20... Train Loss: 0.5963...
Epoch: 5/20... Train Loss: 0.5896...
Epoch: 6/20... Train Loss: 0.5567...
Epoch: 7/20... Train Loss: 0.5809...
Epoch: 8/20... Train Loss: 0.5002...
Epoch: 9/20... Train Loss: 0.4719...
Epoch: 10/20... Train Loss: 0.5585...
Epoch: 11/20... Train Loss: 0.4750...
Epoch: 12/20... Train Loss: 0.4683...
Epoch: 13/20... Train Loss: 0.5336...
Epoch: 14/20... Train Loss: 0.5045...
Epoch: 15/20... Train Loss: 0.6540...
Epoch: 16/20... Train Loss: 0.6358...
Epoch: 17/20... Train Loss: 0.6188...
Epoch: 18/20... Train Loss: 0.6726...
Epoch: 19/20... Train Loss: 0.7022...
Epoch: 20/20... Train Loss: 0.6153...


In [38]:
print(acc1)
print(acc2)
print(acc3)
print(acc4)
print(acc5)

0.92
0.84
0.72
0.96
1.0
