In [1]:
import os
import math
import torch
import random
import pickle
import pandas
import calendar
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen when several libraries bundle their own libiomp — confirm it is
# still needed in this environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Make float32 the default floating-point dtype for newly created tensors.
# torch.set_default_tensor_type is deprecated since PyTorch 2.1;
# set_default_dtype is the documented replacement for the dtype part.
torch.set_default_dtype(torch.float32)
import config

# Read and process data

In [2]:
# Load the raw input table. header=None makes pandas treat the first line of
# the file as data and label the columns 0, 1, 2, ...
# Only the first six columns are used when the input tensor X is built below.
x = pandas.read_csv("./data/act_RT_ratio.csv", header=None)
x

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,5.747525e+05,12544.854490,0.000098,0.000076,0.000494,0.000035,0.021827,0.770558,0.070874
1,1.855941e+06,7804.580458,0.000080,0.000069,0.000492,0.000035,0.004205,0.864428,0.070792
2,1.055941e+06,13348.334830,0.000094,0.000042,0.000491,0.000040,0.012641,0.445271,0.080725
3,1.616157e+06,20343.034300,0.000092,0.000067,0.000481,0.000036,0.012587,0.732824,0.075100
4,9.382088e+05,31791.979200,0.000087,0.000071,0.000485,0.000036,0.033886,0.818391,0.073257
...,...,...,...,...,...,...,...,...,...
9995,7.599910e+05,26895.689570,0.000096,0.000076,0.000495,0.000031,0.035389,0.798953,0.062688
9996,1.978893e+06,14908.490850,0.000090,0.000058,0.000486,0.000035,0.007534,0.644370,0.072477
9997,1.751350e+06,28643.064310,0.000090,0.000046,0.000489,0.000039,0.016355,0.503876,0.079677
9998,1.714986e+06,21232.323230,0.000087,0.000048,0.000485,0.000037,0.012380,0.555556,0.075883


In [3]:
# Build the input tensor from the first six columns of the raw table.
# Converting the whole slice in one step removes the hard-coded row count of
# 10000 (which raised IndexError on shorter files and silently dropped rows on
# longer ones) and the slow per-row .iloc loop.
# The explicit dtype keeps the result in torch's default floating-point dtype,
# exactly as torch.tensor() did for the former list of Python floats.
X = torch.tensor(x.iloc[:, :6].to_numpy(dtype=float),
                 dtype=torch.get_default_dtype())
X.shape

torch.Size([10000, 6])

In [4]:
# Column-wise extrema over all rows of X; they are passed to the scaling
# helper below and stored alongside the dataset.
X_max = X.max(dim=0).values
X_min = X.min(dim=0).values

In [5]:
# Display the per-column maxima of X (one value per input column).
X_max

tensor([2.0500e+06, 3.2000e+04, 1.0000e-04, 8.0000e-05, 5.0200e-04, 4.0000e-05])

In [6]:
# Display the per-column minima of X (one value per input column).
X_min

tensor([2.5000e+05, 6.0000e+03, 8.0000e-05, 4.0000e-05, 4.8000e-04, 3.0000e-05])

In [7]:
# Scale the inputs with the project helper, passing the per-column min and max.
# NOTE(review): presumably min-max scaling to [0, 1] (the range check in the
# next cell prints max 1 and min 0) — confirm against config.Normalization.
Xn = config.Normalization(X, X_min, X_max)
Xn

tensor([[0.1804, 0.2517, 0.9000, 0.9000, 0.6364, 0.5000],
        [0.8922, 0.0694, 0.0000, 0.7250, 0.5455, 0.5000],
        [0.4477, 0.2826, 0.7000, 0.0500, 0.5000, 1.0000],
        ...,
        [0.8341, 0.8709, 0.5000, 0.1500, 0.4091, 0.9000],
        [0.8139, 0.5859, 0.3500, 0.2000, 0.2273, 0.7000],
        [0.9377, 0.3681, 0.3500, 0.2250, 0.0909, 0.5000]])

In [8]:
# Range check: global maximum and minimum over all scaled input values.
Xn.max(), Xn.min()

(tensor(1.), tensor(0.))

In [9]:
# Load the raw target table. header=None makes pandas treat the first line of
# the file as data. Displaying the shape confirms the row and column counts.
y = pandas.read_csv("./data/act_eta.csv", header=None)
y.shape

(10000, 5)

In [10]:
# Build the target tensor from every column of the raw target table.
# Converting the whole frame in one step removes the hard-coded row count of
# 10000 (which raised IndexError on shorter files and silently dropped rows on
# longer ones) and the slow per-row .iloc loop.
# The explicit dtype keeps the result in torch's default floating-point dtype,
# exactly as torch.tensor() did for the former list of Python floats.
Y = torch.tensor(y.to_numpy(dtype=float),
                 dtype=torch.get_default_dtype())
Y.shape

torch.Size([10000, 5])

In [11]:
# Row indices of the samples whose last Y column is below 0.1; only these
# rows are kept in the following cells.
# NOTE(review): the meaning of the last column and of the 0.1 threshold is not
# visible here — consider naming the threshold as a constant.
useful = torch.nonzero(Y[:, -1] < 0.1, as_tuple=True)[0]
useful

tensor([   2,    6,    8,  ..., 9991, 9995, 9997])

In [12]:
# Keep only the selected rows in both the scaled and the raw inputs.
# NOTE(review): this overwrites Xn and X, so the cell is not idempotent —
# re-running it on its own would index the already filtered tensors.
Xn, X = Xn[useful], X[useful]
Xn.shape, X.shape

(torch.Size([3777, 6]), torch.Size([3777, 6]))

In [13]:
# Keep the selected rows of Y and drop its last column, which served as the
# filter criterion when `useful` was computed.
# NOTE(review): this overwrites Y, so the cell is not idempotent — re-running
# it on its own would index the already filtered tensor and drop another column.
Y = Y[useful,:-1]
Y.shape

torch.Size([3777, 4])

In [14]:
# Column-wise extrema of the filtered targets; they are passed to the scaling
# helper below and stored alongside the dataset.
Y_max = Y.max(dim=0).values
Y_min = Y.min(dim=0).values

In [15]:
# Scale the filtered targets with the project helper, passing their own
# per-column min and max.
# NOTE(review): presumably min-max scaling to [0, 1] (the range check in the
# next cell prints max 1 and min 0) — confirm against config.Normalization.
Yn = config.Normalization(Y, Y_min, Y_max)

In [16]:
# Range check: global maximum and minimum over all scaled target values.
Yn.max(), Yn.min()

(tensor(1.), tensor(0.))

# Make dataset

In [17]:
# Split sizes: 70 % training, 20 % validation, the remainder is the test set.
n_samples = X.shape[0]
E_train = int(n_samples * 0.7)
E_valid = int(n_samples * 0.2)
E_test = n_samples - E_train - E_valid

# Seed through the project helper, then shuffle the row indices once.
config.SetSeed()
index = torch.randperm(n_samples)

# Carve the permutation into consecutive, non-overlapping slices.
# "learn" is the union of the training and validation rows.
n_learn = E_train + E_valid
index_train = index[:E_train]
index_valid = index[E_train:n_learn]
index_learn = index[:n_learn]
index_test = index[n_learn:]


def _subset(rows):
    """Return the rows `rows` of X, Y, Xn and Yn, in that order."""
    return X[rows], Y[rows], Xn[rows], Yn[rows]


X_learn, Y_learn, Xn_learn, Yn_learn = _subset(index_learn)
X_train, Y_train, Xn_train, Yn_train = _subset(index_train)
X_valid, Y_valid, Xn_valid, Yn_valid = _subset(index_valid)
X_test, Y_test, Xn_test, Yn_test = _subset(index_test)

In [18]:
# Bundle the full tensors, every split (scaled and raw) and the scaling
# bounds into one dictionary. Key order is kept as is so the pickled output
# does not change.
a = {
    # full (filtered) data
    'Xn': Xn,
    'Yn': Yn,
    'X': X,
    'Y': Y,
    # scaled splits
    'Xn_learn': Xn_learn,
    'Yn_learn': Yn_learn,
    'Xn_train': Xn_train,
    'Yn_train': Yn_train,
    'Xn_valid': Xn_valid,
    'Yn_valid': Yn_valid,
    'Xn_test': Xn_test,
    'Yn_test': Yn_test,
    # raw splits
    'X_learn': X_learn,
    'Y_learn': Y_learn,
    'X_train': X_train,
    'Y_train': Y_train,
    'X_valid': X_valid,
    'Y_valid': Y_valid,
    'X_test': X_test,
    'Y_test': Y_test,
    # scaling bounds
    'X_max': X_max,
    'X_min': X_min,
    'Y_max': Y_max,
    'Y_min': Y_min,
}

# Write the same bundle to the local data folder and to the LNC dataset folder.
# Both target directories must already exist; open() does not create them.
for path in ('./data/act_dataset_without_ratio.p',
             '../../LNC/dataset/act_dataset_without_ratio.p'):
    with open(path, 'wb') as file:
        pickle.dump(a, file)