# **Prepare the data and store it in a .pickle file**

# **Prepare the data and store it in a .pickle file**

In [1]:
from sklearn.model_selection import KFold
import pickle
import pandas as pd
import os.path
import numpy as np
# Folder that holds the raw measurement files.
path_data_folder = "5.16.23_Pressure/"
# Label -> file path. The keys '1.0' .. '5.0' are the labels (they are turned
# into floats by load_data); each value is the file "Pressure_<n>_atm" inside
# path_data_folder.
path_dictionary = {
    f"{level}.0": f"{path_data_folder}Pressure_{level}_atm"
    for level in range(1, 6)
}
# File format notes:
#   * the separator inside each file is a tab
#   * every example in one file shares a single label (the dictionary key)
#   * pandas indexes a DataFrame as frame[column][row]

In [2]:
# Each input file is a tab-separated table of numbers without a header row.
# load_data treats every line (row) of a file as one example and every column
# as one entry of that example.
# NOTE(review): an earlier comment here described the data as column vectors
# (one example per column). Confirm the orientation of the files; if examples
# really are stored as columns, the table has to be transposed after reading.
def load_data(filename_dictionary):
    """Read every file in ``filename_dictionary`` and stack the rows.

    Parameters
    ----------
    filename_dictionary : dict
        Maps a label (anything ``float()`` accepts, e.g. ``'1.0'``) to the
        path of a tab-separated, header-less file. Every row read from that
        file is given the label as its target value.

    Returns
    -------
    X_data : numpy.ndarray
        All rows of all files, stacked in dictionary order.
    y_data : numpy.ndarray
        float64 array of shape ``(number_of_rows, 1)``; row ``k`` holds the
        label of row ``k`` of ``X_data``.

    Raises
    ------
    ValueError
        If ``filename_dictionary`` is empty, or a key cannot be converted
        to ``float``.
    """
    if not filename_dictionary:
        # pd.concat would fail with the less helpful "No objects to concatenate"
        raise ValueError("filename_dictionary is empty: no files to load")

    X_parts = []  # one DataFrame per file
    y_parts = []  # one (rows, 1) label column per file
    for label, filepath in filename_dictionary.items():
        print(f"reading file:        {filepath}")
        X_in_this_file = pd.read_csv(filepath, sep="\t", header=None)
        value = float(label)
        print(f"\tpressure value: {value}")
        number_of_examples = X_in_this_file.shape[0]
        X_parts.append(X_in_this_file)
        # constant label column, built directly in its final 2-D shape
        y_parts.append(np.full((number_of_examples, 1), value, dtype=float))

    X_data = pd.concat(X_parts, axis=0, ignore_index=True)
    return np.asarray(X_data), np.concatenate(y_parts, axis=0)

In [3]:
from sklearn.model_selection import KFold
import pickle
import pandas as pd
# Naming used from here on: X (the examples) -> spectrum, y (the labels) -> pressure.
spectrum_raw, pressure_raw = load_data(path_dictionary)

# Blank separator line, then a short summary of what was loaded.
print()
summary_lines = (
    f"total number of examples:     {spectrum_raw.shape[0]}",
    f"length of each example:       {spectrum_raw.shape[1]}",
    f"shape of X data (spectrum): {spectrum_raw.shape}, type: {spectrum_raw[0][0].dtype}",
    f"shape of y data (pressure): {pressure_raw.shape}, type: {pressure_raw[0].dtype}",
)
for summary_line in summary_lines:
    print(summary_line)
                                            

reading file:        5.16.23_Pressure/Pressure_1_atm
	pressure value: 1.0
reading file:        5.16.23_Pressure/Pressure_2_atm
	pressure value: 2.0
reading file:        5.16.23_Pressure/Pressure_3_atm
	pressure value: 3.0
reading file:        5.16.23_Pressure/Pressure_4_atm
	pressure value: 4.0
reading file:        5.16.23_Pressure/Pressure_5_atm
	pressure value: 5.0

total number of examples:     5000
length of each example:       10000
shape of X data (spectrum): (5000, 10000), type: float64
shape of y data (pressure): (5000, 1), type: float64


In [4]:
# Shuffle the examples and save the shuffled data set.
#
# The seed is fixed so that the shuffle -- and therefore the pickle written
# below -- is the same on every "Restart & Run All". Change the value
# deliberately if a different ordering is wanted.
seed = 230516

print(f"before shuffle: {pressure_raw[:,0]}")

# Both arrays must describe the same examples, row for row.
assert spectrum_raw.shape[0] == pressure_raw.shape[0], \
    "spectrum_raw and pressure_raw have different numbers of rows"

# One permutation applied to both arrays keeps every spectrum paired with its
# own label. A local generator is used so the global NumPy random state is
# left untouched. Indexing with the permutation makes copies: the *_raw
# arrays stay in file order, and re-running this cell gives the same result
# (at the cost of holding a second copy of the spectra in memory).
shuffle_order = np.random.default_rng(seed).permutation(pressure_raw.shape[0])
spectrum = spectrum_raw[shuffle_order]
pressure = pressure_raw[shuffle_order]
print(f"after shuffle: {pressure[:,0]}")

# Persist [spectrum, pressure] as a two-element list in one pickle file.
data_file_name = 'data_pres230516'
with open(data_file_name + '.pickle', 'wb') as handle:
    pickle.dump([spectrum, pressure], handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"saved data in {data_file_name}.pickle file")

before shuffle: [1. 1. 1. ... 5. 5. 5.]
after shuffle: [5. 3. 3. ... 1. 5. 2.]
saved data in data_pres230516.pickle file


In [5]:
# get & save indices
import os.path
# NOTE(review): the file name says "res=5", but number_resamples below is 2
# (2 resamples x 5 folds). The name is left unchanged in case other code
# loads the pickle by this name -- confirm and rename if appropriate.
file_name = 'cv_res=5_fold=5_pres230516'
number_resamples = 2
n_splits = 5
eg_ct = pressure.shape[0]  # total number of examples

# Repeated K-fold: every resample reshuffles with its own random_state, so
# the final lists hold number_resamples * n_splits train/test index arrays.
train_indices = []
test_indices = []
for resample in range(number_resamples):
    kf = KFold(n_splits=n_splits, random_state=resample, shuffle=True)
    # only the number of samples matters to KFold, hence the plain range
    for train_index, test_index in kf.split(range(eg_ct)):
        train_indices.append(train_index)
        test_indices.append(test_index)
print(f"got indices by KFold method, fold = {n_splits}, resample = {number_resamples}")

# Saved as a two-element list: [train_indices, test_indices].
with open(file_name+'.pickle', 'wb') as handle:
    pickle.dump([train_indices,test_indices], handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"saved indices in {file_name}.pickle file")
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of testing indices: {len(test_indices)}")
print(f"number of testing indices per set: {len(test_indices[0])}")

got indices by KFold method, fold = 5, resample = 2
saved indices in cv_res=5_fold=5_pres230516.pickle file

sets of training indices: 10
number of training indices per set: 4000
sets of testing indices: 10
number of testing indices per set: 1000


In [6]:
# list comprehension example used!
# Shows how a nested list of positions picks items out of a flat list,
# either keeping the grouping of the positions or flattening it away.
raw = [1, 2, 3, 4, 5, 6]
indices = [[1, 2], [3, 4], [0, 5]]
# Not used by the comprehensions below: their loop variables are local to
# the comprehension, so this name keeps pointing at the first group.
row = indices[0]
# grouped form: one output list per group of positions
result_1 = [[raw[position] for position in group] for group in indices]
# flat form: the same picks in the same order, without the grouping
result = [raw[position] for group in indices for position in group]
print(result)
print(result_1)

[2, 3, 4, 5, 1, 6]
[[2, 3], [4, 5], [1, 6]]


In [7]:
from sklearn.model_selection import train_test_split
test_ratio = 0.2          # fraction of all examples held out as the test set
fold = 8                  # K of the K-fold split of the remaining examples
number_of_resamples = 2   # how many times the K-fold split is repeated
# file_name = f"indices_fold={fold}_res={number_of_resamples}_test={test_ratio}"
file_name = "indices_pres230516"
eg_ct = pressure.shape[0]
dummy_indices = np.arange(eg_ct)  # stand-in for the examples: 0 .. eg_ct-1

# split into train_and_validate & test set
train_and_validate_set, test_indices = train_test_split(dummy_indices,
                                                    test_size=test_ratio,
                                                    random_state=1)

# split train_and_validate set into cross-validation sets
# KFold yields positions *within* train_and_validate_set; indexing the set
# with them converts the positions back into indices of the full data set.
# .tolist() stores plain Python ints, the same form used for test_indices.
train_indices = []
validate_indices = []
for resample in range(number_of_resamples):
    kf = KFold(n_splits=fold, random_state=resample, shuffle=True)
    for train_positions, validate_positions in kf.split(train_and_validate_set):
        train_indices.append(train_and_validate_set[train_positions].tolist())
        validate_indices.append(train_and_validate_set[validate_positions].tolist())
print(f"got indices by KFold method, fold = {fold}, resample = {number_of_resamples}")
test_indices = test_indices.tolist()

# Saved as a three-element list: [train_indices, validate_indices, test_indices].
with open(file_name+'.pickle', 'wb') as handle:
    pickle.dump([train_indices, validate_indices, test_indices], handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"saved indices in {file_name}.pickle file")
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of validating indices: {len(validate_indices)}")
print(f"number of validating indices per set: {len(validate_indices[0])}")
print(f"number of testing indices: {len(test_indices)}")

got indices by KFold method, fold = 8, resample = 2
saved indices in indices_pres230516.pickle file

sets of training indices: 16
number of training indices per set: 3500
sets of validating indices: 16
number of validating indices per set: 500
number of testing indices: 1000


In [8]:
# Read-back check: load the index pickle named by file_name and summarise
# the fold at position `index`.
index = 1
with open(file_name+'.pickle', 'rb') as handle:
    loaded_index_sets = pickle.load(handle)
# the pickle holds three entries: train folds, validate folds, test indices
(train_indices_test_read,
 validate_indices_test_read,
 test_indices_test_read) = loaded_index_sets
print(f"got indices from {file_name}.pickle")

report_lines = (
    f"train_indices row {index}: {train_indices_test_read[index]}",
    f"\tnumber of folds: {len(train_indices_test_read)}",
    f"\tnumber of indices per fold: {len(train_indices_test_read[index])}",
    f"validate_index row {index}: {validate_indices_test_read[index]}",
    f"\tnumber of folds: {len(validate_indices_test_read)}",
    f"\tnumber of indices per fold: {len(validate_indices_test_read[index])}",
    f"number of test indices: {len(test_indices_test_read)}",
)
for report_line in report_lines:
    print(report_line)

got indices from indices_pres230516.pickle
train_indices row 1: [1233, 1056, 1686, 187, 1272, 453, 1247, 3039, 4634, 2605, 840, 128, 1123, 2451, 1330, 344, 4549, 2217, 1159, 4127, 236, 1379, 2743, 4017, 1739, 2588, 1851, 607, 846, 4952, 4195, 4256, 563, 418, 4670, 1435, 2630, 4332, 1078, 3679, 518, 3463, 1020, 4305, 3871, 4654, 4304, 2054, 2680, 3269, 4185, 1225, 3839, 4301, 2537, 2606, 4099, 4507, 946, 1707, 4528, 4605, 616, 1718, 955, 194, 909, 3826, 2665, 4567, 1549, 3461, 368, 1625, 4214, 2748, 1545, 1615, 2115, 3472, 216, 4723, 4296, 1498, 3359, 1055, 2234, 4531, 4365, 3516, 2903, 2430, 3650, 3339, 2874, 4672, 2183, 2593, 4113, 3096, 769, 2873, 4147, 3064, 1457, 234, 2692, 1, 1963, 4539, 4989, 4119, 210, 671, 4871, 3660, 438, 2905, 1424, 4156, 880, 4134, 1714, 1008, 2881, 4753, 4668, 1821, 3079, 385, 2038, 1850, 3978, 2523, 3602, 3059, 3888, 3308, 3132, 3003, 1992, 396, 3327, 1495, 1571, 4925, 3417, 1717, 2165, 1230, 2947, 613, 290, 1438, 4702, 3106, 3343, 4684, 4387, 650, 4150, 2