# **prepares data, store in .pickle file**

# **prepares data, store in .pickle file**

In [9]:
from sklearn.model_selection import KFold
import pickle
import pandas as pd
import os.path
import numpy as np
# path dictionary
path_data_folder = "5.9.23/"
path_dictionary = {
    '20.7': path_data_folder + "twentypointseven",
    '21.0': path_data_folder + "twentyonepointzero_1",
    '21.2': path_data_folder + "twentyonepointtwo_1",
    '21.5': path_data_folder + "twentyonepointfive_1",
    '21.7': path_data_folder + "TwentyonepointseevendegreeC_1",
    '21.8': path_data_folder + "twentypointeight_1"
}
# separator in this file is tab
# label for entire file is the temperature
# frames = pd.read_csv("5.9.23/twentypointseven", sep="\t", header=None)
# # <- pandas index is [column][row]

In [10]:
# the data are (1000 * 1) column vectors.
# in the file, there are 1000 lines, each with n numbers, 
# where n = number of data vectors
def load_data(filename_dictionary):
    X_data = [] # data
    y_data = [] # label
    for filename, filepath in filename_dictionary.items():
        print(f"reading file:        {filepath}")
        X_in_this_file = pd.read_csv(filepath, sep="\t", header=None)
        value = float(filename)
        print(f"\ttemperature value: {value}")
        number_of_examples = X_in_this_file.shape[0]
        y_in_this_file = np.zeros(shape=(number_of_examples)) + value
        y_in_this_file = pd.DataFrame(y_in_this_file)
        # default column setting is NO array, 
        # need to make it array to use list of indices!
        y_in_this_file.columns = np.asarray(range(y_in_this_file.shape[1]))
        X_data.append(X_in_this_file)
        y_data.append(y_in_this_file)
    X_data = pd.concat(X_data, axis=0, ignore_index=True)
    y_data = pd.concat(y_data, axis=0, ignore_index=True)
    return np.asarray(X_data), np.asarray(y_data)

In [11]:
from sklearn.model_selection import KFold
import pickle
import pandas as pd
# change: response (X) -> spectrum, spectra (y) -> temperature

spectrum_raw, temperature_raw = load_data(filename_dictionary=path_dictionary)
print()
print(f"total number of examples:     {spectrum_raw.shape[0]}")
print(f"length of each example:       {spectrum_raw.shape[1]}")
print(f"shape of X data (spectrum): {spectrum_raw.shape}, type: {spectrum_raw[0][0].dtype}")
print(f"shape of y data (temperature): {temperature_raw.shape}, type: {temperature_raw[0].dtype}")
                                            

reading file:        5.9.23/twentypointseven
	temperature value: 20.7
reading file:        5.9.23/twentyonepointzero_1
	temperature value: 21.0
reading file:        5.9.23/twentyonepointtwo_1
	temperature value: 21.2
reading file:        5.9.23/twentyonepointfive_1
	temperature value: 21.5
reading file:        5.9.23/TwentyonepointseevendegreeC_1
	temperature value: 21.7
reading file:        5.9.23/twentypointeight_1
	temperature value: 21.8

total number of examples:     6000
length of each example:       10000
shape of X data (spectrum): (6000, 10000), type: float64
shape of y data (temperature): (6000, 1), type: float64


In [12]:
# saving preprocessed
# shuffle
print(f"before shuffle: {temperature_raw[:,0]}")
seed = np.random.randint(1, 100)
np.random.seed(seed)
np.random.shuffle(spectrum_raw)
np.random.seed(seed)
np.random.shuffle(temperature_raw)
print(f"after shuffle: {temperature_raw[:,0]}")

spectrum = spectrum_raw
temperature = temperature_raw

data_file_name = 'data_temp230509'
with open(data_file_name + '.pickle', 'wb') as handle:
    pickle.dump([spectrum, temperature], handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"saved data in {data_file_name}.pickle file")

before shuffle: [20.7 20.7 20.7 ... 21.8 21.8 21.8]
after shuffle: [20.7 21.2 21.7 ... 21.7 21.2 21.2]
saved data in data_temp230509.pickle file


In [13]:
# get & save indices
import os.path
file_name = 'cv_res=5_fold=5_temp230509'
    # 2x5 resampling
train_indices = []
test_indices = []
number_resamples = 2
n_splits = 5
eg_ct = temperature.shape[0]

for i in range(number_resamples):
    kf = KFold(n_splits=n_splits, random_state=i, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(range(eg_ct))):
        train_indices.append(train_index)
        test_indices.append(test_index)
print(f"got indices by KFold method, fold = {n_splits}, resample = {number_resamples}")
with open(file_name+'.pickle', 'wb') as handle:
    pickle.dump([train_indices,test_indices], handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"saved indices in {file_name}.pickle file")
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of testing indices: {len(test_indices)}")
print(f"number of testing indices per set: {len(test_indices[0])}")

got indices by KFold method, fold = 5, resample = 2
saved indices in cv_res=5_fold=5_temp230509.pickle file

sets of training indices: 10
number of training indices per set: 4800
sets of testing indices: 10
number of testing indices per set: 1200


In [14]:
# list comprehension example used!
raw = [1, 2, 3, 4, 5, 6]
indices = [
    [1, 2],
    [3, 4],
    [0, 5]
]
row = indices[0]
result =[raw[t] for row in indices for t in row]
result_1 = [[raw[t] for t in row] for row in indices]
print(result)
print(result_1)

[2, 3, 4, 5, 1, 6]
[[2, 3], [4, 5], [1, 6]]


In [15]:
from sklearn.model_selection import train_test_split
test_ratio = 0.2
fold = 8
number_of_resamples = 2
# file_name = f"indices_fold={fold}_res={number_of_resamples}_test={test_ratio}"
file_name = "indices_temp230509"
eg_ct = temperature.shape[0]
dummy_indices = np.asarray(range(eg_ct))
# split into train_and_validate & test set
train_and_validate_set, test_indices = train_test_split(dummy_indices,
                                                    test_size=test_ratio,
                                                    random_state=1)
# split train_and_validate set into cross-validation sets
train_indices = []
validate_indices = []
for i in range(number_of_resamples):
    kf = KFold(n_splits=fold, random_state=i, shuffle=True)
    for i, (train_index, validate_index) in enumerate(kf.split(
        range(train_and_validate_set.shape[0]))):
        train_indices.append(train_index)
        validate_indices.append(validate_index)
print(f"got indices by KFold method, fold = {fold}, resample = {number_of_resamples}")
validate_indices = [[train_and_validate_set[i] for i in fold] for fold in validate_indices]
train_indices = [[train_and_validate_set[i] for i in fold] for fold in train_indices]
test_indices = test_indices.tolist()

with open(file_name+'.pickle', 'wb') as handle:
    pickle.dump([train_indices, validate_indices, test_indices], handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"saved indices in {file_name}.pickle file")
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of validating indices: {len(validate_indices)}")
print(f"number of validating indices per set: {len(validate_indices[0])}")
print(f"number of testing indices: {len(test_indices)}")

got indices by KFold method, fold = 8, resample = 2
saved indices in indices_temp230509.pickle file

sets of training indices: 16
number of training indices per set: 4200
sets of validating indices: 16
number of validating indices per set: 600
number of testing indices: 1200


In [16]:
# test read indices
index = 1
with open(file_name+'.pickle', 'rb') as handle:
    train_indices_test_read, validate_indices_test_read, test_indices_test_read = pickle.load(handle)
    print(f"got indices from {file_name}.pickle")  
print(f"train_indices row {index}: {train_indices_test_read[index]}")
print(f"\tnumber of folds: {len(train_indices_test_read)}")
print(f"\tnumber of indices per fold: {len(train_indices_test_read[index])}")
print(f"validate_index row {index}: {validate_indices_test_read[index]}")
print(f"\tnumber of folds: {len(validate_indices_test_read)}")
print(f"\tnumber of indices per fold: {len(validate_indices_test_read[index])}")
print(f"number of test indices: {len(test_indices_test_read)}")

got indices from indices_temp230509.pickle
train_indices row 1: [1352, 4495, 5390, 4945, 2651, 100, 2787, 5242, 2938, 1162, 1564, 1884, 4604, 4874, 210, 3821, 4478, 2478, 5877, 4740, 3819, 1005, 4932, 4925, 3074, 5857, 1463, 372, 5926, 3833, 5378, 3239, 4325, 2045, 2804, 834, 4695, 4880, 2313, 3987, 2680, 5809, 2326, 3219, 205, 1006, 5165, 4744, 3242, 3223, 1299, 951, 5337, 2084, 2873, 4668, 5892, 5609, 3039, 3135, 2068, 57, 3432, 2340, 5170, 3704, 1851, 1681, 5343, 646, 4511, 4521, 1560, 5476, 1528, 4493, 2328, 1326, 2402, 2155, 1891, 2733, 203, 2694, 518, 2868, 2722, 3534, 4651, 3332, 3964, 1077, 1947, 1637, 2851, 2406, 3518, 5855, 2856, 2812, 3753, 3691, 5042, 5633, 2354, 16, 3645, 2011, 761, 4441, 1061, 1470, 5746, 3388, 4052, 190, 2513, 462, 484, 1264, 1125, 2927, 3504, 3655, 5504, 5465, 5738, 5323, 110, 1990, 1576, 1793, 3639, 1172, 4715, 4398, 232, 1248, 4580, 1596, 4539, 5330, 4555, 4627, 933, 1045, 2103, 448, 453, 12, 4770, 1669, 4291, 2382, 5964, 4998, 5425, 1021, 836, 3661, 