# **prepares data, store in .pickle file**

In [17]:
from sklearn.model_selection import KFold
import pickle
import pandas as pd
import os.path
import numpy as np
# path dictionary
path_data_folder = "5.9.23/"
path_dictionary = {
    '20.7': path_data_folder + "twentypointseven",
    '21.0': path_data_folder + "twentyonepointzero_1",
    '21.2': path_data_folder + "twentyonepointtwo_1",
    '21.5': path_data_folder + "twentyonepointfive_1",
    '21.7': path_data_folder + "TwentyonepointseevendegreeC_1",
    '21.8': path_data_folder + "twentypointeight_1"
}
# separator in this file is tab
# label for entire file is the temperature
# frames = pd.read_csv("5.9.23/twentypointseven", sep="\t", header=None)
# # <- pandas index is [column][row]

In [18]:
# the data are (1000 * 1) column vectors.
# in the file, there are 1000 lines, each with n numbers, 
# where n = number of data vectors
def load_data(filename_dictionary):
    X_data = [] # data
    y_data = [] # label
    for filename, filepath in filename_dictionary.items():
        print(f"reading file:        {filepath}")
        X_in_this_file = pd.read_csv(filepath, sep="\t", header=None)
        value = float(filename)
        print(f"\ttemperature value: {value}")
        number_of_examples = X_in_this_file.shape[0]
        y_in_this_file = np.zeros(shape=(number_of_examples)) + value
        y_in_this_file = pd.DataFrame(y_in_this_file)
        # default column setting is NO array, 
        # need to make it array to use list of indices!
        y_in_this_file.columns = np.asarray(range(y_in_this_file.shape[1]))
        X_data.append(X_in_this_file)
        y_data.append(y_in_this_file)
    X_data = pd.concat(X_data, axis=0, ignore_index=True)
    y_data = pd.concat(y_data, axis=0, ignore_index=True)
    return np.asarray(X_data), np.asarray(y_data)

In [19]:
from sklearn.model_selection import KFold
import pickle
import pandas as pd
# change: response (X) -> spectrum, spectra (y) -> temperature

spectrum_raw, temperature_raw = load_data(filename_dictionary=path_dictionary)
print()
print(f"total number of examples:     {spectrum_raw.shape[0]}")
print(f"length of each example:       {spectrum_raw.shape[1]}")
print(f"shape of X data (spectrum): {spectrum_raw.shape}, type: {spectrum_raw[0][0].dtype}")
print(f"shape of y data (temperature): {temperature_raw.shape}, type: {temperature_raw[0].dtype}")
                                            

reading file:        5.9.23/twentypointseven
	temperature value: 20.7
reading file:        5.9.23/twentyonepointzero_1
	temperature value: 21.0
reading file:        5.9.23/twentyonepointtwo_1
	temperature value: 21.2
reading file:        5.9.23/twentyonepointfive_1
	temperature value: 21.5
reading file:        5.9.23/TwentyonepointseevendegreeC_1
	temperature value: 21.7
reading file:        5.9.23/twentypointeight_1
	temperature value: 21.8

total number of examples:     6000
length of each example:       10000
shape of X data (spectrum): (6000, 10000), type: float64
shape of y data (temperature): (6000, 1), type: float64


In [20]:
"""normalization: longer sklearn implementation"""
# more code, but can change type of scaler easily
from sklearn.preprocessing import StandardScaler
import random
# data is an np array
#   a row = a vector
#   a column = a feature
def normalize(data):
    # normalize direction: column (feature)-wise
    scaler = StandardScaler() # change scaler here
    eg_ct = data.shape[0]
    feature_ct = data.shape[1]
    normalized_data = np.zeros(shape=(eg_ct, feature_ct))
    normalized_data = scaler.fit_transform(data)
    # for f in range(feature_ct):
    #     array = data[:,f]
    #     if array.ndim < 2: 
    #         array = np.reshape(array, newshape=(-1, 1))
    #     normalized_data[:,f] = scaler.fit_transform(array).reshape(-1)
    return normalized_data, scaler

index = random.randint(0, spectrum_raw.shape[1] - 1)
print(f"test with {index}th column")
print("normalized mean will not be exactly 0, but very close to it!")
spectrum, spectrum_scaler = normalize(spectrum_raw)
print("SPECTRUM")
print(f"{index}th column: before normalization: \n{spectrum_raw[:,index]}")
print(f"mean = {sum(spectrum_raw[:,index])/spectrum_raw.shape[0]}")
print(f"{index}th column: after normalization: \n{spectrum[:,index]}")
print(f"mean = {sum(spectrum[:,index])/spectrum_raw.shape[0]}")

# temperature, temperature_scaler = normalize(temperature_raw)
temperature = temperature_raw
print("TEMPERATURE")
# print(f"0th column: before normalization: {temperature_raw[:,0]}")
# print(f"mean = {sum(temperature_raw[:,0])/spectrum_raw.shape[0]}")
# print(f"0th column: after normalization: {temperature[:,0]}")
# print(f"mean = {sum(temperature[:,0])/spectrum_raw.shape[0]}")
print(f"0th column: \n{temperature_raw[:,0]}")
print(f"mean = {sum(temperature_raw[:,0])/spectrum_raw.shape[0]}")

test with 6418th column
normalized mean will not be exactly 0, but very close to it!
SPECTRUM
6418th column: before normalization: 
[0.856 0.848 0.856 ... 0.608 0.608 0.6  ]
mean = 0.916909333333365
6418th column: after normalization: 
[-0.63137683 -0.7143036  -0.63137683 ... -3.20210687 -3.20210687
 -3.28503365]
mean = 1.5329219375341079e-15
TEMPERATURE
0th column: 
[20.7 20.7 20.7 ... 21.8 21.8 21.8]
mean = 21.316666666666247


In [21]:
# saving preprocessed
normalized_data_file_name = 'data_normalized'
with open(normalized_data_file_name + '.pickle', 'wb') as handle:
    pickle.dump([spectrum, temperature], handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"saved data in {normalized_data_file_name}.pickle file")
normalized_data_scaler_file_name = 'data_scaler'
with open(normalized_data_scaler_file_name + '.pickle', 'wb') as handle:
    pickle.dump(spectrum_scaler, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"saved scaler in {normalized_data_scaler_file_name}.pickle file")

saved data in data_normalized.pickle file
saved scaler in data_scaler.pickle file


In [22]:
# try reading data
print("verify: reading data")
with open(normalized_data_file_name + '.pickle', 'rb') as handle:
    spectrum_test_read, temperature_test_read = pickle.load(handle)
    print(f"read data from {normalized_data_file_name}.pickle") 
print(f"read spectrum correctly: {np.array_equal(spectrum, spectrum_test_read)}")
print(f"read temperature correctly: {np.array_equal(temperature, temperature_test_read)}")
print()
print("verify: reading data scaler")
with open(normalized_data_scaler_file_name + '.pickle', 'rb') as handle:
    spectrum_test_read_scaler = pickle.load(handle)
    print(f"read scaler from {normalized_data_file_name}.pickle") 
print("recover spectrum:")
print(f"{index}th column: after normalization: {spectrum_test_read[:,index]}")
recover_spectrum_eg = spectrum_test_read_scaler.inverse_transform(spectrum_test_read)[:,index]
print(f"{index}th column: recovered: {recover_spectrum_eg.reshape(-1)}")
print(f"{index}th column: original: {spectrum_raw[:,index]}")
print("recover temperature:")
# print(f"0th column: after normalization: {temperature_test_read[:,0]}")
# recover_temperature_eg = temperature_test_read_scaler.inverse_transform(temperature_test_read[:,0].reshape(-1, 1))
# print(f"0th column: recovered: {recover_temperature_eg.reshape(-1)}")
print(f"0th column: original: {temperature_raw[:,0]}")


verify: reading data
read data from data_normalized.pickle
read spectrum correctly: True
read temperature correctly: True

verify: reading data scaler
read scaler from data_normalized.pickle
recover spectrum:
6418th column: after normalization: [-0.63137683 -0.7143036  -0.63137683 ... -3.20210687 -3.20210687
 -3.28503365]
6418th column: recovered: [0.856 0.848 0.856 ... 0.608 0.608 0.6  ]
6418th column: original: [0.856 0.848 0.856 ... 0.608 0.608 0.6  ]
recover temperature:
0th column: original: [20.7 20.7 20.7 ... 21.8 21.8 21.8]


In [23]:
# shorter pandas implementation, also work (very small difference in value)
# less code, but more changes needed to change normalization method
# spectrum = spectrum_raw.sub(spectrum_raw.mean(axis=1), axis=0).div(spectrum_raw.std(axis=1), axis=0)
# print(f"0th row: before normalization: {spectrum_raw.iloc[0]}")
# print(f"mean = {sum(spectrum_raw.iloc[0])/spectrum_raw.shape[1]}")
# print(f"0th row: after normalization: {spectrum.iloc[0]}")
# print(f"mean = {sum(spectrum.iloc[0])/spectrum_raw.shape[1]}")
# temperature = temperature_raw.sub(temperature_raw.mean(axis=1), axis=0).div(temperature_raw.std(axis=1), axis=0)
# print(f"0th row: before normalization: {temperature_raw.iloc[0]}")
# print(f"mean = {sum(temperature_raw.iloc[0])/spectrum_raw.shape[1]}")
# print(f"0th row: after normalization: {temperature.iloc[0]}")
# print(f"mean = {sum(temperature.iloc[0])/spectrum_raw.shape[1]}")

In [24]:
# get & save indices
import os.path
file_name = 'cross_validation_resample=2_fold=5'
    # 2x5 resampling
train_indices = []
test_indices = []
number_resamples = 2
n_splits =5
eg_ct = temperature.shape[0]

for i in range(number_resamples):
    kf = KFold(n_splits=n_splits, random_state=i, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(range(eg_ct))):
        train_indices.append(train_index)
        test_indices.append(test_index)
print(f"got indices by KFold method, fold = {n_splits}, resample = {number_resamples}")
with open(file_name+'.pickle', 'wb') as handle:
    pickle.dump([train_indices,test_indices], handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"saved indices in {file_name}.pickle file")
print()
print(f"sets of training indices: {len(train_indices)}")
print(f"number of training indices per set: {len(train_indices[0])}")
print(f"sets of testing indices: {len(test_indices)}")
print(f"number of testing indices per set: {len(test_indices[0])}")

got indices by KFold method, fold = 5, resample = 2
saved indices in cross_validation_resample=2_fold=5.pickle file

sets of training indices: 10
number of training indices per set: 4800
sets of testing indices: 10
number of testing indices per set: 1200


In [25]:
# test read indices
index = 1
with open(file_name+'.pickle', 'rb') as handle:
    train_indices_test_read , test_indices_test_read = pickle.load(handle)
    print(f"got indices from {file_name}.pickle")  
print(f"train_indices row {index}: {train_indices_test_read[index]}")
print(f"\tnumber of folds: {len(train_indices_test_read)}")
print(f"\tnumber of indices per fold: {len(train_indices_test_read[index])}")
print(f"test_index row {index}: {test_indices_test_read[index]}")
print(f"\tnumber of folds: {len(test_indices_test_read)}")
print(f"\tnumber of indices per fold: {len(test_indices_test_read[index])}")

got indices from cross_validation_resample=2_fold=5.pickle
train_indices row 1: [   0    1    2 ... 5997 5998 5999]
	number of folds: 10
	number of indices per fold: 4800
test_index row 1: [   9   18   22 ... 5985 5992 5996]
	number of folds: 10
	number of indices per fold: 1200
