In [11]:
import numpy as np

# Load the previously saved sequences, labels, and dates
X = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_sequences.npy')
y = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_labels.npy')
dates = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/dates.npy', allow_pickle=True)

# Ensure the data is sorted by date before splitting
sorted_indices = np.argsort(dates)
X = X[sorted_indices]
y = y[sorted_indices]
dates = dates[sorted_indices]

# Split the data chronologically: first 70% for training, next 15% for validation, last 15% for testing
train_size = int(0.7 * len(dates))
val_size = int(0.15 * len(dates))
test_size = len(dates) - train_size - val_size

# Split the dates
train_dates = dates[:train_size]
val_dates = dates[train_size:train_size + val_size]
test_dates = dates[train_size + val_size:]

# Split the data
X_train = X[:train_size]
y_train = y[:train_size]

X_val = X[train_size:train_size + val_size]
y_val = y[train_size:train_size + val_size]

X_test = X[train_size + val_size:]
y_test = y[train_size + val_size:]

# Check and remove overlaps
if np.intersect1d(train_dates, val_dates).size > 0:
    overlap_start = np.where(val_dates > train_dates[-1])[0][0]
    X_val = X_val[overlap_start:]
    y_val = y_val[overlap_start:]
    val_dates = val_dates[overlap_start:]

if np.intersect1d(val_dates, test_dates).size > 0:
    overlap_start = np.where(test_dates > val_dates[-1])[0][0]
    X_test = X_test[overlap_start:]
    y_test = y_test[overlap_start:]
    test_dates = test_dates[overlap_start:]

# Print shapes to verify
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

# Check for overlap in dates again
overlap_train_val = np.intersect1d(train_dates, val_dates)
overlap_val_test = np.intersect1d(val_dates, test_dates)
overlap_train_test = np.intersect1d(train_dates, test_dates)

print("Overlap between training and validation dates:", len(overlap_train_val))
print("Overlap between validation and testing dates:", len(overlap_val_test))
print("Overlap between training and testing dates:", len(overlap_train_test))

# Save the split datasets
np.save('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_train.npy', X_train)
np.save('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_val.npy', X_val)
np.save('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_test.npy', X_test)
np.save('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_train.npy', y_train)
np.save('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_val.npy', y_val)
np.save('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_test.npy', y_test)

# Verify saved files
print("Saved X_train, X_val, X_test, y_train, y_val, y_test to the specified directory.")

Shape of X_train: (32227, 28, 4)
Shape of X_val: (6901, 28, 4)
Shape of X_test: (6904, 28, 4)
Shape of y_train: (32227,)
Shape of y_val: (6901,)
Shape of y_test: (6904,)
Overlap between training and validation dates: 0
Overlap between validation and testing dates: 0
Overlap between training and testing dates: 0
Saved X_train, X_val, X_test, y_train, y_val, y_test to the specified directory.


In [13]:
# This is the final check before moving to model coding
import numpy as np

# Load the datasets
X_train = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_train.npy')
X_val = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_val.npy')
X_test = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/X_test.npy')
y_train = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_train.npy')
y_val = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_val.npy')
y_test = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/y_test.npy')
dates = np.load('/Users/hanshookoomsing/Documents/undergraduate_project/LSTM_TensorFlow/dates.npy', allow_pickle=True)

# Ensure the data is sorted by date before splitting
sorted_indices = np.argsort(dates)
dates = dates[sorted_indices]

# Verify shapes
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of y_test:", y_test.shape)

# Check for missing values in features
missing_X_train = np.isnan(X_train).sum()
missing_X_val = np.isnan(X_val).sum()
missing_X_test = np.isnan(X_test).sum()

print("Missing values in X_train:", missing_X_train)
print("Missing values in X_val:", missing_X_val)
print("Missing values in X_test:", missing_X_test)

# Check for missing values in labels
missing_y_train = np.isnan(y_train).sum()
missing_y_val = np.isnan(y_val).sum()
missing_y_test = np.isnan(y_test).sum()

print("Missing values in y_train:", missing_y_train)
print("Missing values in y_val:", missing_y_val)
print("Missing values in y_test:", missing_y_test)

# Check for overlap in dates
train_dates = dates[:len(X_train)]
val_dates = dates[len(X_train):len(X_train) + len(X_val)]
test_dates = dates[len(X_train) + len(X_val):]

overlap_train_val = np.intersect1d(train_dates, val_dates)
overlap_val_test = np.intersect1d(val_dates, test_dates)
overlap_train_test = np.intersect1d(train_dates, test_dates)

print("Overlap between training and validation dates:", len(overlap_train_val))
print("Overlap between validation and testing dates:", len(overlap_val_test))
print("Overlap between training and testing dates:", len(overlap_train_test))

# Verify data types
print("Data type of X_train:", X_train.dtype)
print("Data type of X_val:", X_val.dtype)
print("Data type of X_test:", X_test.dtype)
print("Data type of y_train:", y_train.dtype)
print("Data type of y_val:", y_val.dtype)
print("Data type of y_test:", y_test.dtype)

# Print summary of date ranges for each dataset
print("Training date range:", train_dates.min(), "to", train_dates.max())
print("Validation date range:", val_dates.min(), "to", val_dates.max())
print("Testing date range:", test_dates.min(), "to", test_dates.max())

Shape of X_train: (32227, 28, 4)
Shape of X_val: (6901, 28, 4)
Shape of X_test: (6904, 28, 4)
Shape of y_train: (32227,)
Shape of y_val: (6901,)
Shape of y_test: (6904,)
Missing values in X_train: 0
Missing values in X_val: 0
Missing values in X_test: 0
Missing values in y_train: 0
Missing values in y_val: 0
Missing values in y_test: 0
Overlap between training and validation dates: 1
Overlap between validation and testing dates: 0
Overlap between training and testing dates: 0
Data type of X_train: float64
Data type of X_val: float64
Data type of X_test: float64
Data type of y_train: float64
Data type of y_val: float64
Data type of y_test: float64
Training date range: 2001-02-14 00:00:00 to 2017-02-21 00:00:00
Validation date range: 2017-02-21 00:00:00 to 2020-07-24 00:00:00
Testing date range: 2020-07-27 00:00:00 to 2023-12-29 00:00:00
