In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Load your dataset
def load_data(file_path):
    file_path = "/content/drive/MyDrive/stock_trading_data.csv"
    # Assuming the dataset has 'text' and 'label' columns
    data = pd.read_csv(file_path)
    texts = data['Date'].astype(str).tolist()  # Convert 'Date' to strings
    labels = data['Low'].values  # Replace 'Low' with your target column if different
    return texts, labels  # Return the text column as a list of strings


In [11]:
# Normalize the labels if they are in a wide range
def preprocess_data(texts, labels, vocab_size=10000, max_length=100):

    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

    scaler = MinMaxScaler()
    labels_scaled = scaler.fit_transform(labels.reshape(-1, 1)).flatten()  # Normalize between 0 and 1

    # Original return statement:
    # return padded_sequences, labels_scaled, tokenizer, sequences, scaler

    # Modified to return 4 values as expected:
    return padded_sequences, labels_scaled, tokenizer, sequences # Removed 'scaler' from return values

In [12]:
# Load dataset
file_path = "/content/drive/MyDrive/stock_trading_data.csv"  # Replace with your dataset path
texts, labels = load_data(file_path)

# Preprocess the data and store tokenizer
padded_sequences, labels_scaled, tokenizer, sequences = preprocess_data(texts, labels)

# Access the vocabulary via tokenizer.word_index
vocab = tokenizer.word_index

In [13]:
# Split dataset into training and testing
x_train_texts, x_test_texts, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

vocab_size = 10000  # Adjust the vocabulary size
max_length = 100    # Adjust sequence length if needed

# preprocess_data function returns 4 values: padded_sequences, labels_scaled, tokenizer, sequences
x_train, y_train, tokenizer, sequences = preprocess_data(x_train_texts, y_train, vocab_size, max_length)
x_test, y_test, _, _ = preprocess_data(x_test_texts, y_test, vocab_size, max_length)

In [14]:
X = np.array(range(0, 6))  # Creates an array [0, 1, 2, 3, 4, 5]
X = X.reshape(1, 6)  # 1 sample with 6 features
y = np.array([1])  # Single label
y= y.reshape(-1,1)
X, y


(array([[0, 1, 2, 3, 4, 5]]), array([[1]]))

In [15]:
vocab_size = 6  # Consider only the top 4 most frequent words
max_length = 10  # Adjust sequence length if needed

x_train, y_train, tokenizer, sequences = preprocess_data(x_train_texts, y_train, vocab_size, max_length)
x_test, y_test, _, _ = preprocess_data(x_test_texts, y_test, vocab_size, max_length)

# Access the vocabulary
vocab = tokenizer.word_index
print(f"Vocabulary (top {vocab_size}): {vocab}")


Vocabulary (top 6): {'<OOV>': 1, '2019': 2, '2020': 3, '2018': 4, '2017': 5, '06': 6, '03': 7, '11': 8, '12': 9, '10': 10, '2016': 11, '01': 12, '07': 13, '08': 14, '05': 15, '04': 16, '09': 17, '02': 18, '2021': 19, '26': 20, '13': 21, '21': 22, '28': 23, '19': 24, '23': 25, '15': 26, '22': 27, '18': 28, '14': 29, '24': 30, '16': 31, '29': 32, '30': 33, '25': 34, '27': 35, '20': 36, '17': 37, '31': 38}


In [16]:

# Build the RNN model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_length),  # Embedding layer
    SimpleRNN(units=32, activation='tanh', return_sequences=False, name="Simple_RNN"),  # RNN layer
    Dense(units=1, activation='sigmoid', name="Output_Layer")  # Output layer for binary classification
])



In [17]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [18]:
# Assuming 'x_train', 'y_train', 'x_test', 'y_test' are defined as per ipython-input-133-9abc114218f5
model.fit(x_train, y_train)  # Use x_train and y_train instead of sequences and label

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.0021 - loss: 0.6262


<keras.src.callbacks.history.History at 0x7c12ecb46a70>

In [19]:
# Train the model
history = model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)


Epoch 1/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.0013 - loss: 0.5925 - val_accuracy: 0.0000e+00 - val_loss: 0.5908
Epoch 2/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0016 - loss: 0.5852 - val_accuracy: 0.0000e+00 - val_loss: 0.5899
Epoch 3/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0016 - loss: 0.5930 - val_accuracy: 0.0000e+00 - val_loss: 0.5900
Epoch 4/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0016 - loss: 0.5878 - val_accuracy: 0.0000e+00 - val_loss: 0.5898
Epoch 5/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0016 - loss: 0.5846 - val_accuracy: 0.0000e+00 - val_loss: 0.5900


In [20]:

# Evaluate the model
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0014 - loss: 0.6363     
Test Loss: 0.6324719786643982, Test Accuracy: 0.003968254197388887
