In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import GPT2Tokenizer, GPT2Model
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.optimizers import Adam

# Assuming your DataFrame is named 'df'
df = pd.read_csv('/kaggle/input/dataset/combined_DJIA_NEWS.csv')
# Drop unnecessary columns for training
df.drop(['Unnamed: 0', 'Date'], axis=1, inplace=True)

# Create additional features from numerical data
df['Lowest_Low'] = df['Low'].rolling(window=14).min()
df['Highest_High'] = df['High'].rolling(window=14).max()
df['Stochastic_K'] = ((df['Close'] - df['Lowest_Low']) / (df['Highest_High'] - df['Lowest_Low'])) * 100
df['Stochastic_D'] = df['Stochastic_K'].rolling(window=3).mean()
df['Momentum'] = df['Close'] - df['Close'].shift(10)
df['Rate_of_Change'] = (df['Close'] / df['Close'].shift(10)) * 100
df['William_R'] = ((df['Highest_High'] - df['Close']) / (df['Highest_High'] - df['Lowest_Low'])) * -100
df['A/D_Oscillator'] = (df['High'] - df['Close'].shift()) / (df['High'] - df['Low'])
df['Close_MA_5'] = df['Close'].rolling(window=5).mean()
df['Disparity_5'] = ((df['Close'] - df['Close_MA_5']) / df['Close_MA_5']) * 100
df = df.drop(columns=['Lowest_Low', 'Highest_High', 'Close_MA_5'])

2024-02-08 13:35:32.334284: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-08 13:35:32.334390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-08 13:35:32.539271: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Load pre-trained GPT-2 tokenizer and model from Hugging Face Model Hub
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

import numpy as np

class GPT2EmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embeddings = []
        for row in X.values:
            # Convert each element to string
            row = [str(element) for element in row]

            # Combine headlines into a single string
            combined_headlines = " ".join(row)

            # Tokenize and get embeddings
            inputs = tokenizer(combined_headlines, return_tensors="pt", truncation=True)
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
            embeddings.append(embedding)

        return np.array(embeddings)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [3]:
# Define the features and the target variable
text_features = ['Top' + str(i) for i in range(1, 26)]
numerical_features = ['Stochastic_K', 'Stochastic_D', 'Momentum', 'Rate_of_Change', 'William_R', 'A/D_Oscillator', 'Disparity_5']
target_variable = 'Label'

# Define the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('text', GPT2EmbeddingTransformer(), text_features),
        ('num', StandardScaler(), numerical_features)
    ]
)

# Separate features and target variable in training and testing sets
X_train = train_df[text_features + numerical_features]
y_train = train_df[target_variable]
X_test = test_df[text_features + numerical_features]
y_test = test_df[target_variable]

# Transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [4]:
print("Shape of X_train_transformed:", X_train_transformed.shape)


Shape of X_train_transformed: (1590, 775)


**LSTM**

In [5]:
# Reshape the data for LSTM
X_train_transformed = X_train_transformed.reshape((X_train_transformed.shape[0], X_train_transformed.shape[1], 1))
X_test_transformed = X_test_transformed.reshape((X_test_transformed.shape[0], X_test_transformed.shape[1], 1))
model = Sequential()
model.add(LSTM(100, input_shape=(X_train_transformed.shape[1], 1), return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])


# Train the model
model.fit(X_train_transformed, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
predictions = (model.predict(X_test_transformed) > 0.5).astype(int)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

Epoch 1/10


I0000 00:00:1707401516.463329     129 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.44
Classification Report:
              precision    recall  f1-score   support

           0       0.44      1.00      0.61       174
           1       0.00      0.00      0.00       224

    accuracy                           0.44       398
   macro avg       0.22      0.50      0.30       398
weighted avg       0.19      0.44      0.27       398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Save the trained model
from tensorflow.keras.models import load_model
model.save('lstm_model_advance.h5')

In [8]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report

# Assuming your X_train_transformed and X_test_transformed are 3D arrays
# (number of samples, number of timesteps, number of features)
# You can adjust input_shape based on your data

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train_transformed.shape[1], X_train_transformed.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_transformed, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
predictions = (model.predict(X_test_transformed) > 0.5).astype(int)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, predictions))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.44
Classification Report:
              precision    recall  f1-score   support

           0       0.44      1.00      0.61       174
           1       0.00      0.00      0.00       224

    accuracy                           0.44       398
   macro avg       0.22      0.50      0.30       398
weighted avg       0.19      0.44      0.27       398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**LogisticRegression**

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.impute import SimpleImputer
# Flatten the data
X_train_flattened = X_train_transformed.reshape(X_train_transformed.shape[0], -1)
X_test_flattened = X_test_transformed.reshape(X_test_transformed.shape[0], -1)

imputer = SimpleImputer(strategy='mean')
X_train_flattened = imputer.fit_transform(X_train_flattened)
X_test_flattened = imputer.transform(X_test_flattened)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Assuming X_train_flattened and X_test_flattened are your feature matrices
# and y_train is your target variable

# Impute NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_flattened)
X_test_imputed = imputer.transform(X_test_flattened)

# Define the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model
logreg_model.fit(X_train_imputed, y_train)

# Make predictions on training set
logreg_train_predictions = logreg_model.predict(X_train_imputed)

# Make predictions on test set
logreg_test_predictions = logreg_model.predict(X_test_imputed)

# Evaluate the model on training set
train_accuracy = accuracy_score(y_train, logreg_train_predictions)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, logreg_test_predictions)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Display classification report for test set
print("Classification Report (Test Set):")
print(classification_report(y_test, logreg_test_predictions))



Training Accuracy: 0.92
Test Accuracy: 0.87
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       174
           1       0.88      0.88      0.88       224

    accuracy                           0.87       398
   macro avg       0.86      0.87      0.86       398
weighted avg       0.87      0.87      0.87       398



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:

model.save('logistic_model_advance.h5')

**SVM**

In [13]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer



# Impute NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_flattened)
X_test_imputed = imputer.transform(X_test_flattened)

# Define the SVM model
svm_model = SVC(random_state=42)

# Train the model
svm_model.fit(X_train_imputed, y_train)

# Make predictions on training set
svm_train_predictions = svm_model.predict(X_train_imputed)

# Make predictions on test set
svm_test_predictions = svm_model.predict(X_test_imputed)

# Evaluate the model on training set
train_accuracy = accuracy_score(y_train, svm_train_predictions)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, svm_test_predictions)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Display classification report for test set
print("Classification Report (Test Set):")
print(classification_report(y_test, svm_test_predictions))


Training Accuracy: 0.53
Test Accuracy: 0.56
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       174
           1       0.56      1.00      0.72       224

    accuracy                           0.56       398
   macro avg       0.28      0.50      0.36       398
weighted avg       0.32      0.56      0.41       398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Save the trained model
from tensorflow.keras.models import load_model
model.save('supportvector_model_advance.h5')

**svm with window size 25**

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Assuming X_train_flattened and X_test_flattened are your feature matrices
# and y_train is your target variable

# Impute NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_flattened)
X_test_imputed = imputer.transform(X_test_flattened)

# Set the training window size
window_size = 25

# Initialize the SVM model
svm_model = SVC(random_state=42)

# Train the model with a sliding window
for i in range(window_size, len(X_train_imputed)):
    X_train_window = X_train_imputed[i - window_size:i]
    y_train_window = y_train[i - window_size:i]
    svm_model.fit(X_train_window, y_train_window)

# Make predictions on training set
svm_train_predictions = svm_model.predict(X_train_imputed)

# Make predictions on test set
svm_test_predictions = svm_model.predict(X_test_imputed)

# Evaluate the model on training set
train_accuracy = accuracy_score(y_train, svm_train_predictions)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Evaluate the model on test set
test_accuracy = accuracy_score(y_test, svm_test_predictions)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Display classification report for test set
print("Classification Report (Test Set):")
print(classification_report(y_test, svm_test_predictions))


Training Accuracy: 0.47
Test Accuracy: 0.44
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.44      1.00      0.61       174
           1       0.00      0.00      0.00       224

    accuracy                           0.44       398
   macro avg       0.22      0.50      0.30       398
weighted avg       0.19      0.44      0.27       398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**CNN-LSTM**

In [17]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Flatten the data
X_train_flattened = X_train_transformed.reshape(X_train_transformed.shape[0], -1)
X_test_flattened = X_test_transformed.reshape(X_test_transformed.shape[0], -1)

# Impute NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_flattened)
X_test_imputed = imputer.transform(X_test_flattened)

# Reshape back to the original shape
X_train_imputed = X_train_imputed.reshape(X_train_transformed.shape)
X_test_imputed = X_test_imputed.reshape(X_test_transformed.shape)

# CNN-LSTM model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_imputed.shape[1], X_train_imputed.shape[2])))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_imputed, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Make predictions
predictions = (model.predict(X_test_imputed) > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, predictions))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.44
Classification Report:
              precision    recall  f1-score   support

           0       0.44      1.00      0.61       174
           1       0.00      0.00      0.00       224

    accuracy                           0.44       398
   macro avg       0.22      0.50      0.30       398
weighted avg       0.19      0.44      0.27       398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save the trained model
from tensorflow.keras.models import load_model
model.save('lstm_model_advance.h5')

**CNN-GRU**

In [18]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GRU, Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Assuming X_train_transformed, X_test_transformed, y_train, and y_test are your data
# X_train_transformed and X_test_transformed should have shape (samples, timesteps, features)

# Flatten the data
X_train_flattened = X_train_transformed.reshape(X_train_transformed.shape[0], -1)
X_test_flattened = X_test_transformed.reshape(X_test_transformed.shape[0], -1)

# Impute NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_flattened)
X_test_imputed = imputer.transform(X_test_flattened)

# Reshape back to the original shape
X_train_imputed = X_train_imputed.reshape(X_train_transformed.shape)
X_test_imputed = X_test_imputed.reshape(X_test_transformed.shape)

# CNN-GRU model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_imputed.shape[1], X_train_imputed.shape[2])))
model.add(MaxPooling1D(pool_size=2))
model.add(GRU(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_imputed, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Make predictions
predictions = (model.predict(X_test_imputed) > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, predictions))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       174
           1       0.91      0.89      0.90       224

    accuracy                           0.89       398
   macro avg       0.89      0.89      0.89       398
weighted avg       0.89      0.89      0.89       398



In [20]:
# Save the trained model
from tensorflow.keras.models import load_model
model.save('cnnGgru_advance.h5')

**ANN**

In [21]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Assuming X_train_transformed, X_test_transformed, y_train, and y_test are your data
# X_train_transformed and X_test_transformed should have shape (samples, features)

# Flatten the data
X_train_flattened = X_train_transformed.reshape(X_train_transformed.shape[0], -1)
X_test_flattened = X_test_transformed.reshape(X_test_transformed.shape[0], -1)

# Impute NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_flattened)
X_test_imputed = imputer.transform(X_test_flattened)

# Create a simple ANN model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_imputed.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_imputed, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Make predictions
predictions = (model.predict(X_test_imputed) > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, predictions))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.82      0.84       174
           1       0.87      0.90      0.88       224

    accuracy                           0.87       398
   macro avg       0.87      0.86      0.86       398
weighted avg       0.87      0.87      0.87       398



In [23]:
# Save the trained model
from tensorflow.keras.models import load_model
model.save('ANN_advance.h5')