In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import GPT2Tokenizer, GPT2Model
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.optimizers import Adam

# Assuming your DataFrame is named 'df'
df = pd.read_csv('/kaggle/input/dataset/combined_DJIA_NEWS.csv')
# Drop unnecessary columns for training
df.drop(['Unnamed: 0', 'Date'], axis=1, inplace=True)

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Load pre-trained GPT-2 tokenizer and model from Hugging Face Model Hub
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

import numpy as np

class GPT2EmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embeddings = []
        for row in X.values:
            # Convert each element to string
            row = [str(element) for element in row]

            # Combine headlines into a single string
            combined_headlines = " ".join(row)

            # Tokenize and get embeddings
            inputs = tokenizer(combined_headlines, return_tensors="pt", truncation=True)
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
            embeddings.append(embedding)

        return np.array(embeddings)



In [8]:
# Separate features and labels
X_train = train_df.drop('Label', axis=1)
y_train = train_df['Label']
X_test = test_df.drop('Label', axis=1)
y_test = test_df['Label']

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('text', GPT2EmbeddingTransformer(), [f'Top{i}' for i in range(1, 26)]),
        ('num', StandardScaler(), ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'])
    ]
)

# Transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

**LSTM**

In [12]:
# Reshape X_train_transformed to have a third dimension
X_train_transformed = X_train_transformed.reshape(X_train_transformed.shape[0], 1, X_train_transformed.shape[1])

# Print the shape of transformed data for debugging
print("Shape of X_train_transformed:", X_train_transformed.shape)

# Define the LSTM model
model = Sequential()
# Adjust input shape based on the actual shape of X_train_transformed
input_shape = (X_train_transformed.shape[1], X_train_transformed.shape[2])
print("LSTM Input Shape:", input_shape)
model.add(LSTM(100, input_shape=input_shape))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_transformed, y_train, epochs=10, batch_size=32, validation_split=0.1)


Shape of X_train_transformed: (1590, 1, 774)
LSTM Input Shape: (1, 774)
Epoch 1/10

I0000 00:00:1707324363.368577      87 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f8a66072290>

In [13]:
# Evaluate the model on the test set
X_test_transformed_reshaped = X_test_transformed.reshape(X_test_transformed.shape[0], 1, X_test_transformed.shape[1])
accuracy = model.evaluate(X_test_transformed_reshaped, y_test)[1]

# Print accuracy
print(f"Accuracy on the test set: {accuracy:.2%}")


Accuracy on the test set: 56.53%


**Simple Neural Network**

In [15]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

# Define the model
model = Sequential()

# Flatten the input (assuming it is a 2D input)
model.add(Flatten(input_shape=(X_train_transformed.shape[1], X_train_transformed.shape[2])))

# Add Dense layers
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_transformed, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
accuracy = model.evaluate(X_test_transformed, y_test)[1]

# Print accuracy
print(f"Accuracy on the test set: {accuracy:.2%}")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on the test set: 56.28%


**Logistic Regression**

In [18]:
# Flatten the time dimension
X_train_flat = X_train_transformed.reshape(X_train_transformed.shape[0], -1)
X_test_flat = X_test_transformed.reshape(X_test_transformed.shape[0], -1)

# Create a Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model
logreg_model.fit(X_train_flat, y_train)

# Make predictions
logreg_predictions = logreg_model.predict(X_test_flat)

# Evaluate the model
accuracy = accuracy_score(y_test, logreg_predictions)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, logreg_predictions))




Accuracy: 0.53
Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.42      0.44       174
           1       0.58      0.62      0.60       224

    accuracy                           0.53       398
   macro avg       0.52      0.52      0.52       398
weighted avg       0.53      0.53      0.53       398



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**SVM**

In [19]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize the SVM classifier
svm_model = SVC(kernel='linear', random_state=42)

# Train the SVM model
svm_model.fit(X_train_transformed.reshape(X_train_transformed.shape[0], -1), y_train)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test_transformed.reshape(X_test_transformed.shape[0], -1))

# Evaluate the SVM model
accuracy_svm = accuracy_score(y_test, svm_predictions)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

# Display classification report
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))


SVM Accuracy: 0.67
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.61       174
           1       0.70      0.74      0.72       224

    accuracy                           0.67       398
   macro avg       0.67      0.66      0.67       398
weighted avg       0.67      0.67      0.67       398

