In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'stock-market-prediction-and-sentimental-analysis:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F70807%2F7805940%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240330%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240330T143252Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D0cdc03fdd27ac62de74e8cd38da0ce438fc0ad374d10a31a17003f17254fe84dc646471760edbf9cf31ac41884efda1d05c7d8d11b54e6fc86b062441930875f62bbdaed1496842fb8603c0b31c57b6f0e522988e31567a3a41831b5af0edd19cfdbd8e20ae58d89a6a95c7f1b4634fa44e48ff67834a0facf7595dfc05e0b931c36e5d29165dc0da97cc42d22682e95653685052616f2911e2ac07e17fcf24509be13f07313ec291c9d6f13b34f0523873af33a7391ea6458bdaebe5d7915a48e1a6f21b6dbf9bb297d29e5ef57317dab2cdb6d10096f9fb778acd9848a8c10bb47a9f761316843023d7b7d55d7fbd69b01b81a4ce47009c2080f50f6d02d06'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading stock-market-prediction-and-sentimental-analysis, 6474545 bytes compressed
Downloaded and uncompressed: stock-market-prediction-and-sentimental-analysis
Data source import complete.


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

try:
    df = pd.read_csv('/kaggle/input/stock-market-prediction-and-sentimental-analysis/DJIA_table(train).csv')
    reddit_news = pd.read_csv('/kaggle/input/stock-market-prediction-and-sentimental-analysis/RedditNews(train).csv')
except FileNotFoundError as e:
    print("File not found:", e)

try:
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
except KeyError as e:
    print("KeyError:", e)

merged_data = df.drop(columns=['Date']).dropna()

seq_length = 10
batch_size = 32

train_data, val_data = train_test_split(merged_data.to_numpy(), test_size=0.2, shuffle=False)

train_generator = TimeseriesGenerator(train_data, train_data[:, -1], length=seq_length, batch_size=batch_size)
val_generator = TimeseriesGenerator(val_data, val_data[:, -1], length=seq_length, batch_size=batch_size)

lstm_model = Sequential([
    Input(shape=(seq_length, merged_data.shape[1])),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dense(1)
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

history = lstm_model.fit(train_generator, epochs=50, validation_data=val_generator, callbacks=[early_stopping], verbose=1)

lstm_predictions = lstm_model.predict(val_generator)

scaler = MinMaxScaler()
scaler.fit(merged_data)

predicted_close_prices = lstm_predictions[:, -1]
inverse_transformed_predictions = np.hstack((np.zeros((lstm_predictions.shape[0], merged_data.shape[1] - 1)), lstm_predictions))
lstm_predictions = scaler.inverse_transform(inverse_transformed_predictions)

mse = mean_squared_error(df[-len(predicted_close_prices):]['Close'], predicted_close_prices)
print("Model MSE:", mse)

submission_df = pd.DataFrame({
    'Id': range(1, len(predicted_close_prices) + 1),
    'Close': predicted_close_prices
})

submission_df.to_csv("submission.csv", index=False)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model MSE: 83853665.80449462
