In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Project Structure

In [3]:
import os

base_path = "/content/drive/MyDrive/stock-nlp-prediction"

folders = [
    "data/raw",
    "data/processed",
    "models",
    "outputs"
]

for folder in folders:
    os.makedirs(os.path.join(base_path, folder), exist_ok=True)

print("Project structure created.")

Project structure created.


In [4]:
!ls /content/drive/MyDrive/stock-nlp-prediction

data  models  outputs


In [11]:
!pip install kaggle



In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Download News Dataset

In [6]:
!kaggle datasets download -d aaron7sun/stocknews
!unzip -o stocknews.zip

Dataset URL: https://www.kaggle.com/datasets/aaron7sun/stocknews
License(s): CC-BY-NC-SA-4.0
Downloading stocknews.zip to /content
  0% 0.00/5.82M [00:00<?, ?B/s]
100% 5.82M/5.82M [00:00<00:00, 734MB/s]
Archive:  stocknews.zip
  inflating: Combined_News_DJIA.csv  
  inflating: RedditNews.csv          
  inflating: upload_DJIA_table.csv   


Prepare News Data

In [7]:
import pandas as pd

news_df = pd.read_csv("Combined_News_DJIA.csv")
news_df['Date'] = pd.to_datetime(news_df['Date'])

headline_cols = [col for col in news_df.columns if "Top" in col]

news_df['Combined_Text'] = news_df[headline_cols].astype(str).agg(' '.join, axis=1)

news_df = news_df[['Date', 'Combined_Text']]

news_df.head()

Unnamed: 0,Date,Combined_Text
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-11,b'Why wont America and Nato help us? If they w...
2,2008-08-12,b'Remember that adorable 9-year-old who sang a...
3,2008-08-13,b' U.S. refuses Israel weapons to attack Iran:...
4,2008-08-14,b'All the experts admit that we should legalis...


Download AAPL Price (Matching Dates)

In [8]:
import yfinance as yf

df = yf.download("AAPL", start="2008-08-08", end="2016-07-01")

df.columns = df.columns.get_level_values(0)
df = df.reset_index()

df = df[['Date', 'Close']]

df['Next_Close'] = df['Close'].shift(-1)
df['Label'] = (df['Next_Close'] > df['Close']).astype(int)

df = df.dropna()

df.head()

  df = yf.download("AAPL", start="2008-08-08", end="2016-07-01")
[*********************100%***********************]  1 of 1 completed


Price,Date,Close,Next_Close,Label
0,2008-08-08,5.080226,5.200379,1
1,2008-08-11,5.200379,5.295363,1
2,2008-08-12,5.295363,5.372368,1
3,2008-08-13,5.372368,5.372966,1
4,2008-08-14,5.372966,5.2657,0


Merge

In [9]:
merged_df = df.merge(news_df, on='Date', how='left')
merged_df['Combined_Text'] = merged_df['Combined_Text'].fillna("")

merged_df.head()

Unnamed: 0,Date,Close,Next_Close,Label,Combined_Text
0,2008-08-08,5.080226,5.200379,1,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-11,5.200379,5.295363,1,b'Why wont America and Nato help us? If they w...
2,2008-08-12,5.295363,5.372368,1,b'Remember that adorable 9-year-old who sang a...
3,2008-08-13,5.372368,5.372966,1,b' U.S. refuses Israel weapons to attack Iran:...
4,2008-08-14,5.372966,5.2657,0,b'All the experts admit that we should legalis...


In [10]:
merged_df['Combined_Text'].str.len().describe()

Unnamed: 0,Combined_Text
count,1987.0
mean,2779.183694
std,437.162812
min,1473.0
25%,2485.5
50%,2785.0
75%,3074.0
max,4424.0


Install & Setup NLTK

In [11]:
!pip install nltk



In [17]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Cleaning Function

In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]
    return " ".join(tokens)

Apply Cleaning

In [19]:
merged_df['Clean_Text'] = merged_df['Combined_Text'].apply(clean_text)

In [20]:
from sklearn.model_selection import train_test_split

X = merged_df['Clean_Text']
y = merged_df['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Baseline Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model = LogisticRegression(max_iter=1000)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.48743718592964824
              precision    recall  f1-score   support

           0       0.51      0.17      0.26       206
           1       0.48      0.82      0.61       192

    accuracy                           0.49       398
   macro avg       0.50      0.50      0.43       398
weighted avg       0.50      0.49      0.43       398



2nd model

In [27]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.48743718592964824
[[ 61 145]
 [ 59 133]]
              precision    recall  f1-score   support

           0       0.51      0.30      0.37       206
           1       0.48      0.69      0.57       192

    accuracy                           0.49       398
   macro avg       0.49      0.49      0.47       398
weighted avg       0.49      0.49      0.47       398



Class Distribution of Predictions

In [24]:
pd.Series(y_pred).value_counts(normalize=True)

Unnamed: 0,proportion
1,0.824121
0,0.175879


In [25]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(pd.Series(y_pred).value_counts(normalize=True))

[[ 36 170]
 [ 34 158]]
1    0.824121
0    0.175879
Name: proportion, dtype: float64


In [26]:
y_test.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Label,Unnamed: 1_level_1
0,0.517588
1,0.482412


We compute sentiment using VADER (very popular in finance NLP)

In [28]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [29]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    return analyzer.polarity_scores(text)['compound']

merged_df['Sentiment'] = merged_df['Combined_Text'].apply(get_sentiment)

merged_df[['Sentiment']].head()

Unnamed: 0,Sentiment
0,-0.9979
1,-0.9804
2,-0.9658
3,-0.9809
4,-0.9802


Sentiment Only Baseline

In [34]:
merged_df['Sentiment_3D'] = merged_df['Sentiment'].rolling(window=3).mean()

In [33]:
merged_df = merged_df.dropna()

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_sent = merged_df[['Sentiment_3D']]
y = merged_df['Label']

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sent, y, test_size=0.2, shuffle=False
)

model_s = LogisticRegression()
model_s.fit(X_train_s, y_train_s)

y_pred_s = model_s.predict(X_test_s)

from sklearn.metrics import accuracy_score
print("Sentiment Only Accuracy:", accuracy_score(y_test_s, y_pred_s))

Sentiment Only Accuracy: 0.4836272040302267


Starting fresh for DJIA experiment

In [39]:
news_df = pd.read_csv("Combined_News_DJIA.csv")
news_df['Date'] = pd.to_datetime(news_df['Date'])

headline_cols = [col for col in news_df.columns if "Top" in col]

news_df['Combined_Text'] = news_df[headline_cols].astype(str).agg(' '.join, axis=1)

news_df = news_df[['Date', 'Label', 'Combined_Text']]

news_df.head()

Unnamed: 0,Date,Label,Combined_Text
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...
4,2008-08-14,1,b'All the experts admit that we should legalis...


In [40]:
from sklearn.model_selection import train_test_split

X = news_df['Combined_Text']
y = news_df['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

Clean Text

In [41]:
X_train_clean = X_train.apply(clean_text)
X_test_clean = X_test.apply(clean_text)

TF-IDF + Logistic Regression

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_train_tfidf = vectorizer.fit_transform(X_train_clean)
X_test_tfidf = vectorizer.transform(X_test_clean)

model = LogisticRegression(max_iter=1000)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.4899497487437186
[[ 32 164]
 [ 39 163]]
              precision    recall  f1-score   support

           0       0.45      0.16      0.24       196
           1       0.50      0.81      0.62       202

    accuracy                           0.49       398
   macro avg       0.47      0.49      0.43       398
weighted avg       0.47      0.49      0.43       398



In [43]:
y_test.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Label,Unnamed: 1_level_1
1,0.507538
0,0.492462


**Across both single-stock (AAPL) and index-level (DJIA) experiments, classical TF-IDF + Logistic Regression models failed to outperform naive baselines in predicting next-day market direction. This supports the difficulty of extracting short-horizon predictive signals from broad daily news headlines using linear lexical models.**

In [44]:
!pip freeze > requirements.txt