# **Stock Sentiment Analysis – Predictions**

This notebook makes predictions using the best model from `tm_tests_20.ipynb`

## 0. Imports

In [1]:
# Standard Library
import os
import re

# Local Scripts
# Switch the process working directory to ../scripts so the project-local
# modules imported below can be found.
# NOTE(review): this presumably relies on the current directory being on
# sys.path (the default in an interactive kernel) — confirm.
os.chdir("../scripts")

# Constants
# Dataset paths, label definitions and resource directories shared across notebooks.
from constants import (
    TRAIN_DATA, 
    TEST_DATA, 
    LABELS, 
    DATA_DIR,
    NLTK_DATA,
    GENSIM_DATA
)  #type: ignore

# Auxiliaries
from aux_funcs import find_punctuated_tokens  #type: ignore

# Data Preprocessor
from preprocessor import preprocess  #type: ignore

# Local Classes
# Second directory switch: the working directory stays at ../classes for the
# rest of the notebook.
# NOTE(review): if any path in `constants` is relative, it is resolved against
# this final working directory — confirm the constants are absolute or
# relative to ../classes.
os.chdir("../classes")
from classes import CLSVectorizer, BERTVectorizer

# Standard Data Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling and modelling metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Natural Language ToolKit
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

  from .autonotebook import tqdm as notebook_tqdm
2025-06-15 18:21:52.613085: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-15 18:21:52.806916: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750008112.887648    6707 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750008112.911196    6707 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750008113.076946    6707 computation_placer.cc:177] computation placer already r

In [2]:
# Make the project-local NLTK directory searchable before any lookup.
# The membership guard keeps re-runs of this cell from appending duplicates.
if NLTK_DATA not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA)

# Download each corpus only if NLTK cannot locate it.
# Checking per resource (instead of checking that the directory exists) also
# repairs a partially populated NLTK_DATA folder, e.g. after an interrupted
# download. Both the unpacked and the zipped form are probed because some
# corpora are kept zipped on disk.
for corpus in ('stopwords', 'wordnet'):
    for candidate in (f'corpora/{corpus}', f'corpora/{corpus}.zip'):
        try:
            nltk.data.find(candidate)
            break
        except LookupError:
            continue
    else:
        # Neither form was found on any search path: fetch it.
        nltk.download(corpus, download_dir=NLTK_DATA)

## 1. Data Loading

In [3]:
# Read the raw train and test splits from the locations given in `constants`
train_data = pd.read_csv(filepath_or_buffer=TRAIN_DATA)
test_data = pd.read_csv(filepath_or_buffer=TEST_DATA)

## 2. Data Preprocessing

If you have not run the previous notebook, please uncomment the commented-out cells below and run them first.

In [4]:
# # Defining the stopwords
# stop = set(stopwords.words('english'))

# # Initializing Lemmatizer
# lemma = WordNetLemmatizer()

# # Initializing Stemmer
# stemmer = SnowballStemmer('english')

In [5]:
# # Apply preprocessing
# train_data['text'] = preprocess(
#     corpus=train_data['text'],
#     stopwords=stop,
#     lemmatizer=lemma
# )

In [6]:
# # Apply preprocessing
# test_data['text'] = preprocess(
#     corpus=test_data['text'],
#     stopwords=stop,
#     lemmatizer=lemma
# )

In [7]:
# # Export preprocessed data
# train_data.to_csv(f'{DATA_DIR}/train_preprocessed.csv', sep=',', index=False)
# test_data.to_csv(f'{DATA_DIR}/test_preprocessed.csv', sep=',', index=False)

In [8]:
# Replace the raw frames with the preprocessed versions stored in DATA_DIR
train_preprocessed_path = f'{DATA_DIR}/train_preprocessed.csv'
test_preprocessed_path = f'{DATA_DIR}/test_preprocessed.csv'

train_data = pd.read_csv(train_preprocessed_path)
test_data = pd.read_csv(test_preprocessed_path)

In [9]:
# Missing texts become empty strings so every row holds a str downstream
for split in (train_data, test_data):
    split['text'] = split['text'].fillna('')

In [10]:
# Drop rows with encoding errors
# `find_punctuated_tokens` is a project helper; it is assumed to return an
# iterable of offending token strings — TODO confirm against aux_funcs.
exceptions = find_punctuated_tokens(train_data['text'])

# Escape every token so it is matched literally. Empty tokens are skipped
# because an empty alternative matches every string.
escaped_tokens = [re.escape(word) for word in exceptions if word]

pattern = r'(?:' + '|'.join(escaped_tokens) + r')'

# Filter only when there is something to match. With no tokens the pattern
# collapses to `(?:)`, which matches every text, so the negated mask would
# drop ALL training rows (e.g. when this cell is run a second time).
if escaped_tokens:
    has_encoding_error = train_data['text'].str.contains(
        pattern, case=False, na=False, regex=True
    )
    train_data = train_data[~has_encoding_error]

In [11]:
# Split the frames into model inputs (text) and target (label)

# Train: text and label come from the same frame, so they stay aligned
X_train, y_train = train_data['text'], train_data['label']

# Test: only the text is selected; the labels are what this notebook predicts
X_test = test_data['text']

## 3. Feature Engineering

In [12]:
# Hugging Face checkpoint used to embed the texts
roberta_model = "cardiffnlp/twitter-roberta-base-sentiment"

# Build the embedder first, then hand its bound `embed` method to the
# project vectorizer used as the first pipeline step
roberta_embedder = BERTVectorizer(model_name=roberta_model)
roberta = CLSVectorizer(embeddings_model=roberta_embedder.embed)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Modelling

In [13]:
# Hyperparameters of the k-nearest-neighbours classifier
knn_params = {
    'n_neighbors': 10,      # number of neighbours consulted per prediction
    'metric': 'cosine',     # neighbours are ranked by cosine distance
    'weights': 'distance',  # closer neighbours get a larger vote
}
knn = KNeighborsClassifier(**knn_params)

In [14]:
# Chain embedding and classification so raw text goes in and labels come out
pipeline_steps = [
    ('vectorizer', roberta),
    ('classifier', knn),
]
pipeline = Pipeline(steps=pipeline_steps)

In [15]:
# Fit the whole pipeline on the training texts and labels
# (the recorded run of this cell took about 7 minutes)
pipeline.fit(X=X_train, y=y_train)

  0%|          | 0/9458 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 9458/9458 [07:09<00:00, 22.04it/s]


In [16]:
# Predict a label for every test text with the fitted pipeline
y_pred = pipeline.predict(X=X_test)

100%|██████████| 2388/2388 [01:51<00:00, 21.41it/s]


In [21]:
# Pair each test id with its predicted label and write the result as CSV
predictions = test_data[['id']].assign(label=y_pred)
predictions.to_csv(f'{DATA_DIR}/pred_20.csv', index=False)