# **Stock Sentiment Analysis – Model Evaluation and Testing**

This notebook investigates various machine learning techniques and models for sentiment analysis of stock-related tweets, with the objective of identifying the most effective model for accurate sentiment prediction. Its companion notebook (TODO: add link) applies the selected model to the problem context to generate final predictions.

## 0. Imports

In [None]:
# Standard Library
import os
import re

# Local Scripts
# Move the kernel's working directory to ../scripts before importing the
# local modules below (constants, aux_funcs).
# NOTE(review): presumably done so those modules are importable by bare name;
# it also means every relative path used later in this notebook resolves
# against ../scripts -- confirm that is intended.
os.chdir("../scripts")

# Constants
from constants import (
    TRAIN_DATA, 
    TEST_DATA, 
    LABELS, 
    DATA_DIR,
    NLTK_DATA,
    GENSIM_DATA
) #type: ignore

# Auxiliaries
from aux_funcs import (
    find_punctuated_tokens, 
    eval_sklearn_model, 
    eval_lstm_model,
    eval_llm_model, 
    eval_transformer
) #type: ignore

# Data Preprocessor
# from preprocessor import preprocess #type: ignore

# Local Classes
# os.chdir("../classes")
# from classes import W2VVectorizer, CLSVectorizer, BERTVectorizer

# Standard Data Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wordcloud Visualization
# from wordcloud import WordCloud

# Modelling and modelling metrics
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Transformers
from transformers import AutoTokenizer, AutoModel

# Natural Language ToolKit
#import nltk
#from nltk.corpus import stopwords
#from nltk.stem import SnowballStemmer
#from nltk.stem.wordnet import WordNetLemmatizer

# General Similarity - NLP - Pre-trained Models
# from gensim.models import Word2Vec
# import gensim.downloader as api

  from .autonotebook import tqdm as notebook_tqdm
2025-06-15 21:56:16.711179: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-15 21:56:16.724675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750020976.741367  351557 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750020976.747516  351557 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750020976.759502  351557 computation_placer.cc:177] computation placer already r

In [None]:
# # Download if data not found
# if not os.path.isdir(NLTK_DATA):
#     nltk.download('stopwords', download_dir=NLTK_DATA)
#     nltk.download('wordnet', download_dir=NLTK_DATA)

# nltk.data.path.append(NLTK_DATA)

## 1. Data Loading

In [None]:
# train_data = pd.read_csv(TRAIN_DATA)
# test_data = pd.read_csv(TEST_DATA)

## 3. Data Preprocessing

In [None]:
# cfg = {
#     "fix_acronyms": True,
#     "delete_spaces": True,
#     "demojify": True,
#     "clean_ticker":True,
#     "keep_ticker": True,
#     "anonymize_ticker": True,
#     "clean_url": True,
#     "keep_url": True,
#     "clean_handles": True,
#     "keep_handle": True,
#     "clean_hashtags": True,
#     "keep_hashtag": True,
#     "clean_prices": False,
#     "remove_punctuation": True,
#     "remove_special_chars": True,
#     "remove_stopwords": False,
#     "lemmatize_text": False,
#     "stem_text": False,
#     "remove_dates_with_search": False,
#     "clean_remaining_date_time": False,
#     "convert_percentage_changes": False,
#     "remove_contractions": False,
#     "remove_possessives": False,
#     "remove_locations": False,
#     "remove_all_integers": False,
#     "to_lower": False
# }

In [None]:
# # Apply preprocessing
# train_data['text'] = preprocess(
#     corpus=train_data['text'],
# )

In [None]:
# # Apply preprocessing
# test_data['text'] = preprocess(
#     corpus=test_data['text']
# )

In [None]:
# # Export preprocessed data
# train_data.to_csv(f'{DATA_DIR}/train_llm_optimized.csv', sep=',', index=False)
# test_data.to_csv(f'{DATA_DIR}/test_llm_optimized.csv', sep=',', index=False)

In [7]:
# Reload the preprocessed datasets written by the (commented-out) export cell above.
# Each split is read from its own file: loading the train CSV for both would
# make X_test a copy of the training tweets.
train_data = pd.read_csv(f'{DATA_DIR}/train_llm_optimized.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test_llm_optimized.csv')

In [None]:
# Replace missing tweet texts with an empty string in both splits
for split in (train_data, test_data):
    split['text'] = split['text'].fillna('')

In [None]:
# Drop rows with encoding errors
# find_punctuated_tokens supplies the tokens to filter on
# (NOTE(review): its exact contract lives in aux_funcs -- confirm).
exceptions = find_punctuated_tokens(train_data['text'])
escaped_tokens = [re.escape(word) for word in exceptions]

# Guard against an empty token list: joining nothing yields the pattern
# '(?:)', which matches every string and would silently drop ALL rows.
if escaped_tokens:
    pattern = r'(?:' + '|'.join(escaped_tokens) + r')'
    has_bad_token = train_data['text'].str.contains(pattern, case=False, na=False)
    train_data = train_data[~has_bad_token]

In [None]:
# Work on a small subsample of the training data.
# - random_state pins the draw so a re-run evaluates the same tweets
#   (same seed value as the StratifiedKFold splitter below).
# - min() avoids the ValueError pandas raises when n exceeds the number of
#   rows left after the filtering step.
train_data = train_data.sample(n=min(100, len(train_data)), random_state=20)

# Define dependent and independent features
# Train
X_train = train_data['text']
y_train = train_data['label']

# Test
X_test = test_data['text']

## 5. Modelling

In [None]:
# Cross-validation splitter: 5 shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)

### 5.5. Language Model

In [None]:
# System prompt handed to eval_llm_model (as `system_message`) in the evaluation cell.
# It tells the model to answer with a single digit (0 = Bearish, 1 = Bullish,
# 2 = Neutral), states the approximate label distribution, and gives one
# example per class.
# NOTE(review): the trailing "[INPUT]" marker is presumably where the tweet
# text gets appended/substituted by eval_llm_model -- confirm in aux_funcs.
# The string's whitespace (leading space, trailing spaces) is part of the
# prompt sent to the model; do not reformat it casually.
system_message = """ You are a financial sentiment analysis engine.  

Your task is to classify the sentiment of financial tweets as:  
  0 = Bearish (negative sentiment)  
  1 = Bullish (positive sentiment)  
  2 = Neutral  

Respond with **only a single digit (0, 1, or 2)**.  

Note: In the training data, sentiment labels are distributed approximately as follows:  
  - 15% Bearish  
  - 25% Bullish  
  - 60% Neutral  


This distribution is believed to reflect the real-world proportions of financial tweet sentiment. 
Keep this in mind when interpreting ambiguous or mixed signals, as it is believed that the inputs 
provided to you will also fall under that distribution — avoid overconfident or unwarranted sentiment assignments.

**Examples:**  
  "JPMorgan reels in expectations on Beyond Meat" → 0  
  "Dougherty & Company starts at Buy" → 1  
  "Analysts React To FCC Decision On Intelsat C-Band Spectrum Auction" → 2  

[INPUT]  
"""

In [None]:
# Status message shown before the model-evaluation cell below is run.
print("Now evaluating please hold...")

In [None]:
# Run the LLM evaluation with the stratified splitter and the system prompt.
# The helper hands back the true and predicted labels
# (NOTE(review): names suggest they are pooled over all folds -- confirm in aux_funcs).
y_true_all, y_pred_all = eval_llm_model(
    model="tiiuae/falcon-7b-instruct",
    skf=skf,
    X_train=X_train,
    y_train=y_train,
    system_message=system_message,
)

In [None]:
# Header first, then the per-class metrics table with 4 decimal places.
# NOTE(review): target_names is taken from the LABELS keys; this assumes their
# order matches the sorted label values present in the data -- confirm.
class_names = LABELS.keys()
print("Overall classification report across all folds:\n")
report = classification_report(
    y_true_all,
    y_pred_all,
    target_names=class_names,
    digits=4,
)
print(report)

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
cm = confusion_matrix(y_true_all, y_pred_all)
tick_labels = LABELS.keys()

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_xlabel('Predicted', labelpad=15)
ax.set_ylabel('True', labelpad=15)
ax.set_title('Confusion Matrix (Language Model)', fontsize=16)
plt.show()