- Combined train and valid set

In [None]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
from langdetect import detect
import emoji
from urlextract import URLExtract
import contractions
from multiprocessing import Pool, cpu_count
import numpy as np
from tqdm import tqdm
from symspellpy import SymSpell, Verbosity
from collections import Counter

- Merging Train and Valid Set

In [None]:
import pandas as pd

# Load the train and valid dataset
train_df = pd.read_csv("Dataset\\topic_train.csv")
valid_df = pd.read_csv("Dataset\\topic_valid.csv")

# Merge both dataset
combined_df = pd.concat([train_df, valid_df], ignore_index=True)

# Save the merged dataset 
combined_df.to_csv("Dataset\\merged_datasetA.csv", index=False)

# Display first few rows 
print(combined_df.head())
print(len(combined_df))



                                                text  label
0  Here are Thursday's biggest analyst calls: App...      0
1  Buy Las Vegas Sands as travel to Singapore bui...      0
2  Piper Sandler downgrades DocuSign to sell, cit...      0
3  Analysts react to Tesla's latest earnings, bre...      0
4  Netflix and its peers are set for a ‘return to...      0
21107


- Preprocess FinBERT

In [16]:
#Load combined dataset
finpep_df = pd.read_csv("Dataset\\merged_datasetA.csv")
print(finpep_df.head())
print( len(finpep_df))

                                                text  label
0  Here are Thursday's biggest analyst calls: App...      0
1  Buy Las Vegas Sands as travel to Singapore bui...      0
2  Piper Sandler downgrades DocuSign to sell, cit...      0
3  Analysts react to Tesla's latest earnings, bre...      0
4  Netflix and its peers are set for a ‘return to...      0
21107


In [17]:
# Ticker & acronym whitelist
def detect_protected_terms(text_series):
    """Detect uppercase tickers/acronyms from the dataset."""
    pattern = r'\$[A-Z]{1,5}\b|\b[A-Z]{2,5}\b'
    terms = set()
    for text in text_series.dropna():
        matches = re.findall(pattern, text)
        matches = [m.lstrip('$') for m in matches]  # Remove $ for consistency
        terms.update(matches)
    return terms

protected_words = set([
    "USD", "EUR", "AAPL", "TSLA", "GOOG", "MSFT", "SEC", "GDP", "NASDAQ", "NYSE"
])

# SymSpell setup
sym_spell = SymSpell(max_dictionary_edit_distance=1, prefix_length=7)
dictionary_path = "SymSpell_Dictionary/frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def spell_correct(text):
    corrected_words = []
    for word in text.split():
        if word.upper() in protected_words:  # Protect finance terms
            corrected_words.append(word)
            continue
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=1)
        if suggestions:
            corrected_words.append(suggestions[0].term)
        else:
            corrected_words.append(word)
    return ' '.join(corrected_words)

def preprocess(text):

    def is_valid_text(txt):
        return isinstance(txt, str) and txt.strip() != ''

    def is_english(txt, target_language='en'):
        try:
            return detect(txt) == target_language
        except:
            return False

    def text_cleaning(txt):
        # 1. Expand contractions (e.g., don't → do not)
        txt = contractions.fix(txt)

        # 2. Replace emojis with descriptive text
        txt = emoji.replace_emoji(txt, replace=lambda e, _: emoji.demojize(e).strip(':').replace('_', ' '))

        # 3. Remove HTML tags
        txt = BeautifulSoup(txt, "html.parser").get_text(separator=" ")

        # 4. Remove URLs
        extractor = URLExtract()
        urls = extractor.find_urls(txt)
        for url in urls:
            txt = txt.replace(url, '')

        # 5. Remove unwanted punctuation but KEEP $, %, +, -
        punctuation_to_remove = ''.join(ch for ch in string.punctuation if ch not in ['$','%','+','-'])
        txt = txt.translate(str.maketrans('', '', punctuation_to_remove))

        # 6. Replace multiple spaces with single space
        txt = re.sub(r'\s+', ' ', txt).strip()

        # 7. Optional: Spell correction (safe for finance terms)
        txt = spell_correct(txt)

        return txt

    text = str(text)

    if not is_valid_text(text):
        return ""
    text = text.strip()

    if not is_english(text):
        return ""

    return text_cleaning(text)


# Usage with progress bar
tqdm.pandas()

protected_words = detect_protected_terms(finpep_df['text'])

finpep_df['preprocessed_text'] = finpep_df['text'].progress_apply(preprocess)

# Remove empty rows
finpep_df = finpep_df[finpep_df['preprocessed_text'].str.strip() != '']

# Drop unnecessary columns
finpep_df.drop(columns=['label'], inplace=True)

finpep_df.head()


100%|██████████| 21107/21107 [04:41<00:00, 74.94it/s] 


Unnamed: 0,text,preprocessed_text
0,Here are Thursday's biggest analyst calls: App...,Here are thursdays biggest analyst calls Apple...
1,Buy Las Vegas Sands as travel to Singapore bui...,Buy as vegas hands as travel to singapore buil...
2,"Piper Sandler downgrades DocuSign to sell, cit...",piper handler downgrades DocuSign to sell citi...
3,"Analysts react to Tesla's latest earnings, bre...",analysts react to Teslas latest earnings break...
4,Netflix and its peers are set for a ‘return to...,netflix and its peers are set for a return to ...


In [24]:
# Statistic Analyze After Cleaning
correction_counter = Counter()
def count_urls(text):
    return len(re.findall(r'http\S+|www\.\S+', text))

def count_emojis(text):
    return len(emoji.emoji_list(text))

def count_special_chars(text):
    return len(re.findall(r'[^a-zA-Z0-9\s]', text))

# Create metrics before and after
insights = pd.DataFrame({
    "Metric": [
        "Avg text length (chars)",
        "Avg text length (words)",
        "Total URLs",
        "Total Emojis",
        "Total Special Characters",
        "Total Corrections Made"
    ],
    "Before": [
        finpep_df['text'].str.len().mean(),
        finpep_df['text'].str.split().apply(len).mean(),
        finpep_df['text'].apply(count_urls).sum(),
        finpep_df['text'].apply(count_emojis).sum(),
        finpep_df['text'].apply(count_special_chars).sum(),
        sum(correction_counter.values())
    ],
    "After": [
        finpep_df['preprocessed_text'].str.len().mean(),
        finpep_df['preprocessed_text'].str.split().apply(len).mean(),
        finpep_df['preprocessed_text'].apply(count_urls).sum(),
        finpep_df['preprocessed_text'].apply(count_emojis).sum(),
        finpep_df['preprocessed_text'].apply(count_special_chars).sum(),
        sum(correction_counter.values()) 
    ]
})

print(insights)


                     Metric         Before         After
0   Avg text length (chars)     138.265262    107.217952
1   Avg text length (words)      18.639970     17.550558
2                Total URLs   23290.000000      0.000000
3              Total Emojis     662.000000      0.000000
4  Total Special Characters  195854.000000  24333.000000
5    Total Corrections Made       0.000000      0.000000


In [25]:
finpep_df.to_csv("Dataset/merged_datasetA(cleaned).csv", columns=['preprocessed_text','text'],index=False)
print(finpep_df.head())
print( len(finpep_df))

                                                text  \
0  Here are Thursday's biggest analyst calls: App...   
1  Buy Las Vegas Sands as travel to Singapore bui...   
2  Piper Sandler downgrades DocuSign to sell, cit...   
3  Analysts react to Tesla's latest earnings, bre...   
4  Netflix and its peers are set for a ‘return to...   

                                   preprocessed_text  
0  Here are thursdays biggest analyst calls Apple...  
1  Buy as vegas hands as travel to singapore buil...  
2  piper handler downgrades DocuSign to sell citi...  
3  analysts react to Teslas latest earnings break...  
4  netflix and its peers are set for a return to ...  
20165


Label merged_datasetA(cleaned) with FinBERT

In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm 


model_name = "yiyanghkust/finbert-tone"  # pretrained FinBERT for financial sentiment

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

finclas_df= pd.read_csv("Dataset/merged_datasetA(cleaned).csv")  # Make sure it has a column like 'text'

# Add a new column for sentiment label
tqdm.pandas()
finclas_df['sentiment'] = finclas_df['preprocessed_text'].progress_apply(lambda x: sentiment_pipeline(x)[0]['label'])

finclas_df['score'] = finclas_df['preprocessed_text'].progress_apply(lambda x: sentiment_pipeline(x)[0]['score'])    

finclas_df.to_csv("Dataset/merged_datasetA(labeled).csv", index=False)
print(finclas_df.head())
print( len(finclas_df))


Device set to use cpu
  return forward_call(*args, **kwargs)
100%|██████████| 20163/20163 [13:47<00:00, 24.38it/s]
  return forward_call(*args, **kwargs)
100%|██████████| 20163/20163 [14:03<00:00, 23.90it/s]


                                   preprocessed_text  \
0  Here are thursdays biggest analyst calls Apple...   
1  Buy as vegas hands as travel to singapore buil...   
2  piper handler downgrades DocuSign to sell citi...   
3  analysts react to Teslas latest earnings break...   
4  netflix and its peers are set for a return to ...   

                                                text sentiment     score  
0  Here are Thursday's biggest analyst calls: App...   Neutral  0.999992  
1  Buy Las Vegas Sands as travel to Singapore bui...   Neutral  0.999866  
2  Piper Sandler downgrades DocuSign to sell, cit...  Negative  0.999097  
3  Analysts react to Tesla's latest earnings, bre...   Neutral  0.999918  
4  Netflix and its peers are set for a ‘return to...  Positive  0.999999  
20163
Exported to twitter_labeled.csv successfully.


In [4]:
#Load twitter_labeled
twteda_df = pd.read_csv("Dataset/merged_datasetA(labeled).csv")

# Display first few rows
print(twteda_df.head(7))
print("\n Total number of rows: ",len(twteda_df))

                                   preprocessed_text  \
0  Here are thursdays biggest analyst calls Apple...   
1  Buy as vegas hands as travel to singapore buil...   
2  piper handler downgrades DocuSign to sell citi...   
3  analysts react to Teslas latest earnings break...   
4  netflix and its peers are set for a return to ...   
5  barclays believes earnings for these underperf...   
6  bernstein upgrades Alibaba says shares can ral...   

                                                text sentiment     score  
0  Here are Thursday's biggest analyst calls: App...   Neutral  0.999992  
1  Buy Las Vegas Sands as travel to Singapore bui...   Neutral  0.999866  
2  Piper Sandler downgrades DocuSign to sell, cit...  Negative  0.999097  
3  Analysts react to Tesla's latest earnings, bre...   Neutral  0.999918  
4  Netflix and its peers are set for a ‘return to...  Positive  0.999999  
5  Barclays believes earnings for these underperf...   Neutral  0.408876  
6  Bernstein upgrades Alib

In [5]:
print("Shape:", twteda_df.shape)
print("\nInfo:")
print(twteda_df.info())
print("\nMissing values:\n", twteda_df.isnull().sum())
print("\nDuplicate rows:", twteda_df.duplicated().sum())

Shape: (20163, 4)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20163 entries, 0 to 20162
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   preprocessed_text  20163 non-null  object 
 1   text               20163 non-null  object 
 2   sentiment          20163 non-null  object 
 3   score              20163 non-null  float64
dtypes: float64(1), object(3)
memory usage: 630.2+ KB
None

Missing values:
 preprocessed_text    0
text                 0
sentiment            0
score                0
dtype: int64

Duplicate rows: 0


In [6]:
#Statistics
def print_stats(stats_df, title):
    print(f"\n{title} Statistics:")
    print("="*50)
    print(stats_df.to_markdown(tablefmt="grid", stralign='center', numalign='center'))
    print("="*50)


twteda_df["length"] = twteda_df["sentiment"].apply(lambda x: len(x.split()))
grouped_stats = twteda_df.groupby('sentiment')["length"].agg(['max', 'min', 'mean', 'median', 'std', 'count'])
print_stats(grouped_stats, "Text Length per Label")


Text Length per Label Statistics:
+-------------+-------+-------+--------+----------+-------+---------+
|  sentiment  |  max  |  min  |  mean  |  median  |  std  |  count  |
|  Negative   |   1   |   1   |   1    |    1     |   0   |  3815   |
+-------------+-------+-------+--------+----------+-------+---------+
|   Neutral   |   1   |   1   |   1    |    1     |   0   |  12596  |
+-------------+-------+-------+--------+----------+-------+---------+
|  Positive   |   1   |   1   |   1    |    1     |   0   |  3752   |
+-------------+-------+-------+--------+----------+-------+---------+


- PReprocess for All Case

In [30]:
#Load Libraries
import re
import string
import nltk
import spacy
import emoji
import contractions
from bs4 import BeautifulSoup
from langdetect import detect
from urlextract import URLExtract
from tqdm import tqdm
from symspellpy import SymSpell, Verbosity
from collections import Counter
from nltk.corpus import stopwords

In [35]:
import pandas as pd
#Load combined dataset
Nompep_df = pd.read_csv("Dataset\\merged_datasetA.csv")
print(Nompep_df.head())
print( len(Nompep_df))

                                                text  label
0  Here are Thursday's biggest analyst calls: App...      0
1  Buy Las Vegas Sands as travel to Singapore bui...      0
2  Piper Sandler downgrades DocuSign to sell, cit...      0
3  Analysts react to Tesla's latest earnings, bre...      0
4  Netflix and its peers are set for a ‘return to...      0
21107


In [36]:
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')

sym_spell = SymSpell(max_dictionary_edit_distance=1, prefix_length=7)
dictionary_path = "SymSpell_Dictionary\\frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

correction_counter = Counter()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JohnTan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def spell_correct(text):
    corrected_words = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=1)
        if suggestions:
            corrected_words.append(suggestions[0].term)
        else:
            corrected_words.append(word)
    return ' '.join(corrected_words)

def preprocess(text):

    def is_valid_text(text):
        return isinstance(text, str) and text.strip() != ''

    def is_english(text, target_language='en'):
        try:
            return detect(text) == target_language
        except:
            return False

    def text_cleaning(text):
        # Lowecase text
        text = text.lower()

        # Detect Emoji and Replace with text
        text = emoji.replace_emoji(text, replace=lambda e, _: emoji.demojize(e).strip(':').replace('_', ' '))

        # Detect and Expend contraction
        text = contractions.fix(text)

        # Detect and Remove HTML Tag
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

        # Identify and Remove URLs
        extractor = URLExtract()
        urls = extractor.find_urls(text)
        for url in urls:
            text = text.replace(url, '') 

        # Detect and Remove Punctuation
        text = ''.join([char for char in text if char not in string.punctuation])

        # Detect and Remove Special Symbol
        text = re.sub(r'\W', ' ', text)

        # Correct Misspelled Word
        text = spell_correct(text)

        # Lemmatize The Text
        doc = nlp(text)
        text = ' '.join([token.lemma_ for token in doc])

        return text

    text = str(text)

    if not is_valid_text(text):
        return ""
    text = text.strip()

    if not is_english(text):
        return ""

    return text_cleaning(text)
   


tqdm.pandas()
Nompep_df['preprocessed_text'] = Nompep_df['text'].progress_apply(preprocess)
Nompep_df = Nompep_df[Nompep_df['preprocessed_text'].str.strip() != '']
Nompep_df.head()

100%|██████████| 21107/21107 [07:14<00:00, 48.58it/s]


Unnamed: 0,text,label,preprocessed_text
0,Here are Thursday's biggest analyst calls: App...,0,here be thursday big analyst call apple amazon...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy las vegas sand as travel to singapore buil...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper handler downgrade docusign to sell cite ...
3,"Analysts react to Tesla's latest earnings, bre...",0,analyst react to tesla late earning break down...
4,Netflix and its peers are set for a ‘return to...,0,netflix and its peer be set for a return to gr...


In [38]:
# Statistic Analyze After Cleaning
def count_urls(text):
    return len(re.findall(r'http\S+|www\.\S+', text))

def count_emojis(text):
    return len(emoji.emoji_list(text))

def count_special_chars(text):
    return len(re.findall(r'[^a-zA-Z0-9\s]', text))

# Create metrics before and after
insights = pd.DataFrame({
    "Metric": [
        "Avg text length (chars)",
        "Avg text length (words)",
        "Total URLs",
        "Total Emojis",
        "Total Special Characters",
        "Total Corrections Made"
    ],
    "Before": [
        finpep_df['text'].str.len().mean(),
        finpep_df['text'].str.split().apply(len).mean(),
        finpep_df['text'].apply(count_urls).sum(),
        finpep_df['text'].apply(count_emojis).sum(),
        finpep_df['text'].apply(count_special_chars).sum(),
        sum(correction_counter.values())
    ],
    "After": [
        finpep_df['preprocessed_text'].str.len().mean(),
        finpep_df['preprocessed_text'].str.split().apply(len).mean(),
        finpep_df['preprocessed_text'].apply(count_urls).sum(),
        finpep_df['preprocessed_text'].apply(count_emojis).sum(),
        finpep_df['preprocessed_text'].apply(count_special_chars).sum(),
        sum(correction_counter.values()) 
    ]
})

print(insights)

                     Metric         Before         After
0   Avg text length (chars)     138.265262    107.217952
1   Avg text length (words)      18.639970     17.550558
2                Total URLs   23290.000000      0.000000
3              Total Emojis     662.000000      0.000000
4  Total Special Characters  195854.000000  24333.000000
5    Total Corrections Made       0.000000      0.000000


In [39]:
Nompep_df.to_csv("Dataset/merged_datasetA_allCase(cleaned).csv", columns=['preprocessed_text','text'],index=False)
print(Nompep_df.head())
print( len(Nompep_df))

                                                text  label  \
0  Here are Thursday's biggest analyst calls: App...      0   
1  Buy Las Vegas Sands as travel to Singapore bui...      0   
2  Piper Sandler downgrades DocuSign to sell, cit...      0   
3  Analysts react to Tesla's latest earnings, bre...      0   
4  Netflix and its peers are set for a ‘return to...      0   

                                   preprocessed_text  
0  here be thursday big analyst call apple amazon...  
1  buy las vegas sand as travel to singapore buil...  
2  piper handler downgrade docusign to sell cite ...  
3  analyst react to tesla late earning break down...  
4  netflix and its peer be set for a return to gr...  
20154


In [40]:
print("Shape:", Nompep_df.shape)
print("\nInfo:")
print(Nompep_df.info())
print("\nMissing values:\n", Nompep_df.isnull().sum())
print("\nDuplicate rows:", Nompep_df.duplicated().sum())

Shape: (20154, 3)

Info:
<class 'pandas.core.frame.DataFrame'>
Index: 20154 entries, 0 to 21106
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               20154 non-null  object
 1   label              20154 non-null  int64 
 2   preprocessed_text  20154 non-null  object
dtypes: int64(1), object(2)
memory usage: 629.8+ KB
None

Missing values:
 text                 0
label                0
preprocessed_text    0
dtype: int64

Duplicate rows: 0
