## Encode and save preprocessed datasets

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn transformers torch datasets sentencepiece
!pip install onnxruntime onnxruntime-tools optimum

In [10]:
import os
import re

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [7]:
# Load the raw labelled dataset.
# NOTE(review): latin1 maps every byte to a character, so it never raises on a
# mis-encoded file — confirm this is the CSV's real encoding.
raw_csv_path = '../model-datasets/final-data.csv'
df = pd.read_csv(raw_csv_path, encoding='latin1')

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,text,sentiment
0,$BYND - JPMorgan reels in expectations on Beyo...,negative
1,$CCL $RCL - Nomura points to bookings weakness...,negative
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",negative
3,$ESS: BTIG Research cuts to Neutral https://t....,negative
4,$FNKO - Funko slides after Piper Jaffray PT cu...,negative


In [11]:
# Preprocessing function
def preprocess_text(text):
    """Normalise one raw text value into a cleaned, lowercase string.

    Steps, in order: lowercase, drop ``$`` characters (the ticker letters
    themselves are kept), remove URLs, remove every character that is not a
    word character, whitespace, or one of ``. , ! ?``, then strip both ends.
    Internal whitespace is not collapsed, so removed characters can leave
    consecutive spaces behind.

    Args:
        text: The raw value. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # Without this guard a missing value would be stringified to the literal
    # word 'none' / 'nan' and carried forward as if it were real text.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Convert to lowercase
    text = text.replace('$', '')  # Remove dollar signs (ticker prefix)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove special characters except basic punctuation
    text = re.sub(r'[^\w\s.,!?]', '', text)
    return text.strip()


In [12]:
# Clean every row of the text column and write the result back in place.
cleaned_text = df['text'].apply(preprocess_text)
df['text'] = cleaned_text

In [14]:
# Turn the sentiment strings into integer class ids, stored in a new
# 'label' column. The fitted encoder is kept so the mapping can be inspected.
label_encoder = LabelEncoder()
encoded_sentiment = label_encoder.fit_transform(df['sentiment'])
df['label'] = encoded_sentiment

In [17]:
# Print each class name next to the integer the encoder assigned to it
# (the integer is the class's position in label_encoder.classes_).
for encoded_value, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} --> {encoded_value}")

negative --> 0
neutral --> 1
positive --> 2


In [15]:
# Two-stage split with a fixed seed: hold out 20% of the rows for testing,
# then carve 10% of the remainder off as validation (roughly 72/8/20 overall).
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

In [18]:

# Save processed data: one CSV per split, keeping only the text and label columns.
output_dir = '../processed-datasets'
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df[['text', 'label']].to_csv(f'{output_dir}/{split_name}.csv', index=False)