<a href="https://colab.research.google.com/github/MMaggieZhou/sentiment_analysis/blob/main/sentiment_analysis_sbert_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Cleaning

In [2]:
import re
import unicodedata
import pandas as pd

from sklearn.preprocessing import LabelEncoder

def preprocess_text(text):
    """Normalize a raw tweet into lowercase ASCII alphanumerics separated by single spaces.

    The steps run in an order chosen so that no later step can undo an earlier one:

    1. NFKC unicode normalization (folds compatibility forms such as
       full-width letters or the trademark sign into plain characters).
    2. Lowercasing -- done *after* normalization, because NFKC can introduce
       uppercase ASCII letters (e.g. the trademark sign becomes "TM").
    3. Deletion of every character that is not an ASCII letter, digit or
       whitespace.
    4. Whitespace collapsing and trimming -- done *last*, because deleting
       characters can leave runs of spaces ("a - b" -> "a  b") or
       leading/trailing spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. It may be empty if the input held no
        alphanumeric characters.
    """
    # Normalize unicode characters first so compatibility forms are folded
    # before any case handling or filtering happens.
    text = unicodedata.normalize("NFKC", text)

    # Convert to lowercase
    text = text.lower()

    # Remove special characters (anything that is not alphanumeric/whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Collapse whitespace runs and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

# Module-level encoder shared by every load_and_process call: it is fitted when
# training=True and reused (transform only) when training=False.
label_encoder = LabelEncoder()

def load_and_process(file, training):
    """Read a header-less tweet CSV and return a cleaned, label-encoded DataFrame.

    Args:
        file: Path (or file-like object) handed to ``pd.read_csv``. The columns
            are named ``id``, ``entity``, ``label`` and ``text``, in that order.
        training: When true, the module-level ``label_encoder`` is fitted on
            this file's labels; otherwise the already-fitted encoder is only
            applied.

    Returns:
        A DataFrame indexed by ``id`` with duplicate rows and rows containing
        missing values removed, plus two added columns: ``text_processed``
        (output of ``preprocess_text``) and ``label_num`` (encoded label).
    """
    raw = pd.read_csv(file, names=['id', 'entity', 'label', 'text'])
    df = raw.set_index('id').drop_duplicates().dropna()
    df['text_processed'] = df['text'].apply(preprocess_text)

    # Fit the encoder only on training data; other splits reuse that mapping.
    encode = label_encoder.fit_transform if training else label_encoder.transform
    df['label_num'] = encode(df['label'])

    return df

# Order matters: the training split must be loaded first because it fits the
# shared label encoder that the validation split then reuses.
train_df = load_and_process(file="/content/twitter_training.csv", training=True)
test_df = load_and_process(file="/content/twitter_validation.csv", training=False)

## Data Preprocessing

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode both splits with a pool of worker processes. The try/finally guarantees
# the pool is stopped even if encoding raises; otherwise the worker processes
# would be left running in the notebook session.
pool = model.start_multi_process_pool()
try:
    X_train = model.encode_multi_process(train_df["text_processed"].tolist(), pool)
    X_test = model.encode_multi_process(test_df["text_processed"].tolist(), pool)
finally:
    model.stop_multi_process_pool(pool)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Encoded class labels (the `label_num` column) of each split as NumPy arrays.
Y_train, Y_test = (
    split['label_num'].to_numpy() for split in (train_df, test_df)
)

## Model Training & Evaluation

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Candidate classifiers trained on the sentence embeddings, keyed by display name.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': LinearSVC(),
}

# The loop variable is intentionally not named `model`: that name is already
# bound to the SentenceTransformer used to compute the embeddings, and rebinding
# it here would silently replace the embedder with the last classifier.
for clf in models.values():
    clf.fit(X_train, Y_train)

In [14]:
from sklearn.metrics import classification_report

# LabelEncoder maps classes to 0..n_classes-1. Passing these ids as `labels`
# keeps the report rows aligned with `target_names` even if some class never
# appears in Y_test or in the predictions; without it, classification_report
# raises when the number of observed classes differs from len(target_names).
class_ids = list(range(len(label_encoder.classes_)))

# `clf` (not `model`) so the SentenceTransformer bound to `model` is not overwritten.
for name, clf in models.items():
    Y_predicts = clf.predict(X_test)
    report = classification_report(
        Y_test,
        Y_predicts,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
    print(name)
    print(report)

LogisticRegression
              precision    recall  f1-score   support

  Irrelevant       0.61      0.46      0.52       172
    Negative       0.61      0.75      0.67       266
     Neutral       0.62      0.55      0.58       285
    Positive       0.64      0.68      0.66       277

    accuracy                           0.62      1000
   macro avg       0.62      0.61      0.61      1000
weighted avg       0.62      0.62      0.62      1000

SVM
              precision    recall  f1-score   support

  Irrelevant       0.65      0.38      0.48       172
    Negative       0.59      0.77      0.67       266
     Neutral       0.60      0.53      0.56       285
    Positive       0.65      0.70      0.67       277

    accuracy                           0.62      1000
   macro avg       0.62      0.60      0.60      1000
weighted avg       0.62      0.62      0.61      1000

