<a href="https://colab.research.google.com/github/MMaggieZhou/sentiment_analysis/blob/main/sentiment_analysis_bert_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import unicodedata
import torch

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

from transformers import BertTokenizer, BertModel

## Data Cleaning

In [2]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Normalize unicode characters
    text = unicodedata.normalize("NFKC", text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove special characters (optional)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return text

label_encoder = LabelEncoder()

def load_and_process(file, training):
  df = pd.read_csv(file, names=['id', 'entity', 'label', 'text']).set_index('id')
  df = df.drop_duplicates().dropna()
  df['text_processed'] = df['text'].apply(preprocess_text)

  if training:
      df['label_num'] = label_encoder.fit_transform(df['label'])
  else:
      df['label_num'] = label_encoder.transform(df['label'])

  return df

train_df = load_and_process("/content/twitter_training.csv", True)
test_df = load_and_process("/content/twitter_validation.csv", False)

## Data Preprocessing

In [11]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

def get_bert_embedding(sentence):
    tokens = tokenizer(sentence, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    for k,v in tokens.items():
        tokens[k] = v.to(device)
    with torch.no_grad():
        outputs = bert_model(**tokens)

    embedding = outputs.pooler_output.squeeze().cpu().numpy()
    return embedding
X_test = np.array([get_bert_embedding(sentence) for sentence in test_df["text_processed"]])
X_train = np.array([get_bert_embedding(sentence) for sentence in train_df["text_processed"]])

In [16]:
Y_train = train_df['label_num'].to_numpy()
Y_test = test_df['label_num'].to_numpy()

## Mode Training & Evaluation

In [17]:
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': LinearSVC(),
}
for model in models.values():
    model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
for name, model in models.items():
    Y_predicts = model.predict(X_test)
    report = classification_report(Y_test, Y_predicts, target_names=label_encoder.classes_)
    print(name)
    print(report)

LogisticRegression
              precision    recall  f1-score   support

  Irrelevant       0.61      0.30      0.40       172
    Negative       0.60      0.74      0.66       266
     Neutral       0.57      0.58      0.58       285
    Positive       0.63      0.67      0.65       277

    accuracy                           0.60      1000
   macro avg       0.60      0.57      0.57      1000
weighted avg       0.60      0.60      0.59      1000

SVM
              precision    recall  f1-score   support

  Irrelevant       0.63      0.26      0.36       172
    Negative       0.60      0.77      0.68       266
     Neutral       0.56      0.59      0.57       285
    Positive       0.64      0.66      0.65       277

    accuracy                           0.60      1000
   macro avg       0.61      0.57      0.57      1000
weighted avg       0.60      0.60      0.59      1000

