BERT for Sentiment Analysis

In [None]:
# Install the two third-party packages this notebook imports below
# (`transformers` and `mlflow`). Versions are not pinned, so a fresh run
# installs whatever release is current at that time.
!pip3 install transformers
!pip3 install mlflow

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 32.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 60.2 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created whe

In [None]:
import numpy as np
import pandas as pd
import re
import os

import torch
import transformers as ppb

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import mlflow
import mlflow.sklearn

# URL of the local MLflow server; the tracking URI configured in the MLflow
# section of this notebook points at the same host and port.
MLFLOW_SERVER_URL = 'http://127.0.0.1:5000/'

In [None]:
# Mount Google Drive at /content/drive (Colab only); the training CSV is read
# from a path under this mount point in the next section.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load data

In [None]:
# Read the raw tweets. The file is parsed without a header row, so the six
# column names are supplied explicitly.
data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/train.csv',
    header=None,
    names=["sentiment", "id", "date", "query", "user", "text"],
    engine='python',
    encoding='latin1',
)
data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Number of rows in the full dataset.
size_data = data.shape[0]
size_data

1600000

In [None]:
# Truncate the data: processing the full dataset causes a RAM overflow error.
# `.loc` slicing is label-based and includes both end labels. `.copy()` makes
# the sample an independent DataFrame, so the column edits in the following
# cells do not act on a slice of the full frame.
data = data.loc[799000:801000].copy()

Prepare data

In [None]:
# Drop the metadata columns; only `sentiment` and `text` are used from here on.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell safe to re-run after the columns are already gone.
data = data.drop(columns=['id', 'date', 'query', 'user'], errors='ignore')
data.head()

Unnamed: 0,sentiment,text
799000,0,"CAN'T BEAT LIVE MUSIC, WISH I COULD SING BUT I..."
799001,0,Charlie lost an angel today Very sad http:...
799002,0,"at work, and very bored"
799003,0,It's weird how celebrities go in threes. Carra...
799004,0,@adelate Farrah's dead? Had no idea.. RIP Farr...


In [None]:
def clean_tweet(tweet):
    """Return `tweet` with mentions, links and unwanted characters blanked out.

    Each pattern below is replaced by a single space, in this order:
    @-mentions, tokens starting with "http", every character that is not an
    ASCII letter, space or one of ``? ! , . ' "``, and finally runs of spaces
    (which collapses them to one). Leading/trailing spaces are not stripped.
    """
    patterns = (
        r"@\S+",                  # remove @tag
        r"http\S+",               # remove link
        r"[^A-Za-z ?!,.\'\"]",    # remove special char
        r" +",                    # remove excess whitespace
    )
    for pattern in patterns:
        tweet = re.sub(pattern, ' ', tweet)
    return tweet

In [None]:
# Clean every tweet in place of the raw text column.
cleaned_text = data['text'].map(clean_tweet)
data['text'] = cleaned_text
data.head()

Unnamed: 0,sentiment,text
799000,0,"CAN'T BEAT LIVE MUSIC, WISH I COULD SING BUT I..."
799001,0,Charlie lost an angel today Very sad
799002,0,"at work, and very bored"
799003,0,It's weird how celebrities go in threes. Carra...
799004,0,"Farrah's dead? Had no idea.. RIP Farrah, inde..."


In [None]:
# Count how many rows carry each sentiment label in the truncated sample.
data['sentiment'].value_counts()

4    1001
0    1000
Name: sentiment, dtype: int64

The original dataset labels negative tweets as 0 and positive tweets as 4. Remap 4 to 1 so that the labels are binary (0/1).

In [None]:
# Remap the positive label 4 -> 1 in the `sentiment` column only.
# Assigning through a bare boolean mask (`data[mask] = 1`) writes 1 into EVERY
# column of the matching rows, which would also replace the tweet text.
data.loc[data['sentiment'] == 4, 'sentiment'] = 1

BERT
https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

In [None]:
# Encode each tweet with a pretrained DistilBERT and use the resulting vectors
# as features for a downstream classifier.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = "distilbert-base-uncased"

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Token ids per tweet, including the special tokens added by the tokenizer.
tokenized = data['text'].apply(lambda x: tokenizer.encode(str(x), add_special_tokens=True))

# Right-pad every sequence with 0 up to the longest one so they form a matrix.
max_len = max(map(len, tokenized.values))
padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
# 1 for non-zero ids, 0 for the padding added above.
attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Inference only: no gradients are needed.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Keep the output vector at sequence position 0 for every tweet
# (presumably the [CLS] token, as in the guide linked above — confirm).
features = last_hidden_states[0][:, 0, :].numpy()
labels = data['sentiment']

# Fixed seed so the train/test split (and the metrics logged later) is
# reproducible across kernel restarts.
SPLIT_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(features, labels, random_state=SPLIT_SEED)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MLflow

In [None]:
# mlflow is already installed by the first cell; this repeats the install quietly.
!pip install mlflow --quiet
# Launch the MLflow UI on port 5000; the trailing "&" backgrounds the process
# so the cell returns immediately.
get_ipython().system_raw("mlflow ui --port 5000 &")
# NOTE(review): psycopg2 is not referenced anywhere in the visible part of this
# notebook — confirm it is actually needed.
!pip install psycopg2



In [None]:
# Try to start an MLflow tracking server backed by a local SQLite database.
# NOTE(review): the recorded result of this cell is 256, i.e. a non-zero value
# from os.system, so the command did not succeed. Possibly port 5000 is already
# taken by the `mlflow ui` process started in the previous cell, or /mlruns is
# not writable — confirm. The command also has no trailing "&", so a successful
# start would block the cell.
os.system('mlflow server \
    --backend-store-uri sqlite:///mlflow.db \
    --default-artifact-root /mlruns \
    --host 0.0.0.0')

256

In [None]:
# connect to the server
# Reuse the URL constant defined next to the imports instead of repeating the
# literal; the trailing slash is stripped so the tracking URI is exactly
# "http://127.0.0.1:5000", as before.
mlflow.set_tracking_uri(MLFLOW_SERVER_URL.rstrip('/'))
experiment_name = 'sentiment-classifier'
model_name = 'sentiment-model'

In [None]:
# Make `experiment_name` the active experiment (the recorded output shows it
# being created on first use).
mlflow.set_experiment(experiment_name)

2022/05/05 20:39:05 INFO mlflow.tracking.fluent: Experiment with name 'sentiment-classifier' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='sentiment-classifier', tags={}>

In [None]:
# Hyper-parameter grid as (C, max_iter) pairs: every C value for
# max_iter=100 first, then the same C values for max_iter=1000.
params = [
    (c_value, iterations)
    for iterations in (100, 1000)
    for c_value in (0.1, 0.001, 10)
]

In [None]:
# Train one LogisticRegression per (C, max_iter) pair and record each fit as a
# separate MLflow run with its parameters and test-set metrics.
mlflow.set_experiment(experiment_name)
for c, max_iter in params:
    with mlflow.start_run() as run:
        # Create model
        lr = LogisticRegression(C=c, max_iter=max_iter)
        # Fit and predict
        lr.fit(train_x, train_y)
        predictions = lr.predict(test_x)

        # These metric names are kept unchanged because the best-run lookup
        # further down sorts on "rmse".
        rmse = np.sqrt(mean_squared_error(test_y, predictions))
        mae = mean_absolute_error(test_y, predictions)
        r2 = r2_score(test_y, predictions)
        # This is a classifier, so also record the fraction of correct
        # predictions. np.asarray avoids index alignment with the Series.
        accuracy = float(np.mean(predictions == np.asarray(test_y)))

        print(f"LogisticRegression (c={c}, max_iter={max_iter})")
        print("RMSE: %s" % rmse)
        print("MAE: %s" % mae)
        print("R2: %s" % r2)
        print("Accuracy: %s" % accuracy)

        mlflow.log_params({"c": c, "max_iter": max_iter})
        mlflow.log_metrics({"rmse": rmse, "r2": r2, "mae": mae, "accuracy": accuracy})

LogisticRegression (c=0.1, max_iter=100)
RMSE: 0.07738232325341368
MAE: 0.005988023952095809
R2: 0.9760401721664276
LogisticRegression (c=0.001, max_iter=100)
RMSE: 0.08935341032175405
MAE: 0.007984031936127744
R2: 0.96805356288857


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression (c=10, max_iter=100)
RMSE: 0.0
MAE: 0.0
R2: 1.0
LogisticRegression (c=0.1, max_iter=1000)
RMSE: 0.07738232325341368
MAE: 0.005988023952095809
R2: 0.9760401721664276
LogisticRegression (c=0.001, max_iter=1000)
RMSE: 0.08935341032175405
MAE: 0.007984031936127744
R2: 0.96805356288857
LogisticRegression (c=10, max_iter=1000)
RMSE: 0.0
MAE: 0.0
R2: 1.0


In [None]:
# get prod model by uri
# NOTE(review): `model_uri` is not used in the visible part of this notebook,
# and no visible cell registers a model under `model_name` — confirm.
model_uri = f'models:/{model_name}/production'
client = mlflow.tracking.MlflowClient()

# Get best run: the run with the lowest logged "rmse" in the experiment.
exp = client.get_experiment_by_name(experiment_name)
if exp is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on the next line.
    raise RuntimeError(f"MLflow experiment '{experiment_name}' does not exist")

runs = client.search_runs([exp.experiment_id], order_by=["metrics.rmse ASC"], max_results=1)
if not runs:
    raise RuntimeError(f"MLflow experiment '{experiment_name}' has no runs to choose from")

best_run = runs[0]
print(f"Best run: {best_run.info.run_id} {best_run.data.metrics['rmse']}")

Best run: f318d8d2ad234bdb84afa76b54f61382 0.0
