# Tutorial NLU - Finetuning SmSA
SmSA is a Sentiment Analysis dataset with 3 possible labels: `positive`, `negative`, and `neutral`

In [1]:
import os, sys
sys.path.append('../')
os.chdir('../')
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import fasttext
from nltk import word_tokenize

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
from utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [2]:
###
# common functions
###
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [3]:
# Set random seed
set_seed(26092020)

# Prepare Dataset

In [4]:
train_dataset_path = './dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = './dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = './dataset/smsa_doc-sentiment-prosa/test_preprocess.tsv'

In [5]:
train_df = pd.read_csv(train_dataset_path, sep='\t', header=None)
valid_df = pd.read_csv(valid_dataset_path, sep='\t', header=None)
test_df = pd.read_csv(test_dataset_path, sep='\t', header=None)

train_df.columns = ['text', 'label']
valid_df.columns = ['text', 'label']
test_df.columns = ['text', 'label']

In [6]:
train_df.head(10)

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
5,"makanan beragam , harga makanan di food stall ...",positive
6,pakai kartu kredit bca tidak untung malah rugi...,negative
7,"tempat unik , bagus buat foto , makanan enak ,...",positive
8,saya bersama keluarga baru saja menikmati peng...,positive
9,bersyukur,positive


In [7]:
def get_label_idx(label):
    if label == 'positive':
        return 2
    if label == 'negative':
        return 1
    if label == 'neutral':
        return 0
train_df['label'] = train_df['label'].apply(get_label_idx)
valid_df['label'] = valid_df['label'].apply(get_label_idx)
test_df['label'] = test_df['label'].apply(get_label_idx)

In [8]:
train_df.head(10)

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,2
1,mohon ulama lurus dan k212 mmbri hujjah partai...,0
2,lokasi strategis di jalan sumatera bandung . t...,2
3,betapa bahagia nya diri ini saat unboxing pake...,2
4,duh . jadi mahasiswa jangan sombong dong . kas...,1
5,"makanan beragam , harga makanan di food stall ...",2
6,pakai kartu kredit bca tidak untung malah rugi...,1
7,"tempat unik , bagus buat foto , makanan enak ,...",2
8,saya bersama keluarga baru saja menikmati peng...,2
9,bersyukur,2


# Traditional Approach
- Bag of Word
- TF-IDF
- Word Vector

# Bag-of-Word Model
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

<img src="bag_of_word.png"/>

## Count Vectorizer (default)

In [9]:
vectorizer = CountVectorizer()
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [10]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(17241,
 array(['00', '000', '001', '01', '010', '0111', '011770465655617', '02',
        '021', '022', '030360019614718', '0361', '04', '05', '0561', '07',
        '08', '081147286649', '081377744845', '08156189559'], dtype=object))

In [11]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 47.7 s, sys: 1min 37s, total: 2min 25s
Wall time: 5.16 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 566 ms, sys: 1.15 s, total: 1.72 s
Wall time: 57.1 ms


{'ACC': 0.9810909090909091,
 'F1': 0.9829617694342899,
 'REC': 0.9828632926562729,
 'PRE': 0.9831031020207582}

In [13]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 50.6 ms, sys: 41.7 ms, total: 92.4 ms
Wall time: 8.99 ms


{'ACC': 0.8746031746031746,
 'F1': 0.8383965071776482,
 'REC': 0.8267771483892248,
 'PRE': 0.8546862276279086}

In [14]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 5.68 ms, sys: 124 µs, total: 5.81 ms
Wall time: 5.02 ms


{'ACC': 0.782,
 'F1': 0.7395238527221748,
 'REC': 0.7219508432743726,
 'PRE': 0.7948786330245928}

## Count Vectorizer (N-Gram)

In [15]:
vectorizer = CountVectorizer(ngram_range=(1,3))
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [16]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(391358,
 array(['00', '00 04', '00 04 00', '00 16', '00 16 00', '00 21',
        '00 21 30', '00 agak', '00 agak mahal', '00 agar', '00 agar tidak',
        '00 atau', '00 atau sampai', '00 dan', '00 dan dari',
        '00 dan masih', '00 dan setiap', '00 dari', '00 dari menu',
        '00 disodori'], dtype=object))

In [17]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 6min 19s, sys: 9min 4s, total: 15min 24s
Wall time: 45.2 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 35.3 ms, sys: 0 ns, total: 35.3 ms
Wall time: 33 ms


{'ACC': 0.9983636363636363,
 'F1': 0.9985637461674889,
 'REC': 0.9989747207456717,
 'PRE': 0.9981560629716001}

In [19]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 9.59 ms, sys: 0 ns, total: 9.59 ms
Wall time: 8.23 ms


{'ACC': 0.8904761904761904,
 'F1': 0.8475910022128509,
 'REC': 0.8310159656272837,
 'PRE': 0.8721277704167449}

In [20]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 8.44 ms, sys: 0 ns, total: 8.44 ms
Wall time: 7 ms


{'ACC': 0.784,
 'F1': 0.7315185957006912,
 'REC': 0.7173116915763975,
 'PRE': 0.7889916271300157}

## Count Vectorizer (N-Gram + Filtering)

In [21]:
vectorizer = CountVectorizer(min_df=3, ngram_range=(1,3))
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [22]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(31829,
 array(['00', '00 dan', '000', '000 dan', '000 orang', '000 per',
        '000 per porsi', '000 porsi', '000 rp', '000 rupiah', '000 untuk',
        '01', '021', '05', '07', '07 00', '08', '09', '09 00', '10'],
       dtype=object))

In [23]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 1min 4s, sys: 2min 9s, total: 3min 14s
Wall time: 6.57 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 610 ms, sys: 1.3 s, total: 1.91 s
Wall time: 63 ms


{'ACC': 0.9895454545454545,
 'F1': 0.9904695485530183,
 'REC': 0.991719969502986,
 'PRE': 0.9892579704727803}

In [25]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 4.54 ms, sys: 722 µs, total: 5.26 ms
Wall time: 4.97 ms


{'ACC': 0.8865079365079365,
 'F1': 0.8452497775910217,
 'REC': 0.8406379309451012,
 'PRE': 0.8509939811942893}

In [26]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 6.7 ms, sys: 0 ns, total: 6.7 ms
Wall time: 5.73 ms


{'ACC': 0.818,
 'F1': 0.7789645345628964,
 'REC': 0.7641916906622788,
 'PRE': 0.8172109111408878}

## Count Vectorizer (N-Gram + Filtering v2)

In [27]:
vectorizer = CountVectorizer(min_df=5, ngram_range=(1,4), max_features=10000)
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [28]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(10000,
 array(['00', '000', '000 untuk', '10', '10 ribu', '10 tahun', '100',
        '100 000', '100 ribu', '1000', '11', '12', '13', '14', '15',
        '15 menit', '15 ribu', '150', '16', '17'], dtype=object))

In [29]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 54.4 s, sys: 1min 52s, total: 2min 46s
Wall time: 5.54 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 276 ms, sys: 672 ms, total: 948 ms
Wall time: 31.5 ms


{'ACC': 0.9815454545454545,
 'F1': 0.9773292783607032,
 'REC': 0.9821751510041,
 'PRE': 0.9726701439372767}

In [31]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 71.9 ms, sys: 216 ms, total: 288 ms
Wall time: 9.74 ms


{'ACC': 0.8888888888888888,
 'F1': 0.8432603991514438,
 'REC': 0.8443535288593939,
 'PRE': 0.8430157748514255}

In [32]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 90.9 ms, sys: 122 ms, total: 212 ms
Wall time: 7.2 ms


{'ACC': 0.824,
 'F1': 0.7810418941195736,
 'REC': 0.7646287535993418,
 'PRE': 0.8269000798884538}

## Count Vectorizer (300D)

In [33]:
vectorizer = CountVectorizer(ngram_range=(1,3), max_features=300)
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [34]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(300,
 array(['ada', 'ada di', 'ada yang', 'adalah', 'agak', 'akan', 'aku',
        'anak', 'anda', 'apa', 'apalagi', 'area', 'atas', 'atau', 'ayam',
        'bagi', 'bagus', 'baik', 'bakar', 'bakso'], dtype=object))

In [35]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 366 ms, sys: 1.53 ms, total: 367 ms
Wall time: 367 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 14.7 ms, sys: 1.81 ms, total: 16.5 ms
Wall time: 14.3 ms


{'ACC': 0.8342727272727273,
 'F1': 0.7842994615016484,
 'REC': 0.8017245144441164,
 'PRE': 0.7706329981964172}

In [37]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 6.86 ms, sys: 0 ns, total: 6.86 ms
Wall time: 5.79 ms


{'ACC': 0.8063492063492064,
 'F1': 0.7398947719823138,
 'REC': 0.7552179368796123,
 'PRE': 0.7286074612213174}

In [38]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 5.3 ms, sys: 1.06 ms, total: 6.36 ms
Wall time: 5.17 ms


{'ACC': 0.656,
 'F1': 0.6007126805728679,
 'REC': 0.5946084144613556,
 'PRE': 0.6588807430212555}

## TF-IDF

<img src="bag_of_word.png"/>

DF(about) = 2 &nbsp; | &nbsp; DF(bird) = 3 &nbsp; | &nbsp; DF(heard) = 1  &nbsp; | &nbsp; DF(is) = 1 &nbsp; | &nbsp; DF(the) = 3 &nbsp; | &nbsp; DF(word) = 1 &nbsp; | &nbsp; DF(you) = 1<br/>


## Formula
<img src="tf_idf.png"/>

## TF-IDF Vectorizer
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [39]:
vectorizer = TfidfVectorizer()
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [40]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(17241,
 array(['00', '000', '001', '01', '010', '0111', '011770465655617', '02',
        '021', '022', '030360019614718', '0361', '04', '05', '0561', '07',
        '08', '081147286649', '081377744845', '08156189559'], dtype=object))

In [41]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 43.6 s, sys: 1min 25s, total: 2min 9s
Wall time: 4.3 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 537 ms, sys: 1.17 s, total: 1.71 s
Wall time: 56.7 ms


{'ACC': 0.9352727272727273,
 'F1': 0.9284235049152748,
 'REC': 0.9162891631604279,
 'PRE': 0.9423197196893929}

In [43]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 133 ms, sys: 301 ms, total: 434 ms
Wall time: 14.8 ms


{'ACC': 0.8746031746031746,
 'F1': 0.8351209923387012,
 'REC': 0.8135745131288976,
 'PRE': 0.8669546384817232}

In [44]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 4.95 ms, sys: 0 ns, total: 4.95 ms
Wall time: 4.31 ms


{'ACC': 0.73,
 'F1': 0.6679165117865385,
 'REC': 0.6542175145116321,
 'PRE': 0.7709099094573956}

## TF-IDF Vectorizer + Filtering

In [45]:
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1,3), max_features=20000)
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [46]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(20000,
 array(['00', '000', '000 dan', '000 orang', '000 per', '000 per porsi',
        '000 porsi', '000 rupiah', '000 untuk', '01', '05', '09', '10',
        '10 000', '10 malam', '10 menit', '10 orang', '10 pagi', '10 ribu',
        '10 tahun'], dtype=object))

In [47]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 52.8 s, sys: 1min 42s, total: 2min 35s
Wall time: 5.2 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 363 ms, sys: 874 ms, total: 1.24 s
Wall time: 41.3 ms


{'ACC': 0.94,
 'F1': 0.9273045632388275,
 'REC': 0.9122309581630504,
 'PRE': 0.9449576176074709}

In [49]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 159 ms, sys: 336 ms, total: 495 ms
Wall time: 16.4 ms


{'ACC': 0.8817460317460317,
 'F1': 0.8360318397305712,
 'REC': 0.8109905932639521,
 'PRE': 0.8742494254266702}

In [50]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 86.7 ms, sys: 70.4 ms, total: 157 ms
Wall time: 6.82 ms


{'ACC': 0.76,
 'F1': 0.6775857115182596,
 'REC': 0.6697032542620778,
 'PRE': 0.7881429681429681}

## TF-IDF (300D)

In [51]:
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1,3), max_features=300)
train_input = vectorizer.fit_transform(train_df['text'])
valid_input = vectorizer.transform(valid_df['text'])
test_input = vectorizer.transform(test_df['text'])

In [52]:
len(vectorizer.get_feature_names_out()), vectorizer.get_feature_names_out()[0:20]

(300,
 array(['ada', 'ada di', 'ada yang', 'adalah', 'agak', 'akan', 'aku',
        'anak', 'anda', 'apa', 'apalagi', 'area', 'atas', 'atau', 'ayam',
        'bagi', 'bagus', 'baik', 'bakar', 'bakso'], dtype=object))

In [53]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 370 ms, sys: 1.72 ms, total: 372 ms
Wall time: 371 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 14.8 ms, sys: 1.25 ms, total: 16 ms
Wall time: 14.3 ms


{'ACC': 0.8262727272727273,
 'F1': 0.7680363475976594,
 'REC': 0.7660091038195164,
 'PRE': 0.7702586980596919}

In [55]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 3.13 ms, sys: 2.58 ms, total: 5.71 ms
Wall time: 4.88 ms


{'ACC': 0.807936507936508,
 'F1': 0.7297792963402681,
 'REC': 0.7275002115388721,
 'PRE': 0.7329049003305242}

In [56]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 4.3 ms, sys: 368 µs, total: 4.67 ms
Wall time: 4.41 ms


{'ACC': 0.61,
 'F1': 0.55401489537221,
 'REC': 0.5486939531057179,
 'PRE': 0.6182570483385178}

# Word Vector Model

<img src="word_vector.png"/>

**== Widely-used Word Vector Model ==**
- CBOW (Continuous Bag-of-word)
- Skip-Gram -> **FastText**
- GLoVe

Download pre-trained FastText model (support 157 languages): https://fasttext.cc/docs/en/crawl-vectors.html
<img src="fasttext.png"/>

In [57]:
# Uncommnet and run the following line to download indonesian (id) fasttext embedding
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz cc.id.300.bin.gz
# !gunzip cc.id.300.bin.gz

In [58]:
wv_model = fasttext.load_model("./tutorial/cc.id.300.bin")



In [59]:
def encode_fasttext(sentence):
    word_vectors = []
    for word in word_tokenize(sentence):
        word_vectors.append(wv_model[word])
    return np.stack(word_vectors)

train_df['vector'] = train_df['text'].apply(encode_fasttext)
valid_df['vector'] = valid_df['text'].apply(encode_fasttext)
test_df['vector'] = test_df['text'].apply(encode_fasttext)

In [60]:
train_input = train_df['vector'].apply(lambda x: np.mean(x, axis=0)).tolist()
valid_input = valid_df['vector'].apply(lambda x: np.mean(x, axis=0)).tolist()
test_input = test_df['vector'].apply(lambda x: np.mean(x, axis=0)).tolist()

In [61]:
%%time
model = LogisticRegression()
model = model.fit(train_input, train_df['label'])

CPU times: user 23.3 s, sys: 20.6 s, total: 44 s
Wall time: 1.9 s


In [62]:
%%time
hyps = model.predict(train_input)
document_sentiment_metrics_fn(hyps, train_df['label'])

CPU times: user 1.91 s, sys: 2.98 s, total: 4.89 s
Wall time: 733 ms


{'ACC': 0.8449090909090909,
 'F1': 0.8083042451537518,
 'REC': 0.782718918024996,
 'PRE': 0.843391795515397}

In [63]:
%%time
hyps = model.predict(valid_input)
document_sentiment_metrics_fn(hyps, valid_df['label'])

CPU times: user 217 ms, sys: 264 ms, total: 481 ms
Wall time: 16.4 ms


{'ACC': 0.8420634920634921,
 'F1': 0.791869039515769,
 'REC': 0.7703950140265868,
 'PRE': 0.8206850094183347}

In [64]:
%%time
hyps = model.predict(test_input)
document_sentiment_metrics_fn(hyps, test_df['label'])

CPU times: user 202 ms, sys: 247 ms, total: 449 ms
Wall time: 15.3 ms


{'ACC': 0.67,
 'F1': 0.6037776632851872,
 'REC': 0.5947426756250285,
 'PRE': 0.7024387416556741}

# Deep Learning Approach

## Background
Previous approach only handle a fixed amount of features for predictions, requiring statistics derived features from the original sequence which causing loss of information from the original sequence. Can we make it better?

#### Logistic Regression

<img src="logistic_regression.png"/>

<img src="logistic_regression.png"/>

#### Backpropagation

<img src="backprop.png"/>

#### Gradient Descent

<img src="gradient_descent.png"/>

#### Chain Rule

<div style="background-color: white; padding:10px;">
<img src="chain_rule.png"/>
</div>

## Types of Deep Learning / Neural Network Model

#### Multi Layer Perceptron

<div style="background-color: white; text-align: center;">
<img src="mlp.png"/>
</div>

#### Convolution Neural Network (CNN)
- Convolutional Neural Networks for Visual Recognition (Stanford CS231) - https://cs231n.github.io/convolutional-networks/
- CNN Tutorial (Udacity) - https://github.com/udacity/deep-learning-v2-pytorch/tree/master/convolutional-neural-networks
- But what is a convolution? (3blue1brown) - https://www.youtube.com/watch?v=KuXjwB4LzSA

<img src="cnn.png"/>

#### Recurrent Neural Network (RNN)
- Cheatsheet RNN (Stanford CS230) - https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-recurrent-neural-networks
- RNN Tutorial (Udacity) - https://github.com/udacity/deep-learning-v2-pytorch/tree/master/recurrent-neural-networks
- Sentiment Analysis w/ RNN (Udacity) - https://github.com/udacity/deep-learning-v2-pytorch/tree/master/sentiment-rnn

<img src="rnn.png"/>

## Transformer Model

#### Why Transformer?
<img src="why_transformer.png"/>

#### Variant of Transformer Models

- Encoder-Only: BERT, RoBERTa, ALBERT, ELECTRA, mBERT, XLM-R, ...
- Decoder-Only: GPT2, GPT3, BLOOM, ...
- Encoder-Decoder: BART, T5, mBART, mT5, T0, ...

#### Pre-trained Language Models
- Transformer needs a huge amount of data to train
- To mitigate this problem, many researchers have built various pre-trained transformer models (pretrained LM)
- Rather than training the transformers model from scratch, we can simply use the existing pre-trained models

**Notes**: For language understanding (NLU) tasks, such as sentence classification and sequence tagging, we can simply use an **Encoder-only** model

## How to use Transformer models?

- Use HuggingFace `transformers` package to load a pre-trained model
- Create Dataset & Dataloader for training, validation & testing
- Setting hyperparameter including optimizer
- Run training for N epochs
    - Retrieve a batch of data
    - Compute output & loss
    - Perform backpropagation
    - Update model using the optimizer
    - Run validation per epoch, early stopping if needed
- Evaluate the trained model on the test data

## Load Model
https://huggingface.co/models

In [65]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [67]:
count_param(model)

124443651

## Prepare Dataset

In [80]:
train_dataset_path = './dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = './dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = './dataset/smsa_doc-sentiment-prosa/test_preprocess.tsv'

In [81]:
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)  
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)  
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

In [70]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Fine Tuning & Evaluation

In [71]:
optimizer = optim.Adam(model.parameters(), lr=5e-6)
model = model.cuda()

In [72]:
# Train
n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.2960 LR:0.00000500: 100%|█| 344/344 [01:31<00:00,  3.75it


(Epoch 1) TRAIN LOSS:0.2960 ACC:0.89 F1:0.85 REC:0.82 PRE:0.88 LR:0.00000500


VALID LOSS:0.1760 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91: 100%|█| 40/40 [00:06<00:00


(Epoch 1) VALID LOSS:0.1760 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91


(Epoch 2) TRAIN LOSS:0.1368 LR:0.00000500: 100%|█| 344/344 [01:31<00:00,  3.75it


(Epoch 2) TRAIN LOSS:0.1368 ACC:0.95 F1:0.94 REC:0.94 PRE:0.94 LR:0.00000500


VALID LOSS:0.1785 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91: 100%|█| 40/40 [00:06<00:00


(Epoch 2) VALID LOSS:0.1785 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91


(Epoch 3) TRAIN LOSS:0.0946 LR:0.00000500: 100%|█| 344/344 [01:31<00:00,  3.74it


(Epoch 3) TRAIN LOSS:0.0946 ACC:0.97 F1:0.96 REC:0.96 PRE:0.97 LR:0.00000500


VALID LOSS:0.1729 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93: 100%|█| 40/40 [00:06<00:00


(Epoch 3) VALID LOSS:0.1729 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93


(Epoch 4) TRAIN LOSS:0.0634 LR:0.00000500: 100%|█| 344/344 [01:32<00:00,  3.71it


(Epoch 4) TRAIN LOSS:0.0634 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98 LR:0.00000500


VALID LOSS:0.1943 ACC:0.94 F1:0.92 REC:0.91 PRE:0.93: 100%|█| 40/40 [00:06<00:00


(Epoch 4) VALID LOSS:0.1943 ACC:0.94 F1:0.92 REC:0.91 PRE:0.93


(Epoch 5) TRAIN LOSS:0.0437 LR:0.00000500: 100%|█| 344/344 [01:31<00:00,  3.74it


(Epoch 5) TRAIN LOSS:0.0437 ACC:0.99 F1:0.99 REC:0.98 PRE:0.99 LR:0.00000500


VALID LOSS:0.2164 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93: 100%|█| 40/40 [00:06<00:00

(Epoch 5) VALID LOSS:0.2164 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93





In [82]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp
    list_label += batch_label
metrics = document_sentiment_metrics_fn(list_hyp, list_label)
print("TEST Metrics | {}".format(metrics_to_string(metrics)))

100%|███████████████████████████████████████████| 16/16 [00:07<00:00,  2.13it/s]

TEST Metrics | ACC:0.90 F1:0.86 REC:0.84 PRE:0.92





## Test fine-tuned model on sample sentences

In [83]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (99.747%)


In [84]:
text = 'Budi pergi ke pondok indah mall membeli cakwe'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Budi pergi ke pondok indah mall membeli cakwe | Label : neutral (99.690%)


In [85]:
text = 'Dasar anak sialan!! Kurang ajar!!'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Dasar anak sialan!! Kurang ajar!! | Label : negative (99.895%)


# Conclusion

- What we have learnt:
    - Bag-of-words (BOW)
    - TF-IDF
    - Word Vector (fasttext)
    - A brief introduction to deep learning
    - How to use BERT model for Sentiment Analysis
<br/> <br/>
- What we can conclude:
    - Tranditional approach employs sequence statistics (word occurences (TF), domain frequency (DF), etc) to derive features
    - Word vector model can capture word level semantic
    - Neural network is simply a linear / logistic regression model with multiple layers
    - Deep learning can help to process unstructure data with dynamic dimension
    - Pre-trained BERT model can achieve much better result compared to traditional approaches

**notes**: If we tune more, we can achieve an even higher results

<img src="./indonlu_result.png"/>