# Transformer Model Experiment

Author: Lu ZhiPing

In [1]:
from sentiment.dataset.load_dataset import LoadDataset
from sentiment.dataset.tokenizer import SimpleTokenizer
from transformers import AutoTokenizer

# Tokenizer handed to LoadDataset below (presumably the source of the `tokens`
# column seen later — confirm in sentiment.dataset).
simple_tokenizer = SimpleTokenizer()

# Load every row ("max") of the AStarCOVID collection in the PLP database.
dataset = LoadDataset(
    database_name="PLP",
    collection_name="AStarCOVID",
    n_rows="max",
    tokenizer=simple_tokenizer,
    column_name="Text",
)
dataset

NOTICE: sentiment log file will be at /home/ubuntu/miniconda3/envs/tweet/lib/python3.8/site-packages/sentiment-0.0.1-py3.8.egg/sentiment/logs/sentiment.log


[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2022-11-04 19:58:47,697 : INFO : Initialized Mongo Connection to db:PLP, collection: AStarCOVID



        Database: Database(MongoClient(host=['192.168.50.72:27017'], document_class=dict, tz_aware=False, connect=True), 'PLP'),
        Collection: Collection(Database(MongoClient(host=['192.168.50.72:27017'], document_class=dict, tz_aware=False, connect=True), 'PLP'), 'AStarCOVID')
        Length : 161390
        Sample: {'Text': 'If given the choice, would you rather endure a vexing commute before '
         'sitting all day in a cubicle, or roll over and open your laptop '
         'under swaying coconut  palms? \n'
         'https://t.co/crMAJQGOUn\n'
         '@mediarsAccel #startuplife',
 '_id': ObjectId('63463b0f7380598a23658104'),
 'anger_intensity': 0.494,
 'country_region': 'Singapore',
 'date_stamp': '2020-11-21 00:00:00',
 'emotion_category': 'anger',
 'fear_intensity': 0.491,
 'joy_intensity': 0.341,
 'keyword_used': 'covid',
 'sadness_intensity': 0.494,
 'sentiment_category': 'negative',
 't1': 1,
 't10': 1,
 't2': 1,
 't3': 1,
 't4': 1,
 't5': 1,
 't6': 1,
 't7': 1,
 '

In [2]:
# Materialise the loaded collection as a pandas DataFrame for the rest of the notebook.
df = dataset.to_pandas()

2022-11-04 19:59:12,704 : INFO : Returning Pandas DataFrame with maximum row: 161390
100%|██████████| 161390/161390 [00:42<00:00, 3775.58it/s]


In [3]:
# Preview the first rows to sanity-check the columns returned by the loader.
df.head()

Unnamed: 0,_id,tweet_ID,user_ID,t1,t2,t3,t4,t5,t6,t7,...,fear_intensity,sadness_intensity,joy_intensity,sentiment_category,emotion_category,keyword_used,country_region,date_stamp,Text,tokens
0,634637137380598a236355ae,1245550415581716481,37874853,1,0,0,0,0,0,0,...,0.49,0.437,0.281,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec..."
1,634637137380598a236355af,1245550321511718912,44290654,1,0,0,0,0,0,0,...,0.49,0.437,0.281,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec..."
2,634637137380598a236355b0,1245550270190419969,115624161,1,1,1,1,0,0,0,...,0.512,0.446,0.162,negative,fear,covid,Singapore,2020-04-02 00:00:00,Quarantine stress baking? 😆\n\nhttps://t.co/zH...,"[quarantine, stress, baking, 😆, URL]"
3,634637137380598a236355b1,1245550206457954305,20155794,1,0,0,0,0,0,0,...,0.423,0.34,0.319,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,Every vaccine and treatment in development for...,"[every, vaccine, treatment, development, covid..."
4,634637137380598a236355b2,1245548702233583618,35202527,1,1,1,1,0,0,0,...,0.348,0.425,0.255,negative,sadness,covid,Singapore,2020-04-02 00:00:00,this was the second read.. \nhttps://t.co/wYID...,"[wa, second, read, .., URL]"


In [4]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161390 entries, 0 to 161389
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   _id                 161390 non-null  object 
 1   tweet_ID            161390 non-null  int64  
 2   user_ID             161390 non-null  int64  
 3   t1                  161390 non-null  int64  
 4   t2                  161390 non-null  int64  
 5   t3                  161390 non-null  int64  
 6   t4                  161390 non-null  int64  
 7   t5                  161390 non-null  int64  
 8   t6                  161390 non-null  int64  
 9   t7                  161390 non-null  int64  
 10  t8                  161390 non-null  int64  
 11  t9                  161390 non-null  int64  
 12  t10                 161390 non-null  int64  
 13  valence_intensity   161390 non-null  float64
 14  anger_intensity     161390 non-null  float64
 15  fear_intensity      161390 non-nul

In [5]:
from random import randint

# randint() is inclusive on BOTH ends, so the upper bound must be len(df) - 1;
# passing len(df) would occasionally make df.iloc[index] raise IndexError.
index = randint(0, len(df) - 1)
sample_row = df.iloc[index]  # look the row up once instead of three times

print(f"Index: {index}")
print(f"Text: {sample_row.Text}\n")
print(f"Time: {sample_row.date_stamp}\n")
print(f"Label: {sample_row.sentiment_category}")

Index: 60976
Text: #HRNews Malaysia's Prime Minister Tan Sri Muhyiddin announced yesterday (7 June) that the country will exit the Conditional Movement Control Order on 9 June as planned, and enter into the next phase on the road to COVID-19 recovery. https://t.co/mgXicxW39G

Time: 2020-06-08 00:00:00

Label: positive


In [26]:
# Keep only the label, the raw tweet text and its tokens; .copy() gives an
# independent frame so later column assignments do not touch the wide one.
df = df.loc[:, ["sentiment_category", "Text", "tokens"]].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161390 entries, 0 to 161389
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   sentiment_category  161390 non-null  object
 1   Text                161390 non-null  object
 2   tokens              161390 non-null  object
dtypes: object(3)
memory usage: 3.7+ MB


In [5]:
# Preview the reduced three-column frame.
df.head()

Unnamed: 0,sentiment_category,Text,tokens
0,neutral,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec..."
1,neutral,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec..."
2,negative,Quarantine stress baking? 😆\n\nhttps://t.co/zH...,"[quarantine, stress, baking, 😆, URL]"
3,neutral,Every vaccine and treatment in development for...,"[every, vaccine, treatment, development, covid..."
4,negative,this was the second read.. \nhttps://t.co/wYID...,"[wa, second, read, .., URL]"


In [6]:
from sklearn.preprocessing import LabelEncoder

# fit() returns the encoder itself, so construction and fitting can be chained.
le = LabelEncoder().fit(df["sentiment_category"])
le

In [7]:
# classes_ is a NumPy array rather than a plain list.
type(le.classes_)

numpy.ndarray

In [8]:
# Printed form of the learned class names.
print(le.classes_)

['negative' 'neutral' 'positive' 'very negative' 'very positive']


In [9]:
# Same classes via the rich repr, which also shows the array dtype.
le.classes_

array(['negative', 'neutral', 'positive', 'very negative',
       'very positive'], dtype=object)

In [10]:
# Map each class name to its integer position in le.classes_.
label2id = {label: position for position, label in enumerate(le.classes_)}
label2id

{'negative': 0,
 'neutral': 1,
 'positive': 2,
 'very negative': 3,
 'very positive': 4}

In [11]:
# Reverse lookup: integer id -> class name.
id2label = dict(zip(label2id.values(), label2id.keys()))
id2label

{0: 'negative',
 1: 'neutral',
 2: 'positive',
 3: 'very negative',
 4: 'very positive'}

In [12]:
# Encode the string sentiment as an integer target column.
# fit_transform refits `le` on the same column it was fitted on earlier in the notebook.
df["labels"] = le.fit_transform(df["sentiment_category"])
df.head()

Unnamed: 0,sentiment_category,Text,tokens,labels
0,neutral,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec...",1
1,neutral,HDB closes Bukit Merah branch office after sec...,"[hdb, close, bukit, merah, branch, office, sec...",1
2,negative,Quarantine stress baking? 😆\n\nhttps://t.co/zH...,"[quarantine, stress, baking, 😆, URL]",0
3,neutral,Every vaccine and treatment in development for...,"[every, vaccine, treatment, development, covid...",1
4,negative,this was the second read.. \nhttps://t.co/wYID...,"[wa, second, read, .., URL]",0


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the final test set. random_state pins the shuffle
# so the split (and every metric computed from it) is reproducible after a
# kernel restart instead of changing on every run.
train, test = train_test_split(df, test_size=0.1, random_state=42)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145251 entries, 135790 to 126706
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   sentiment_category  145251 non-null  object
 1   Text                145251 non-null  object
 2   tokens              145251 non-null  object
 3   labels              145251 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 5.5+ MB


In [14]:
from datasets import Dataset

# Wrap both pandas splits as Hugging Face Datasets. The shuffled pandas index is
# carried along as an extra `__index_level_0__` column (visible in the output).
hg_dataset_train = Dataset.from_pandas(train)
hg_dataset_test = Dataset.from_pandas(test)
hg_dataset_train

Dataset({
    features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__'],
    num_rows: 145251
})

In [15]:
# Carve a 20% validation split out of the training data. NOTE: this rebinds
# `hg_dataset_train` from a Dataset to a DatasetDict with "train"/"test" keys,
# so the cell cannot be re-run without re-creating the Dataset first.
# A fixed seed keeps the split reproducible between runs.
hg_dataset_train = hg_dataset_train.train_test_split(test_size=0.2, seed=42)

In [16]:
# Inspect the resulting train/validation split sizes.
hg_dataset_train

DatasetDict({
    train: Dataset({
        features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__'],
        num_rows: 116200
    })
    test: Dataset({
        features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__'],
        num_rows: 29051
    })
})

In [17]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [18]:
def tokenize_function(doc):
    """Tokenize the ``"Text"`` field of a batch of examples.

    Args:
        doc: Mapping with a ``"Text"`` entry (a string or a list of strings, as
            supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    # truncation=True clips over-length inputs to the tokenizer's maximum model
    # length, so one unusually long text cannot crash the forward pass.
    return tokenizer(doc['Text'], truncation=True)

# Tokenize both splits in batches; this adds `input_ids` and `attention_mask` columns.
tokenized_dataset = hg_dataset_train.map(tokenize_function, batched=True)

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

In [19]:
# Confirm the tokenizer outputs were appended to both splits.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 116200
    })
    test: Dataset({
        features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 29051
    })
})

In [20]:
# Drop the columns the model does not take as input, leaving the labels and
# the tokenizer outputs.
unused_columns = ["sentiment_category", "Text", "tokens", "__index_level_0__"]
tokenized_dataset = tokenized_dataset.remove_columns(unused_columns)
tokenized_dataset.set_format("torch")
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'attention_mask']

In [21]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
# Collator that pads the examples of each batch so they can be stacked.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training batches are reshuffled; validation batches keep dataset order.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["test"],
    batch_size=16,
    collate_fn=data_collator,
)

In [22]:
# Pull a single batch to check the tensor shapes produced by the collator.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 109]),
 'attention_mask': torch.Size([16, 109])}

In [23]:
from transformers import AutoModelForSequenceClassification

# Pass BOTH label maps so the model config can translate ids <-> class names in
# either direction; previously only id2label was supplied, leaving label2id
# out of sync with it.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
)
# Smoke test: one forward pass on the sample batch to confirm that a loss is
# produced and that the logits have one column per class.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

tensor(1.6813, grad_fn=<NllLossBackward0>) torch.Size([16, 5])


In [24]:
from torch.optim import AdamW
from transformers import get_scheduler

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
# The training loop steps the scheduler once per batch, so the schedule length
# is epochs * batches-per-epoch.
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

21789


In [25]:
import torch

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [26]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Fine-tune: put the model in training mode and iterate over every batch of every epoch.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Order matters: apply the update, advance the LR schedule, then clear
        # the gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/21789 [00:00<?, ?it/s]

In [27]:
import evaluate

# Accuracy on the validation split (the "test" key of the DatasetDict, served
# by eval_dataloader).
metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class = index of the largest logit along the class axis.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.788750817527796}

In [44]:
# The 10% hold-out built from the pandas-level `test` split.
hg_dataset_test

Dataset({
    features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__'],
    num_rows: 16139
})

In [51]:
# Tokenize the hold-out set with the same function used for the training data.
tokenized_dataset_test = hg_dataset_test.map(tokenize_function, batched=True)
tokenized_dataset_test

  0%|          | 0/17 [00:00<?, ?ba/s]

Dataset({
    features: ['sentiment_category', 'Text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 16139
})

In [52]:
# Drop the same four non-model columns that were removed from the train/validation splits.
tokenized_dataset_test = tokenized_dataset_test.remove_columns(["sentiment_category", "Text", "tokens", "__index_level_0__"])
tokenized_dataset_test

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 16139
})

In [53]:
# Unshuffled loader over the hold-out set, reusing the padding collator.
test_loader = DataLoader(
    tokenized_dataset_test, batch_size=16, collate_fn=data_collator
)

In [54]:
# Accuracy on the hold-out set, also collecting raw predictions and labels
# batch by batch for the classification report computed afterwards.
metric = evaluate.load("accuracy")
model.to(device)
model.eval()

y_pred, y_true = [], []

for batch in tqdm(test_loader, total=len(test_loader)):
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        logits = model(**batch).logits

    predictions = logits.argmax(dim=-1)
    references = batch["labels"]
    metric.add_batch(predictions=predictions, references=references)
    # Store NumPy copies on the CPU, one array per batch.
    y_pred.append(predictions.detach().cpu().numpy())
    y_true.append(references.detach().cpu().numpy())

metric.compute()

  0%|          | 0/1009 [00:00<?, ?it/s]

{'accuracy': 0.7890823471094863}

In [34]:
import numpy as np
from sklearn.metrics import classification_report
from pprint import pprint
# Flatten the per-batch arrays collected during evaluation before scoring.
labels_flat = np.concatenate(y_true)
preds_flat = np.concatenate(y_pred)
pprint(
    classification_report(
        y_true=labels_flat,
        y_pred=preds_flat,
        target_names=le.classes_,
        output_dict=True,
    )
)

{'accuracy': 0.7890823471094863,
 'macro avg': {'f1-score': 0.7191905597681043,
               'precision': 0.7209556333310562,
               'recall': 0.7193939755434734,
               'support': 16139},
 'negative': {'f1-score': 0.8604024943310657,
              'precision': 0.8764255810596218,
              'recall': 0.8449547668754349,
              'support': 7185},
 'neutral': {'f1-score': 0.5511462450592884,
             'precision': 0.5159857904085258,
             'recall': 0.5914489311163895,
             'support': 2947},
 'positive': {'f1-score': 0.8445180889257111,
              'precision': 0.8588279348436623,
              'recall': 0.8306772908366534,
              'support': 5522},
 'very negative': {'f1-score': 0.6955345060893099,
                   'precision': 0.6781002638522428,
                   'recall': 0.7138888888888889,
                   'support': 360},
 'very positive': {'f1-score': 0.6443514644351465,
                   'precision': 0.6754385964912281,

In [35]:
# Reminder of the id -> class-name mapping used to read the report.
id2label

{0: 'negative',
 1: 'neutral',
 2: 'positive',
 3: 'very negative',
 4: 'very positive'}

In [36]:
# Class order of the encoder; matches the id2label mapping shown above.
le.classes_

array(['negative', 'neutral', 'positive', 'very negative',
       'very positive'], dtype=object)

In [55]:
from transformers import TextClassificationPipeline
# Move the fine-tuned model to the CPU before wrapping it in a pipeline.
model.to("cpu")


# top_k=len(le.classes_) asks for a score for every class, not only the best one.
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=len(le.classes_))
pipeline(test.iloc[0].Text)

[[{'label': 'positive', 'score': 0.9513806700706482},
  {'label': 'neutral', 'score': 0.04848935082554817},
  {'label': 'negative', 'score': 0.00012132480333093554},
  {'label': 'very positive', 'score': 8.566411452193279e-06},
  {'label': 'very negative', 'score': 9.446768922671822e-10}]]

In [40]:
# First rows of the pandas hold-out split, for comparison with the predictions below.
test.head()

Unnamed: 0,sentiment_category,Text,tokens,labels
56288,positive,"COVID-Safe Worksite, COVID-Safe Workforce, COV...","[covid, safe, worksite, covid, safe, workforce...",2
29035,positive,Our staffing agency in Singapore is here to he...,"[staffing, agency, singapore, help, find, posi...",2
39498,negative,"While some bash Covid-19 parties, others want ...","[bash, covid, 19, party, others, want, immunit...",0
94534,negative,Giganet Launches Home Broadband to 3 Million ...,"[giganet, launch, home, broadband, 3, million,...",0
41526,negative,Dear @CMO_Odisha @Naveen_Odisha ji as you know...,"[dear, cmo_odisha, naveen_odisha, ji, know, co...",0


In [41]:
# Compare the stored label with the top-scoring prediction for the first five
# hold-out rows.
for position in range(5):
    sample = test.iloc[position]
    best = pipeline(sample.Text)[0][0]
    print(f'Train Label: {sample.sentiment_category},\tModel Prediction: {best["label"]}')

Train Label: positive,	Model Prediction: positive
Train Label: positive,	Model Prediction: positive
Train Label: negative,	Model Prediction: negative
Train Label: negative,	Model Prediction: negative
Train Label: negative,	Model Prediction: negative


In [56]:
import os
# Working directory; the relative "../.." paths used below resolve against it.
os.getcwd()

'/home/ubuntu/tweet_sentiment/sentiment/model'

In [58]:
# List the directory two levels up, where the reports folder is created below.
os.listdir("../..")

['.idea',
 '.gitignore',
 'LICENSE',
 'README.md',
 '.git',
 'dist',
 'sentiment.egg-info',
 'sentiment',
 'build',
 'setup.py',
 'requirements.txt',
 '.env']

In [59]:
import json
# Per-class precision/recall/F1 for the hold-out set, as a nested dict.
report = classification_report(
    y_true=np.concatenate(y_true),
    y_pred=np.concatenate(y_pred),
    output_dict=True,
    target_names=le.classes_,
)

# makedirs(exist_ok=True) creates the folder only when it is missing, without
# the check-then-create race of isdir() followed by mkdir().
reports_dir = "../../reports"
os.makedirs(reports_dir, exist_ok=True)
with open(os.path.join(reports_dir, "BertReport.json"), "w") as file:
    json.dump(report, file)