In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Read the annotation file (it is given explicit column names here) and key
# every row by its tweet id.
df = pd.read_csv(
    '/content/smile-annotations-final.csv',
    names=['id', 'text', 'category'],
).set_index('id')

In [3]:
# Preview the first rows of the loaded frame (text and category, indexed by id).
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [4]:
# Look at one raw tweet, selected by position, to see what the text looks like.
df['text'].iloc[78]

'Behind the walls @_TheWhitechapel http://t.co/mylVk96joW'

In [5]:
# Class distribution before any filtering.
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
nocode,1572
happy,1137
not-relevant,214
angry,57
surprise,35
sad,32
happy|surprise,11
happy|sad,9
disgust|angry,7
disgust,6


In [6]:
# Drop rows annotated with more than one emotion (categories such as
# 'happy|sad'). The pipe is matched literally with regex=False: the previous
# pattern '\|' relied on an invalid string escape, which Python reports as a
# warning, and the match needs no regex at all.
df = df[~df.category.str.contains('|', regex=False)]

  df=df[~df.category.str.contains('\|')]


In [7]:
# Remove the rows whose category is 'nocode'.
df = df[df['category'].ne('nocode')]

In [8]:
# Class distribution after filtering.
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
happy,1137
not-relevant,214
angry,57
surprise,35
sad,32
disgust,6


In [9]:
# Distinct category names, in order of first appearance in the frame.
possible_labels = df['category'].unique()

In [10]:
# Map each category name to an integer id, numbered in the order the names
# appear in `possible_labels`.
label_dict = {name: position for position, name in enumerate(possible_labels)}

In [11]:
# Display the category-name -> integer-id mapping built above.
label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [12]:
# Attach the integer class id for every row.
# - `.map` is the direct lookup for a name -> id dictionary; `.replace` was
#   being used as a lookup and emitted a warning when run.
# - `assign` returns a new frame instead of writing a column into `df`, which
#   at this point is a filtered selection of the originally loaded frame.
# Every category is a key of `label_dict` (it was built from the unique values
# of this same column), so the lookup leaves no missing values.
df = df.assign(labels=df['category'].map(label_dict))
df.head(10)

  df['labels']=df.category.replace(label_dict)


Unnamed: 0_level_0,text,category,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0
614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy,0
613601881441570816,Yr 9 art students are off to the @britishmuseu...,happy,0
613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant,1
610746718641102848,#AskTheGallery Have you got plans to privatise...,not-relevant,1
612648200588038144,@BarbyWT @britishmuseum so beautiful,happy,0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Split the tweet ids (the frame index) into train and validation parts,
# holding out 15% for validation. Stratifying on the label keeps the class
# proportions the same in both parts; the fixed random_state makes the split
# repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(
    df.index.to_numpy(),
    df['labels'].to_numpy(),
    test_size=0.15,
    random_state=17,
    stratify=df['labels'].to_numpy(),
)

In [15]:
# Start every row with a placeholder split tag; it is overwritten once the
# train/validation ids are applied.
df['data_type'] = 'not_set'

In [16]:
# Tag each row with the split its id was assigned to.
for split_name, split_ids in (('train', X_train), ('val', X_val)):
    df.loc[split_ids, 'data_type'] = split_name

In [17]:
# Row counts per (category, label id, split) to check how each class was divided.
df.groupby(['category','labels','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,labels,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


In [18]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [19]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint,
# lower-casing the input text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
def _encode_split(split_name, max_length=256):
    """Tokenize one split of `df` and return its tensors.

    Reads the notebook-level `df` and `tokenizer`.

    Args:
        split_name: Value of `df.data_type` selecting the rows ('train' or 'val').
        max_length: Every sequence is truncated or padded to exactly this many
            tokens (padding='max_length' together with truncation=True).

    Returns:
        A tuple `(encoded, input_ids, attention_mask, labels)` where `encoded`
        is the tokenizer output, `input_ids` and `attention_mask` are taken
        from it, and `labels` is a tensor built from the `labels` column of
        the selected rows (same row order as the encoded texts).
    """
    split_df = df[df.data_type == split_name]

    encoded = tokenizer.batch_encode_plus(
        split_df.text.values,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    labels = torch.tensor(split_df.labels.values)
    return encoded, encoded['input_ids'], encoded['attention_mask'], labels


# Both splits go through the same encoding so their tensors line up in shape.
(encoded_data_train,
 input_ids_train,
 attention_masks_train,
 labels_train) = _encode_split('train')

(encoded_data_val,
 input_ids_val,
 attention_masks_val,
 labels_val) = _encode_split('val')



In [21]:
# Bundle ids, attention masks and labels so each dataset item is one
# (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(
    input_ids_train,
    attention_masks_train,
    labels_train,
)
dataset_val = TensorDataset(
    input_ids_val,
    attention_masks_val,
    labels_val,
)

In [22]:
# Number of examples in the training dataset.
len(dataset_train)

1258

In [23]:
# Number of examples in the validation dataset.
len(dataset_val)

223

In [24]:
from transformers import BertForSequenceClassification

In [25]:
# Pretrained BERT encoder with a classification head sized to the number of
# categories in `label_dict`. Attention maps and hidden states are not
# returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(label_dict),
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [27]:
# Training batches: 4 examples each, drawn in random order.
batch_size = 4
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train),
)

# Validation batches: 32 examples each, read in dataset order.
# Note that `batch_size` is 32 from here on.
batch_size = 32
dataloader_val = DataLoader(
    dataset_val,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val),
)

In [28]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [29]:
# AdamW over all model parameters (encoder and classification head alike).
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [30]:
epochs = 10

# Linear learning-rate schedule with no warmup. The total step count is one
# step per training batch for every epoch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=epochs * len(dataloader_train),
    num_warmup_steps=0,
)

In [31]:
import numpy as np

In [32]:
from sklearn.metrics import f1_score

In [33]:
# NOTE(review): this module-level list is not read by any other visible cell;
# the metric functions take their own `preds` parameter, which shadows it.
# Looks like a leftover — confirm before removing.
preds=[]

In [40]:
def f1_score_func(preds, labels):
    """Weighted F1 between predicted classes and the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the column with the highest score.
        labels: Array of true class ids.

    Returns:
        The value of `f1_score(..., average='weighted')`.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [41]:
def accuracy_per_class(preds, labels):
    """Print correct/total counts for every class that occurs in `labels`.

    Class names are looked up in the notebook-level `label_dict`.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of true class ids.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Restrict to the samples whose true class is `class_id`.
        in_class = actual == class_id
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        n_total = int(np.count_nonzero(in_class))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total} \n')

In [42]:
import random

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
# NOTE(review): this cell runs after the model was created in In [25], whose
# classifier weights are reported as newly initialized, so that
# initialization is not covered by this seed — consider seeding before the
# model is built.
seed_val = 17
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)

In [43]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

print(device)

cuda


In [44]:
def evaluate(dataloader_val):
    """Run the notebook-level `model` over a dataloader without gradients.

    The model is switched to eval mode and left in that mode on return.

    Args:
        dataloader_val: Iterable of batches; each batch holds, in order, the
            input ids, the attention mask and the labels.

    Returns:
        A tuple `(loss_val_avg, predictions, true_vals)`:
        the mean of the per-batch losses, the logits of all batches
        concatenated along axis 0 as a NumPy array, and the labels of all
        batches concatenated the same way.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    with torch.no_grad():
        for raw_batch in dataloader_val:
            moved = [tensor.to(device) for tensor in raw_batch]
            input_ids, attention_mask, labels = moved[0], moved[1], moved[2]

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            running_loss += outputs.loss.item()

            # Move results to host memory so they can be concatenated later.
            logit_chunks.append(outputs.logits.detach().cpu().numpy())
            label_chunks.append(labels.detach().cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

In [45]:
# Fine-tuning loop: one pass over `dataloader_train` per epoch, then a
# validation pass that reports the average loss and the weighted F1.
for epoch in range(1, epochs + 1):

    # evaluate() switches the model to eval mode, so training mode is
    # re-enabled at the start of every epoch.
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc=f'Epoch {epoch}', leave=False)

    for batch in progress_bar:
        # Clear the gradients accumulated by the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)
        inputs = dict(
            input_ids=batch[0],
            attention_mask=batch[1],
            labels=batch[2],
        )

        outputs = model(**inputs)
        loss = outputs.loss

        loss_train_total += loss.item()
        loss.backward()

        # Cap the overall gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(train_loss=f'{loss.item():.3f}')

    loss_train_avg = loss_train_total / len(dataloader_train)

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)

    print(
        f'\nEpoch {epoch}',
        f'Training loss: {loss_train_avg:.4f}',
        f'Validation loss: {val_loss:.4f}',
        f'Validation F1 (weighted): {val_f1:.4f}',
        sep='\n',
    )

Epoch 1:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.4374
Validation loss: 0.6790
Validation F1 (weighted): 0.7777


Epoch 2:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.3359
Validation loss: 0.6787
Validation F1 (weighted): 0.8313


Epoch 3:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.2087
Validation loss: 0.7805
Validation F1 (weighted): 0.8394


Epoch 4:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.1344
Validation loss: 0.7803
Validation F1 (weighted): 0.8327


Epoch 5:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.0973
Validation loss: 0.7864
Validation F1 (weighted): 0.8561


Epoch 6:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.0492
Validation loss: 0.8053
Validation F1 (weighted): 0.8532


Epoch 7:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.0289
Validation loss: 0.8112
Validation F1 (weighted): 0.8427


Epoch 8:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.0226
Validation loss: 0.8368
Validation F1 (weighted): 0.8462


Epoch 9:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.0171
Validation loss: 0.8487
Validation F1 (weighted): 0.8487


Epoch 10:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.0144
Validation loss: 0.8487
Validation F1 (weighted): 0.8487


In [46]:
import os

# Make sure the output directory exists; an already existing one is fine.
os.makedirs("models", exist_ok=True)

# Store the label mapping next to the weights so predicted class ids can be
# translated back to category names when the file is loaded.
saved_state = {
    'model_state_dict': model.state_dict(),
    'label_dict': label_dict,
}
torch.save(saved_state, 'models/bert_sentiment.pt')

In [47]:
# Reload the saved checkpoint and report the final validation metrics.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all this file holds (a state dict plus a str -> int mapping). It
# keeps the load safe and gives the same behavior on torch versions where the
# default is still a full unpickle.
checkpoint = torch.load(
    'models/bert_sentiment.pt',
    map_location=device,
    weights_only=True,
)

model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

val_loss, predictions, true_vals = evaluate(dataloader_val)

print("Final Validation Loss:", val_loss)
print("Final Validation F1:", f1_score_func(predictions, true_vals))

# Per-class breakdown: number of correct predictions out of the samples in each class.
accuracy_per_class(predictions, true_vals)

Final Validation Loss: 0.8486621709806579
Final Validation F1: 0.8486958733075859
Class: happy
Accuracy: 159/171 

Class: not-relevant
Accuracy: 20/32 

Class: angry
Accuracy: 7/9 

Class: disgust
Accuracy: 0/1 

Class: sad
Accuracy: 2/5 

Class: surprise
Accuracy: 2/5 

