<a href="https://colab.research.google.com/github/Metachondria/SFT_BERT_for_text_classification/blob/main/SFT_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune BERT for text classification

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/datafromcompetition/train.csv
/kaggle/input/datafromcompetition/test.csv


In [None]:
import numpy as np
import pandas as pd

import transformers
from transformers import BertForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup, AdamW, BertConfig
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, accuracy_score

import re
import spacy

import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
data = pd.read_csv('/kaggle/input/datafromcompetition/train.csv')
test = pd.read_csv('/kaggle/input/datafromcompetition/test.csv')

In [None]:
len(data.text)

7613

# Preprocessing

In [None]:
nlp = spacy.load("en_core_web_sm")
def clean_text(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    text = text.lower()
    text = re.sub(r"http\S+", "",text)
    html =re.compile(r'<.*?>')
    text = html.sub(r'',text)
    text = nlp(text)
    cleaned_text = [token.lemma_ for token in text if not token.is_stop and not token.is_punct]


    return cleaned_text

In [None]:
data['text'] = data['text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[deed, reason, earthquake, allah, forgive]",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[resident, ask, shelter, place, notify, office...",1
3,6,,,"[13,000, people, receive, wildfire, evacuation...",1
4,7,,,"[get, send, photo, ruby, alaska, smoke, wildfi...",1


In [None]:
data.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [None]:
s = '<h1>Hello, my name is Ben, I`m human😁. "http://t.co/utbxlcbiuy" </h>'
clean_text(s)

['hello', 'ben', 'i`m', 'human']

In [None]:
tweets = data['text'].values
labels = data['target'].values

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, clean_up_tokenization_spaces=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
tokenizer(tweets[0], return_tensors='pt',max_length=1024, truncation=True)['input_ids']

tensor([[  101, 15046,   102],
        [  101,  3114,   102],
        [  101,  8372,   102],
        [  101, 16455,   102],
        [  101,  9641,   102]])

In [None]:
max_len = 0


for tweet in tweets:
    input_ids = tokenizer.encode(tweet, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  30


In [None]:
input_ids = []
attention_masks = []


for tweet in tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        add_special_tokens=True,
                        max_length=max_len,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )


    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', tweets[0])
print('Token IDs:', input_ids[0])

Original:  ['deed', 'reason', 'earthquake', 'allah', 'forgive']
Token IDs: tensor([  101, 15046,  3114,  8372, 16455,  9641,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [None]:
dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
train_size = int(0.8 * len(dataset))

val_size = len(dataset)  - train_size


train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

6,090 training samples
1,523 validation samples


In [None]:
batch_size = 16


train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size #
        )

In [None]:
config = BertConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.35,
    attention_probs_dropout_prob=0.35
)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!nvidia-smi

Tue Dec 24 13:49:02 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P0             27W /   70W |     575MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

# Fine-Tune BERT

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )



In [None]:
epochs = 4


total_steps = len(train_dataloader) * epochs


scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
def compute_f1(predictions, labels):
    return f1_score(labels, predictions, average='binary')

def compute_acc(predictions, labels):
    return accuracy_score(labels, predictions)

In [None]:
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    all_predictions = []
    all_labels = []

    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()


        total_train_loss += loss.item()


        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        all_predictions.extend(preds)
        all_labels.extend(labels.cpu().numpy())

    # train_f1 = compute_f1(all_predictions, all_labels)
    acc = compute_acc(all_predictions, all_labels)

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"  Training loss: {avg_train_loss:.4f}")
    print(f"  acc: {acc:.4f}")

    model.eval()
    total_val_loss = 0
    all_val_predictions = []
    all_val_labels = []

    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            all_val_predictions.extend(preds)
            all_val_labels.extend(labels.cpu().numpy())


    val_acc = compute_acc(all_val_predictions, all_val_labels)
    avg_val_loss = total_val_loss / len(validation_dataloader)
    print(f"  Val accuracy: {val_acc:.4f}")

Epoch 1/4
  Training loss: 0.5782
  acc: 0.6970
  Val accuracy: 0.7892
Epoch 2/4
  Training loss: 0.4738
  acc: 0.7900
  Val accuracy: 0.7951
Epoch 3/4
  Training loss: 0.4405
  acc: 0.8085
  Val accuracy: 0.7958
Epoch 4/4
  Training loss: 0.4249
  acc: 0.8161
  Val accuracy: 0.8017


# Submit predictions

In [None]:
test['text'] = test['text'].apply(lambda x: clean_text(x))

In [None]:
tweets_test = test['text'].values

In [None]:
test.text

0                          [happen, terrible, car, crash]
1         [hear, earthquake, different, city, stay, safe]
2       [forest, fire, spot, pond, geese, flee, street...
3               [apocalypse, lighting, spokane, wildfire]
4            [typhoon, soudelor, kill, 28, china, taiwan]
                              ...                        
3258    [earthquake, safety, los, angeles, ûò, safety...
3259    [storm, ri, worse, hurricane, city&amp;3other,...
3260                   [green, line, derailment, chicago]
3261       [meg, issue, hazardous, weather, outlook, hwo]
3262    [cityofcalgary, activate, municipal, emergency...
Name: text, Length: 3263, dtype: object

In [None]:
test_input_ids = []
test_attention_masks = []
for tweet in tweets_test:
    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        add_special_tokens=True,
                        max_length=max_len,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt',
                   )


    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])

test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

In [None]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size
        )

In [None]:
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()

            predictions.extend(list(pred_flat))

In [None]:
df_output = pd.DataFrame()
df_output['id'] = test['id']
df_output['target'] = predictions
df_output.to_csv('submission1.csv', index=False)

In [1]:
# Public score 0.81458