<a href="https://colab.research.google.com/github/Mekatebi/NMA_DL_2023_Project/blob/main/NMA_DL_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Codes for NMA DL 2023 project
[Doc](https://docs.google.com/document/d/1Oh_WXq_B8Tlgt-A1j7EOfHFZrHP-iDAI1nQmS0hMD6w)

## Fine-Tune BERT

### Set-up environment

In [1]:
!pip install -q transformers transformers[torch] datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
import torch
import evaluate
import numpy as np

### Load dataset

In [3]:
dataset = load_dataset('csv', data_files='/content/500_Reddit_users_posts_labels.csv')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-7041fe2d13db438e/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-7041fe2d13db438e/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['User', 'Post', 'Label'],
        num_rows: 500
    })
})

In [5]:
example = dataset['train'][0]
example

{'User': 'user-0',
 'Post': "['Its not a viable option, and youll be leaving your wife behind. Youd Pain her beyond comprehension.It sucks worrying about money, I know that first hand. It can definitely feel hopeless, as you seem to be Tired aware of. Your wife might need to chip in financially. I know time is an issue, but even 10-15 hours a Asthenia could alleviate a lot of the pressure. In the meantime, get your shit together - write that resume tomorrow. No excuses, get it done and send it out. Whether you believe in some sort of powerful being or force governing things or not, things really do work themselves out. This is a big test for you, and youll pull through. Just try to stay as positive as you can and everything will work out.']",
 'Label': 'Supportive'}

### Preprocess

In [6]:
def not_none(example):
    return example['Post'] is not None

dataset = dataset.filter(not_none)

dataset_sampled = dataset['train'].train_test_split(test_size=0.01, seed=21)['train']

train_val_test = dataset_sampled.train_test_split(test_size=0.2, seed=21)
train_dataset = train_val_test['train']
test_val_dataset = train_val_test['test']

test_val_split = test_val_dataset.train_test_split(test_size=0.5, seed=21)
validation_dataset = test_val_split['train']
test_dataset = test_val_split['test']

columns_to_keep = ['Post', 'Label']

columns_to_remove = [col for col in dataset_sampled.column_names if col not in columns_to_keep]

train_dataset = train_dataset.remove_columns(columns_to_remove)
validation_dataset = validation_dataset.remove_columns(columns_to_remove)
test_dataset = test_dataset.remove_columns(columns_to_remove)

le = LabelEncoder()

le.fit(dataset_sampled['Label'])

def encode_labels(example):
    example['Label'] = le.transform([example['Label']])[0]
    return example

train_dataset = train_dataset.map(encode_labels)
validation_dataset = validation_dataset.map(encode_labels)
test_dataset = test_dataset.map(encode_labels)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/396 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [7]:
train_dataset

Dataset({
    features: ['Post', 'Label'],
    num_rows: 396
})

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prepare_data(example):
    encoding = tokenizer.encode_plus(
        example['Post'],
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(example['Label'], dtype=torch.long)
    }

train_dataset = train_dataset.map(prepare_data)
validation_dataset = validation_dataset.map(prepare_data)
test_dataset = test_dataset.map(prepare_data)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/396 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [9]:
train_dataset

Dataset({
    features: ['Post', 'Label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 396
})

### Evaluate

In [10]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

### Train

In [11]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=len(le.classes_))

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
training_args = TrainingArguments(
    output_dir='./Model',
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=4
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.561343,0.306122
2,No log,1.552013,0.306122
3,No log,1.553613,0.306122
4,No log,1.465112,0.346939
5,No log,1.305988,0.44898
6,No log,1.549283,0.346939
7,No log,1.684014,0.326531
8,No log,1.620231,0.387755


TrainOutput(global_step=200, training_loss=1.1926821899414062, metrics={'train_runtime': 362.9527, 'train_samples_per_second': 8.728, 'train_steps_per_second': 0.551, 'total_flos': 833558275325952.0, 'train_loss': 1.1926821899414062, 'epoch': 8.0})

In [15]:
trainer.evaluate(test_dataset)

{'eval_loss': 1.3134009838104248,
 'eval_accuracy': 0.52,
 'eval_runtime': 1.6484,
 'eval_samples_per_second': 30.333,
 'eval_steps_per_second': 4.247,
 'epoch': 8.0}