In [1]:
# Install the third-party packages this notebook relies on.
# `transformers` and `pytorch_lightning` are imported in the next cell.
# `hf-hub-lightning` presumably provides the HuggingFaceHubCallback used in
# EmotionRecognitionModel.fit — TODO confirm and import it explicitly.
# NOTE(review): versions are not pinned, so a fresh run may resolve to
# different releases than the ones recorded in the cell output.
!pip install transformers
!pip install pytorch_lightning
!pip install hf-hub-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling Py

In [2]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import BertTokenizer, BertModel
from torchvision.transforms import transforms
import torchvision.models as models
import torch
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
from huggingface_hub import hf_hub_download
import pandas as pd
from PIL import Image

### 导入数据集

In [3]:
class GetIMGandTXTDataset(torch.utils.data.Dataset):
    """Paired image/text dataset backed by a pandas DataFrame.

    Each row of the frame must provide the columns ``file_number`` (sample
    id), ``img`` (path of the image file) and ``text_content`` (raw text).
    An optional ``label`` column adds a class-index target to every sample.

    Args:
        image_transform: callable applied to the RGB ``PIL.Image`` of a row.
        tokenizer: tokenizer exposing ``encode_plus`` (a ``BertTokenizer``
            in this notebook).
        random_state: accepted for interface compatibility; not used.
        dataframe: the samples frame described above.
    """

    # Fixed token length every text is padded/truncated to. Kept as a class
    # attribute so instances restored by unpickling (which bypasses
    # ``__init__``) can still resolve it.
    max_length = 64

    def __init__(
        self,
        image_transform,
        tokenizer=None,
        random_state=0,
        dataframe=None,
    ):
        self.samples_frame = dataframe
        # Frames produced by splitting/filtering keep their original index
        # labels, while __getitem__ addresses rows 0..len-1 through ``.loc``.
        # Re-index a copy so both agree; the caller's frame is not modified.
        if isinstance(dataframe, pd.DataFrame):
            self.samples_frame = dataframe.reset_index(drop=True)
        self.image_transform = image_transform
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the samples frame."""
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """Build the sample dict for row ``idx``.

        Returns:
            dict with keys ``id``, ``image``, ``text`` (token ids),
            ``content`` (raw text), ``attention_mask`` and, when the frame
            has a ``label`` column, ``label`` (0-dim ``torch.long`` tensor).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        frame = self.samples_frame
        img_id = frame.loc[idx, "file_number"]
        content = frame.loc[idx, "text_content"]

        image = Image.open(frame.loc[idx, "img"]).convert("RGB")
        image = self.image_transform(image)

        encoded_dict = self.tokenizer.encode_plus(
            content,                        # Sentence to encode.
            add_special_tokens=True,        # Add '[CLS]' and '[SEP]'.
            max_length=self.max_length,
            padding='max_length',           # Pad short sentences ...
            truncation=True,                # ... and actually cut long ones.
            return_attention_mask=True,     # Construct attention masks.
            return_tensors='pt',            # Return PyTorch tensors.
        )

        sample = {
            "id": img_id,
            "image": image,
            "text": encoded_dict['input_ids'],
            "content": content,
            "attention_mask": encoded_dict['attention_mask'],
        }
        if "label" in frame.columns:
            sample["label"] = torch.tensor(
                frame.loc[idx, "label"], dtype=torch.long
            )
        return sample


In [4]:
# Download the serialized test dataset from the project's GitHub repository
# into the current working directory.
!wget https://github.com/JasonXQH/AI_lab5_multiple_emotion_recognition/raw/main/content/drive/Mydrive/lab5_data/test_dataset.pt
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code — only load files from a source you trust.
# The stored object is presumably a GetIMGandTXTDataset instance, in which
# case that class must already be defined when this runs — TODO confirm.
test_dataset = torch.load("./test_dataset.pt")

--2022-07-12 06:45:43--  https://github.com/JasonXQH/AI_lab5_multiple_emotion_recognition/raw/main/content/drive/Mydrive/lab5_data/test_dataset.pt
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/JasonXQH/AI_lab5_multiple_emotion_recognition/main/content/drive/Mydrive/lab5_data/test_dataset.pt [following]
--2022-07-12 06:45:43--  https://raw.githubusercontent.com/JasonXQH/AI_lab5_multiple_emotion_recognition/main/content/drive/Mydrive/lab5_data/test_dataset.pt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1023599 (1000K) [application/octet-stream]
Saving to: ‘test_dataset.pt’


2022-07-12 06:45

### 生成dataloader

In [5]:
# Ordered (non-shuffled) loader that yields the test samples one at a time.
# With batch_size=1 the batch setting inside the concat model must be 1 too.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=16,
)

  cpuset_checked))


### 模型定义

In [6]:
class LanguageAndVisionConcat(torch.nn.Module):
    """Fusion classifier that concatenates text and image features.

    The first-token hidden state of ``language_module`` and the flattened
    output of ``vision_module`` are passed through ReLU, concatenated,
    projected by ``fusion`` (ReLU + dropout) and classified by ``fc``.

    Args:
        num_classes: number of output classes.
        loss_fn: callable ``loss_fn(logits, label)``; it receives the raw,
            un-normalised class scores.
        language_module: module called as
            ``language_module(input_ids=..., attention_mask=...)`` whose
            result exposes ``last_hidden_state``.
        vision_module: module mapping an image batch to feature vectors.
        language_feature_dim: width of the text feature vector.
        vision_feature_dim: width of the flattened image feature vector.
        fusion_output_size: width of the fused hidden layer.
        dropout_p: dropout probability applied after the fusion layer.
        batch_size: stored on the instance; not used by ``forward``.
    """

    def __init__(
        self,
        num_classes,
        loss_fn,
        language_module,
        vision_module,
        language_feature_dim,
        vision_feature_dim,
        fusion_output_size,
        dropout_p,
        batch_size
    ):
        super(LanguageAndVisionConcat, self).__init__()
        self.language_module = language_module
        self.vision_module = vision_module
        self.fusion = torch.nn.Linear(
            in_features=(language_feature_dim + vision_feature_dim),
            out_features=fusion_output_size
        )
        self.fc = torch.nn.Linear(
            in_features=fusion_output_size,
            out_features=num_classes
        )
        self.batch_size = batch_size
        self.loss_fn = loss_fn
        self.dropout = torch.nn.Dropout(dropout_p)

    def forward(self, text, image, mask, label=None):
        """Classify a batch of (text, image) pairs.

        Args:
            text: token ids; any leading dimensions are collapsed so that a
                ``(batch, 1, seq)`` tensor becomes ``(batch, seq)``.
            image: image batch accepted by ``vision_module``.
            mask: attention mask shaped like ``text``.
            label: optional class-index targets of shape ``(batch,)``.

        Returns:
            ``(pred, loss)`` where ``pred`` holds the class probabilities of
            shape ``(batch, num_classes)`` and ``loss`` is
            ``loss_fn(logits, label)``, or ``None`` when no label is given.
        """
        # Collapse extra leading dims without touching the caller's tensors
        # (no in-place resize) and without assuming a batch size or length.
        text = text.reshape(-1, text.shape[-1])
        mask = mask.reshape(-1, mask.shape[-1])

        text_output = self.language_module(
            input_ids=text,
            attention_mask=mask,
        )
        # Hidden state of the first token, one vector per sample.
        text_features = torch.nn.functional.relu(
            text_output.last_hidden_state[:, 0]
        )
        image_features = torch.nn.functional.relu(self.vision_module(image))
        image_features = image_features.reshape(image_features.shape[0], -1)

        combined = torch.cat([text_features, image_features], dim=-1)
        fused = self.dropout(
            torch.nn.functional.relu(self.fusion(combined))
        )
        logits = self.fc(fused)
        pred = torch.nn.functional.softmax(logits, dim=-1)

        # The loss is computed on the logits: torch.nn.CrossEntropyLoss
        # applies log-softmax itself, so feeding it `pred` would squash the
        # scores twice.
        loss = self.loss_fn(logits, label) if label is not None else None
        return (pred, loss)

In [7]:
class EmotionRecognitionModel(pl.LightningModule):
    """Lightning wrapper around the BERT + ResNet-34 concat classifier.

    The constructor receives a plain ``model_hparams`` dict. Every setting
    is looked up there first and falls back to the module's Lightning
    ``hparams`` and finally to a built-in default (see ``_hparam``).

    ``train_acc`` / ``val_acc`` start as ``None``; assign callable metric
    objects to them to get accuracy logging. ``train_dataset`` /
    ``val_dataset`` must be assigned before calling ``fit``.
    """

    def __init__(self, model_hparams):
        super(EmotionRecognitionModel, self).__init__()
        self.model_hparams = model_hparams if model_hparams is not None else {}
        self.train_acc = None
        self.val_acc = None
        self.embedding_dim = self._hparam("embedding_dim", 300)
        self.language_feature_dim = self._hparam("language_feature_dim", 768)
        self.vision_feature_dim = self._hparam("vision_feature_dim", 1024)
        self.wandb_logger = None
        self.output_path = None
        self.train_dataset = None
        self.val_dataset = None
        self.model = self._build_model()
        self.trainer_params = self._get_trainer_params()

    def _hparam(self, name, default=None):
        """Return hyper-parameter ``name``.

        ``model_hparams`` is consulted first. ``self.hparams`` is kept as a
        fallback for setups that populate it; it is never filled by this
        class itself, which is why reading it alone ignored the user's
        configuration.
        """
        if name in self.model_hparams:
            return self.model_hparams[name]
        return self.hparams.get(name, default)

    def _run_labelled_batch(self, batch):
        """Move a labelled batch to the module's device and run ``forward``."""
        return self.forward(
            text=batch["text"].to(self.device),
            image=batch["image"].to(self.device),
            mask=batch["attention_mask"].to(self.device),
            label=batch["label"].to(self.device),
        )

    ## Required LightningModule Methods (when validating) ##

    def forward(self, text, image, mask, label=None):
        """Delegate to the wrapped model; returns ``(preds, loss)``."""
        return self.model(text, image, mask, label)

    def training_step(self, batch, batch_nb):
        """Run one training batch, log loss (and accuracy if set), return loss."""
        (preds, loss) = self._run_labelled_batch(batch)
        if self.train_acc is not None:
            self.train_acc(preds, batch["label"].to(preds.device))
            self.log('train_acc_step', self.train_acc, on_step=True, on_epoch=False)
        self.log("train_loss_step", loss)
        return loss

    def validation_step(self, batch, batch_nb):
        """Run one validation batch in eval mode and return its loss."""
        self.eval()
        (preds, loss) = self._run_labelled_batch(batch)
        if self.val_acc is not None:
            self.val_acc(preds, batch["label"].to(preds.device))
            self.log('val_acc_step', self.val_acc, on_step=True, on_epoch=False)
        return {"batch_val_loss": loss}

    def test_step(self, batch, batch_idx=None):
        """Predict class indices for an unlabelled batch.

        ``batch_idx`` is optional so the method works both when driven by a
        Lightning trainer and when called directly as ``test_step(batch)``.

        Returns:
            ``(batch['id'], label)`` where ``label`` is a NumPy array with
            the arg-max class index of every sample.
        """
        self.eval()
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            (preds, _) = self.forward(
                text=batch['text'].to(self.device),
                image=batch["image"].to(self.device),
                mask=batch['attention_mask'].to(self.device),
            )
        # .cpu() makes the NumPy conversion work for GPU tensors as well.
        preds = preds.detach().cpu().numpy()
        label = np.argmax(preds, axis=1)
        return batch['id'], label

    def training_epoch_end(self, outs):
        """Log the epoch-level training accuracy when a metric is set."""
        if self.train_acc is not None:
            self.log('train_acc_epoch', self.train_acc, on_epoch=True)

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses and log the result."""
        avg_loss = torch.stack(
            tuple(output["batch_val_loss"] for output in outputs)
        ).mean()
        self.log("avg_val_loss", avg_loss, on_epoch=True)
        if self.val_acc is not None:
            self.log('val_acc_epoch', self.val_acc, on_epoch=True)
        return {
            "val_loss": avg_loss,
            "progress_bar": {"avg_val_loss": avg_loss}
        }

    def configure_optimizers(self):
        """AdamW over the wrapped model with a StepLR schedule."""
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self._hparam("lr", 0.00001)
        )
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.1)
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'avg_val_loss'
        }

    def train_dataloader(self):
        """Shuffled loader over ``self.train_dataset``."""
        return DataLoader(
            self.train_dataset,
            shuffle=True,
            batch_size=self._hparam("batch_size", 16),
        )

    def val_dataloader(self):
        """Ordered loader over ``self.val_dataset``."""
        return DataLoader(
            self.val_dataset,
            shuffle=False,
            batch_size=self._hparam("batch_size", 16),
        )

    def fit(self):
        """Seed the RNGs, build a ``pl.Trainer`` and train this module."""
        self._set_seed(self._hparam("random_state", 42))
        # NOTE(review): HuggingFaceHubCallback is not imported in the visible
        # cells; it presumably comes from the hf-hub-lightning package
        # installed at the top of the notebook — import it before calling fit().
        self.trainer = pl.Trainer(accumulate_grad_batches=self.trainer_params["accumulate_grad_batches"],
                                  gpus = self.trainer_params["gpus"],
                                  max_epochs = self.trainer_params["max_epochs"],
                                  gradient_clip_val = self.trainer_params["gradient_clip_val"],
                                  weights_save_path = self.trainer_params["default_save_path"],
                                  callbacks=[HuggingFaceHubCallback('JasonXu/multimodel_emotion_recognize_with_bert_and_resnet'),
                                             self.trainer_params["early_stop_callback"],
                                             self.trainer_params["checkpoint_callback"]],
                                  logger=self.wandb_logger)
        self.trainer.fit(self)

    def _set_seed(self, seed):
        """Seed Python, NumPy and PyTorch (CPU and, if present, CUDA)."""
        # Imported locally: `random` is not among the notebook's imports.
        import random

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def _build_dataset(self, dataframe):
        """Wrap ``dataframe`` in a ``GetIMGandTXTDataset`` and return it."""
        # Reuse the tokenizer supplied in the hyper-parameters when there is
        # one; otherwise load the same pretrained tokenizer as before.
        tokenizer = self._hparam("tokenizer", None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                      do_lower_case=True)
        # NOTE(review): `image_transform` is read from module scope and is not
        # defined in the visible cells — define it before calling this method.
        return GetIMGandTXTDataset(
            dataframe=dataframe,
            image_transform=image_transform,
            tokenizer=tokenizer,
        )

    def _build_model(self):
        """Assemble the BERT text encoder, ResNet-34 image encoder and fusion head."""
        language_module = BertModel.from_pretrained(
            "bert-base-uncased",
            output_attentions = True,
            output_hidden_states = True,
        )

        vision_module = models.resnet34(
            pretrained=True
        )
        # Replace the classification head so the network emits a feature
        # vector of the configured width.
        vision_module.fc = torch.nn.Linear(
            in_features=vision_module.fc.in_features,
            out_features=self.vision_feature_dim
        )

        return LanguageAndVisionConcat(
            num_classes=self._hparam("num_classes", 3),
            loss_fn=nn.CrossEntropyLoss(),
            language_module=language_module,
            vision_module=vision_module,
            language_feature_dim=self.language_feature_dim,
            vision_feature_dim=self.vision_feature_dim,
            fusion_output_size=self._hparam("fusion_output_size", 512),
            dropout_p=self._hparam("dropout_p", 0.1),
            batch_size=self._hparam("batch_size", None),
        )

    def _get_trainer_params(self):
        """Create the checkpoint/early-stopping callbacks and Trainer settings."""
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=self.output_path,
            monitor="avg_val_loss",
            mode=self._hparam("checkpoint_monitor_mode", "min"),
            verbose=self._hparam("verbose", True)
        )

        early_stop_callback = pl.callbacks.EarlyStopping(
            # Default to the metric validation_epoch_end actually logs; with
            # strict=False an unknown metric name is silently ignored and
            # early stopping would never trigger.
            monitor=self._hparam("early_stop_monitor", "avg_val_loss"),
            min_delta=self._hparam("early_stop_min_delta", 0.001),
            patience=self._hparam("early_stop_patience", 4),
            strict=False,
            verbose=self._hparam("verbose", False),
        )

        trainer_params = {
            "checkpoint_callback": checkpoint_callback,
            "early_stop_callback": early_stop_callback,
            "default_save_path": self.output_path,
            "accumulate_grad_batches": self._hparam(
                "accumulate_grad_batches", 1
            ),
            "gpus": self._hparam("n_gpu", 1),
            "max_epochs": self._hparam("max_epochs", 100),
            "gradient_clip_val": self._hparam(
                "gradient_clip_value", 1
            ),
        }
        return trainer_params


### 导入checkpoint

In [8]:
# Google Drive location of the trained checkpoint. The cells below obtain the
# checkpoint through hf_hub_download instead and do not read this variable.
path = (
    "/content/drive/MyDrive/lab5_data/model-outputs/"
    "16x1792_1e-5_epoch=7-step=1800.ckpt"
)

In [9]:
# Hyper-parameter dictionary handed to EmotionRecognitionModel.
model_hparams = dict(
    embedding_dim=150,
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True),
    language_feature_dim=768,
    vision_feature_dim=1024,
    fusion_output_size=512,
    output_path="/content/drive/MyDrive/lab5_data/model-outputs",
    val_limit=None,
    lr=1e-5,
    dropout_p=0.1,
    max_epochs=20,
    n_gpu=1,
    batch_size=1,
    # Accumulating gradients over several steps emulates a larger batch.
    accumulate_grad_batches=16,
    early_stop_patience=3,
)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Fetch the trained checkpoint from the Hugging Face Hub; the call returns
# the local path of the downloaded file.
ckptpath = hf_hub_download(
    repo_id="JasonXu/multimodel_emotion_recognize_with_bert_and_resnet",
    filename="16x1792_1e-5_epoch=7-step=1800.ckpt",
)

Downloading:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

In [11]:
# Rebuild the Lightning module and restore its weights from the checkpoint.
# model_hparams is forwarded to EmotionRecognitionModel.__init__.
my_model = EmotionRecognitionModel.load_from_checkpoint(
    checkpoint_path=ckptpath, model_hparams=model_hparams
)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth


  0%|          | 0.00/83.3M [00:00<?, ?B/s]

### 预测

In [12]:
# Run the model over every test batch and collect one (ids, labels) pair per
# batch. Inference needs no gradients, so autograd tracking is switched off
# for the whole loop.
with torch.no_grad():
    preds = [my_model.test_step(batch) for batch in test_dataloader]

  cpuset_checked))


In [13]:
# Class index -> tag name.
label_dict = {0: "negative", 1: "positive", 2: "neutral"}

# Turn each (ids, labels) pair into a plain [guid, tag] row. Every batch
# holds a single sample, so the id is read with .item() and the label is
# the first array entry.
preds_aslist = [
    [guid.item(), label_dict[class_ids[0]]]
    for guid, class_ids in preds
]

In [14]:
# Tabulate the [guid, tag] rows and show the last few predictions.
results = pd.DataFrame(data=preds_aslist, columns=["guid", "tag"])
results.tail()

Unnamed: 0,guid,tag
506,1048,positive
507,1059,positive
508,1485,positive
509,3195,negative
510,2029,negative
