In [1]:
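# !git clone https://github.com/NLP-Final-Projects/MultiModalEmotionRecognition.git  # SECURITY: a personal access token used to be embedded in this URL; revoke that token and authenticate through a credential helper or an environment variable instead.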

In [None]:
!pip install scipy scikit-image torch tqdm transformers mediapipe opencv-python torchvision numpy pandas timm evaluate facenet-pytorch mtcnn

In [2]:
from google.colab import drive
from pathlib import Path
# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount("/content/drive")
# Root of the project on Drive; later cells derive the data and saved-feature paths from it.
project_path = Path("/content/drive/MyDrive/NLP/MultiModalEmotionRecognition")

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/NLP/MultiModalEmotionRecognition/data

/content/drive/MyDrive/NLP/MultiModalEmotionRecognition/data


In [4]:
!ls

correct_indexes    english_val.txt	  images	       test.zip
dev		   error_indexes	  saved_features       train_ende.zip
dev.zip		   image_index_test.txt   sentiment_test.txt
english_test.txt   image_index_train.txt  sentiment_train.txt
english_train.txt  image_index_val.txt	  sentiment_val.txt


In [None]:
!unzip dev.zip
!unzip test.zip
!unzip train_ende.zip

In [None]:
%mv dev/ images/val
%mv test/ images/test
%mv train_ende/ images/train

In [None]:
%ls images/train -1 | wc -l

In [None]:
%cd ..

In [5]:
import torch

# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
import torch
import numpy as np
import torchvision
import matplotlib.pyplot as plt
from torchvision.transforms import transforms as transforms
from torchvision.models.detection import KeypointRCNN_ResNet50_FPN_Weights


class PoseEmbeddingExtractor:
    """Turn an image into a flat body-pose vector with torchvision's Keypoint R-CNN.

    The model is loaded once with its default pretrained weights (17 keypoints
    per person) and kept in eval mode on ``device``.
    """

    def __init__(
        self,
        device='cpu'
    ):
        """
        Args:
            device: torch device (or device string) the detector runs on.
        """
        self.model = torchvision.models.detection.keypointrcnn_resnet50_fpn(
            weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT,
            num_keypoints=17,
        ).to(device)
        self.model.eval()
        self.device = device
        self.transform = transforms.Compose([
            transforms.ToTensor()
        ])

    def extract_embedding(self, image):
        """Return the (x, y) keypoints of the most confident person as a 1-D tensor.

        Args:
            image: a single image accepted by ``self.transform``
                (presumably an RGB HxWxC array or PIL image - confirm with callers).

        Returns:
            1-D tensor of length ``2 * num_keypoints`` (34 for the default model),
            laid out as ``x0, y0, x1, y1, ...``.

        Raises:
            ValueError: if the detector finds no person in the image. Callers in
                this notebook treat any exception as "no pose available".
        """
        image = self.transform(image)
        image = image.unsqueeze(0).to(self.device)
        with torch.no_grad():
            outputs = self.model(image)

        keypoints_scores = outputs[0]['keypoints_scores']
        # An empty detection list used to fail inside argmax with an opaque
        # error; raise something a reader of the logs can understand instead.
        if keypoints_scores.shape[0] == 0:
            raise ValueError("no pose detected in image")

        # Pick the detection whose keypoints have the highest mean score.
        best_detection = torch.mean(keypoints_scores, dim=1).argmax().item()
        # Drop the third (visibility) column and keep only x, y.
        keypoints = outputs[0]['keypoints'][best_detection, :, :2]
        return keypoints.ravel()

# p = PoseEmbeddingExtractor(device=device)
# path = 'data/images/val/4965.jpg'
# img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
# p.extract_embedding(img).shape

In [7]:
import sys, os, torch, cv2
from pathlib import Path

from glob import glob
from tqdm import tqdm


def remove_non_poses(input_dir, split):
    """Record the ids of images for which pose extraction fails.

    Every ``*.jpg`` in ``input_dir`` is run through ``PoseEmbeddingExtractor``.
    When reading the image or extracting the pose raises, the image id (file
    name up to the first dot) is appended to
    ``./data/error_indexes/pose_error_{split}.txt`` (relative to the current
    working directory), one id per line.

    Args:
        input_dir: directory containing the ``.jpg`` images of one split.
        split: split name, used only to build the output file name.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    pee = PoseEmbeddingExtractor(device=device)
    file_name = f"pose_error_{split}.txt"
    os.makedirs("./data/error_indexes", exist_ok=True)
    print(input_dir)
    print(f"./data/error_indexes/{file_name}")
    img_pattern = os.path.join(input_dir, "*.jpg")
    images = glob(img_pattern)
    print(len(images))
    # The context manager guarantees the handle is closed even if the loop is
    # interrupted (the original left the file open).
    with open(f"./data/error_indexes/{file_name}", "w") as non_pose_files:
        for image_path in tqdm(images):
            try:
                img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
                pee.extract_embedding(img)

            except Exception as e:
                # Deliberately broad: any failure means "no usable pose".
                print(e)
                img_id = os.path.basename(image_path).split(".")[0]
                # Text mode already translates "\n" to the platform newline.
                non_pose_files.write(f"{img_id}\n")
                # Flush per entry so progress survives a crashed session.
                non_pose_files.flush()


# split = "train"

# input_dir = project_path / "data" / "images" / split
# remove_non_poses(input_dir, split)


# Face Embedding

In [8]:
from scipy.spatial.distance import euclidean
import math
from skimage.transform import rotate
from facenet_pytorch import MTCNN as MTCNN2
import mediapipe
import numpy as np
import pandas as pd
import cv2
import os
from PIL import Image
import torch
from torchvision import transforms
import urllib


def get_model_path(model_name):
    """Return the local path of an HSEmotion checkpoint, downloading it if needed.

    Checkpoints are cached as ``~/.hsemotions/<model_name>.pt``. When the file
    is missing it is fetched from the ``face-emotion-recognition`` GitHub
    repository.

    Args:
        model_name: checkpoint name without the ``.pt`` extension.

    Returns:
        Path (str) of the cached checkpoint file.

    Raises:
        urllib.error.URLError / OSError: if the download fails.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    model_file = model_name + ".pt"
    cache_dir = os.path.join(os.path.expanduser("~"), ".hsemotions")
    # cache_dir = "emotion_models"
    os.makedirs(cache_dir, exist_ok=True)
    fpath = os.path.join(cache_dir, model_file)
    if not os.path.isfile(fpath):
        print(f"{model_file} not exists")
        url = (
            "https://github.com/HSE-asavchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/"
            + model_file
            + "?raw=true"
        )
        print("Downloading", model_name, "from", url)
        # Download to a temporary name and rename on success so that an
        # interrupted transfer never leaves a truncated file in the cache.
        partial_path = fpath + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, fpath)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    return fpath


class FaceAlignment:
    """Stateless helpers that estimate head tilt from eye positions and rotate face crops."""

    def __init__(
        self,
    ):
        pass

    @staticmethod
    def apply_rotation_on_images(input_images, angles):
        """Rotate each image by its paired angle using ``skimage.transform.rotate``.

        Args:
            input_images: iterable of images.
            angles: iterable of rotation angles in degrees, paired by position.

        Returns:
            List of rotated images, in input order.
        """
        rotated_images = [
            rotate(image, angle) for image, angle in zip(input_images, angles)
        ]
        return rotated_images

    @staticmethod
    def compute_alignment_rotation_(eyes_coordinates):
        """Compute, per face, the rotation angle derived from the two eye positions.

        A right triangle is built from the two eyes and a third vertex that
        shares one coordinate with each eye; the angle at the right eye is
        obtained with the law of cosines.

        Args:
            eyes_coordinates: iterable of ``(left_eye, right_eye)`` pairs, each
                eye being an ``(x, y)`` pair.

        Returns:
            ``(angles, directions)``: two lists with one entry per face.
            ``directions`` is ``-1`` when the left eye is lower in the image
            (larger y) than the right eye, else ``1``. ``angles`` are in
            degrees; for direction ``-1`` the value is ``angle - 90``.
        """
        angles = []
        directions = []
        for left_eye_coordinate, right_eye_coordinate in eyes_coordinates:

            left_eye_x, left_eye_y = left_eye_coordinate
            right_eye_x, right_eye_y = right_eye_coordinate

            left_is_lower = left_eye_y > right_eye_y
            triangle_vertex = (
                (right_eye_x, left_eye_y)
                if left_is_lower
                else (left_eye_x, right_eye_y)
            )
            direction = (
                -1 if left_is_lower else 1
            )  # rotate clockwise else counter-clockwise

            # compute length of triangle edges
            a = euclidean(left_eye_coordinate, triangle_vertex)
            b = euclidean(right_eye_coordinate, triangle_vertex)
            c = euclidean(right_eye_coordinate, left_eye_coordinate)

            # cosine rule; b == 0 or c == 0 would divide by zero
            if b != 0 and c != 0:
                cos_a = (b**2 + c**2 - a**2) / (2 * b * c)
                # Rounding can push the ratio marginally outside [-1, 1],
                # which would make arccos return NaN.
                cos_a = np.clip(cos_a, -1.0, 1.0)
                angle = np.arccos(cos_a)  # angle in radian
                angle = (angle * 180) / math.pi  # radian to degree
            else:
                angle = 0

            angle = angle - 90 if direction == -1 else angle

            angles.append(angle)
            directions.append(direction)

        return angles, directions


class FaceDetection:
    """Detect faces with MTCNN and expose crops, boxes and facial keypoints.

    Call ``extract_faces`` (or ``detect_faces__``) first; the other accessors
    read the detections cached by the most recent call.
    """

    # first call extract_face
    def __init__(self, model_name, minimum_confidence, device=None):
        """
        Args:
            model_name: detector backend; only ``"MTCNN"`` is supported.
            minimum_confidence: detections with confidence not strictly above
                this value are discarded.
            device: torch device for the detector. ``None`` selects
                ``cuda:0`` when available, otherwise the CPU.

        Raises:
            ValueError: if ``model_name`` is not a supported backend.
        """
        if model_name != "MTCNN":
            # Fail early instead of with a missing-attribute error at detect time.
            raise ValueError(f"unsupported face detection model: {model_name!r}")
        if device is None:
            # Do not rely on a notebook-level global being defined.
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.detected_faces_information = None
        self.model_name = model_name
        self.minimum_confidence = minimum_confidence
        detector_model = MTCNN2(device=device)
        self.detect_faces_function = (
            lambda input_image: detector_model.detect(input_image, landmarks=True)
        )

    def extract_faces(self, input_image, return_detections_information=True):
        """Detect faces in ``input_image`` and return their crops.

        Args:
            input_image: image array indexable as ``image[y1:y2, x1:x2]``.
            return_detections_information: also return the detection dicts.

        Returns:
            ``faces`` or ``(faces, detections)`` depending on the flag. Both
            lists are empty when nothing passes the confidence threshold.
        """
        self.detect_faces__(input_image)
        faces = self.get_faces__(
            input_image,
        )
        if return_detections_information:
            return faces, self.detected_faces_information

        else:
            return faces

    def detect_faces__(self, input_image):
        """Run the detector and cache the confident detections.

        Each cached detection is a dict with keys ``box``, ``confidence`` and
        ``keypoints`` (``left_eye``, ``right_eye``, ``nose``, ``mouth_left``,
        ``mouth_right``).
        """
        boxes, confidences, landmarks = self.detect_faces_function(input_image)
        if boxes is None:
            # The detector reports "no face" with None instead of empty arrays.
            self.detected_faces_information = []
            return

        detections = [
            {
                'box': boxes[i],
                'confidence': confidences[i],
                'keypoints': {
                    'left_eye': landmarks[i][0],
                    'right_eye': landmarks[i][1],
                    'nose': landmarks[i][2],
                    'mouth_left': landmarks[i][3],
                    'mouth_right': landmarks[i][4],
                },
            }
            for i in range(len(boxes))
        ]
        self.detected_faces_information = [
            detection
            for detection in detections
            if detection["confidence"] > self.minimum_confidence
        ]

    def get_detected_faces_information(self):
        """Return the detections cached by the last ``detect_faces__`` call."""
        return self.detected_faces_information

    def get_keypoints(
        self,
    ):
        """Return the keypoint dict of every cached detection."""
        return [
            detection["keypoints"] for detection in self.detected_faces_information
        ]

    def get_faces__(
        self,
        input_image,
    ):
        """Crop every cached detection box (``x1, y1, x2, y2``) out of ``input_image``."""
        faces = []
        for detection in self.detected_faces_information:
            x1, y1, x2, y2 = detection["box"]
            # Boxes may extend past the image border; a negative start index
            # would wrap around in slicing, so clamp it to the image origin.
            top, left = max(0, int(y1)), max(0, int(x1))
            bottom, right = max(0, int(y2)), max(0, int(x2))
            faces.append(input_image[top:bottom, left:right])
        return faces

    def get_eyes_coordinates(
        self,
    ):
        """Return ``(left_eye, right_eye)`` for every cached detection."""
        eyes_coordinates = [
            (info["keypoints"]["left_eye"], info["keypoints"]["right_eye"])
            for info in self.detected_faces_information
        ]
        return eyes_coordinates


class FaceEmotionRecognizer:
    """Facial-emotion backbone split into a feature extractor and a linear head.

    The checkpoint's classifier layer is detached at construction time: its
    weights are kept on the instance and the model's ``classifier`` is replaced
    by ``Identity`` so that the forward pass yields feature vectors.
    """

    # supported values of model_name: enet_b0_8_best_vgaf, enet_b0_8_best_afew, enet_b2_8, enet_b0_8_va_mtl, enet_b2_7
    def __init__(self, device, model_name="enet_b0_8_best_vgaf"):
        """
        Args:
            device: torch device the model and inputs are placed on.
            model_name: checkpoint name; it selects the label set (7 classes
                when the name contains ``_7``, else 8), the input size (224
                when it contains ``_b0_``, else 260) and the multi-task flag
                (``_mtl``).
        """
        self.device = device
        self.is_mtl = "_mtl" in model_name
        if "_7" in model_name:
            self.idx_to_class = {
                0: "Anger",
                1: "Disgust",
                2: "Fear",
                3: "Happiness",
                4: "Neutral",
                5: "Sadness",
                6: "Surprise",
            }
        else:
            self.idx_to_class = {
                0: "Anger",
                1: "Contempt",
                2: "Disgust",
                3: "Fear",
                4: "Happiness",
                5: "Neutral",
                6: "Sadness",
                7: "Surprise",
            }

        self.img_size = 224 if "_b0_" in model_name else 260
        self.test_transforms = transforms.Compose(
            [
                transforms.Resize((self.img_size, self.img_size)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

        path = get_model_path(model_name)

        # SECURITY NOTE: the checkpoint is a fully pickled module, so loading it
        # executes code from the file - only load checkpoints you trust.
        # map_location lets a GPU-saved checkpoint load on a CPU-only machine.
        try:
            model = torch.load(path, map_location=device, weights_only=False)
        except TypeError:
            # Older torch versions do not know the weights_only argument.
            model = torch.load(path, map_location=device)
        model = model.to(device)

        if isinstance(model.classifier, torch.nn.Sequential):
            self.classifier_weights = model.classifier[0].weight.data
            self.classifier_bias = model.classifier[0].bias.data
        else:
            self.classifier_weights = model.classifier.weight.data
            self.classifier_bias = model.classifier.bias.data

        model.classifier = torch.nn.Identity()
        self.model = model.eval()
        # print(path, self.test_transforms)

    def compute_probability(self, features):
        """Apply the detached linear head; returns raw scores (logits), not probabilities."""
        return torch.matmul(features, self.classifier_weights.T) + self.classifier_bias

    def extract_representations_from_faces(self, input_faces):
        """Return one feature vector per face crop.

        Args:
            input_faces: list of face images accepted by ``PIL.Image.fromarray``.

        Returns:
            Tensor with one row per input face, on ``self.device``.
        """
        faces = [self.test_transforms(Image.fromarray(face)) for face in input_faces]
        # Inference only: skip building an autograd graph for every image.
        with torch.no_grad():
            features = self.model(torch.stack(faces, dim=0).to(self.device))
        return features

    def predict_emotions_from_representations(
        self, representations, logits=True, return_features=True
    ):
        """Classify feature vectors into emotion labels.

        Args:
            representations: tensor with one feature vector per row.
            logits: when True the returned scores are raw head outputs; when
                False the class columns are replaced by softmax probabilities.
            return_features: unused; kept for interface compatibility.

        Returns:
            ``(labels, scores)``: a list of label strings and the score tensor.
            For multi-task models the last two score columns are not class
            scores and are left untouched.
        """
        scores = self.compute_probability(representations)
        class_scores = scores[:, :-2] if self.is_mtl else scores
        predictions_indices = torch.argmax(class_scores, dim=1)

        if not logits:
            # The previous hand-rolled softmax indexed the (values, indices)
            # result of torch.max(..., dim=1) directly and raised TypeError.
            probabilities = torch.softmax(class_scores, dim=1)
            if self.is_mtl:
                scores[:, :-2] = probabilities
            else:
                scores = probabilities

        return [
            self.idx_to_class[pred.item()] for pred in (predictions_indices)
        ], scores


class FaceNormalizer:
    """Black out everything outside the face oval found by MediaPipe Face Mesh."""

    def __init__(self):
        self.mp_face_mesh = mediapipe.solutions.face_mesh
        # A single FaceMesh instance is enough; the original built a second,
        # unused one.
        self.face_mesh = self.mp_face_mesh.FaceMesh(static_image_mode=True)
        self.routes_idx = self.initialize__()

    def initialize__(self):
        """Order the face-oval edges into a chain of ``[p1, p2]`` landmark indices.

        ``FACEMESH_FACE_OVAL`` is an unordered set of edges; starting from an
        arbitrary edge, each step follows the edge whose start point equals the
        previous edge's end point.
        """
        df = pd.DataFrame(
            list(self.mp_face_mesh.FACEMESH_FACE_OVAL), columns=["p1", "p2"]
        )
        routes_idx = []

        p2 = df.iloc[0]["p2"]

        for _ in range(0, df.shape[0]):
            obj = df[df["p1"] == p2]
            p1 = obj["p1"].values[0]
            p2 = obj["p2"].values[0]

            routes_idx.append([p1, p2])

        return routes_idx

    def get_landmarks__(self, input_image: np.ndarray):
        """Return the face-oval outline of ``input_image`` in pixel coordinates.

        Args:
            input_image: image array; floating-point images are scaled by 255
                and converted to ``uint8`` before being given to MediaPipe.

        Returns:
            List of ``(x, y)`` integer points: source and target of every oval
            edge, in chain order.

        Raises:
            ValueError: if MediaPipe finds no face landmarks in the image.
        """
        # np.float was removed in NumPy 1.24; test for any floating dtype.
        if np.issubdtype(input_image.dtype, np.floating):
            input_image = (input_image * 255).astype(np.uint8)

        results = self.face_mesh.process(input_image)
        if not results.multi_face_landmarks:
            raise ValueError("no face landmarks detected in image")
        landmarks = results.multi_face_landmarks[0]

        height, width = input_image.shape[0], input_image.shape[1]
        routes = []
        # for source_idx, target_idx in mp_face_mesh.FACEMESH_FACE_OVAL:
        for source_idx, target_idx in self.routes_idx:
            source = landmarks.landmark[source_idx]
            target = landmarks.landmark[target_idx]

            # Landmark coordinates are relative to the image size.
            relative_source = (int(width * source.x), int(height * source.y))
            relative_target = (int(width * target.x), int(height * target.y))

            # cv2.line(img, relative_source, relative_target, (255, 255, 255), thickness = 2)

            routes.append(relative_source)
            routes.append(relative_target)

        return routes

    @staticmethod
    def normalize_with_landmark_points__(input_image, landmarks):
        """Return a copy of ``input_image`` that is zero outside the landmark polygon."""
        mask = np.zeros((input_image.shape[0], input_image.shape[1]))
        # OpenCV drawing functions expect 32-bit integer points.
        mask = cv2.fillConvexPoly(mask, np.array(landmarks, dtype=np.int32), 1)
        mask = mask.astype(bool)

        out = np.zeros_like(input_image)
        out[mask] = input_image[mask]
        return out

    def normalize_faces_image(self, input_images):
        """Apply the face-oval mask to every image and return the masked copies."""
        normalized_faces_images = [
            self.normalize_with_landmark_points__(
                input_image, self.get_landmarks__(input_image)
            )
            for input_image in input_images
        ]
        return normalized_faces_images


class FaceEmbeddingExtractor:
    """Full face pipeline: detect -> align -> mask to face oval -> emotion features."""

    def __init__(
        self,
        device='cuda'
    ):
        """
        Args:
            device: torch device handed to the emotion-recognition model.
        """
        # Placeholders for intermediate results. extract_embedding currently
        # does not populate them, so the getters below return None.
        self.faces = None
        self.normalized_rotated_faces = None
        self.rotated_faces = None
        self.rotation_angles = None
        self.rotation_directions = None

        fd = FaceDetection("MTCNN", minimum_confidence=0.95)
        self.face_detection_model: FaceDetection = fd
        fa = FaceAlignment()
        self.face_alignment_model: FaceAlignment = fa
        fn = FaceNormalizer()
        self.face_normalizer_model: FaceNormalizer = fn
        model_name = "enet_b0_8_best_afew"
        fer = FaceEmotionRecognizer(device, model_name)
        self.face_emotion_recognition_model: FaceEmotionRecognizer = fer

    def extract_embedding(self, input_image):
        """Return the emotion-feature vector of the first detected face.

        Args:
            input_image: image passed to the face detector.

        Returns:
            ``(None, None, representation)``. The first two slots are reserved
            for predictions and scores, which are not computed here.
            ``representation`` is the feature vector of the first face only.

        Raises:
            ValueError: if no face passes the detector's confidence threshold.
                Other exceptions from the individual stages propagate unchanged.
        """
        faces, _ = self.face_detection_model.extract_faces(
            input_image, return_detections_information=True
        )
        if len(faces) == 0:
            # Otherwise the failure would surface later as an opaque
            # "stack expects a non-empty TensorList" error.
            raise ValueError("no face detected in image")

        (
            rotation_angles,
            _rotation_directions,
        ) = self.face_alignment_model.compute_alignment_rotation_(
            self.face_detection_model.get_eyes_coordinates()
        )
        rotated_faces = self.face_alignment_model.apply_rotation_on_images(
            faces, rotation_angles
        )
        normalized_rotated_faces = self.face_normalizer_model.normalize_faces_image(
            rotated_faces
        )

        # The emotion model's preprocessing goes through PIL, which needs uint8.
        normalized_rotated_faces_255 = [
            (image * 255).astype(np.uint8) for image in normalized_rotated_faces
        ]

        # [0]: keep only the first face's representation.
        representations = (
            self.face_emotion_recognition_model.extract_representations_from_faces(
                normalized_rotated_faces_255
            )
        )[0] #WARNING: 0 was not here

        return None, None, representations

    def get_rotations_information(self):
        """Return the stored ``(rotation_angles, rotation_directions)``."""
        return self.rotation_angles, self.rotation_directions

    def get_faces(self):
        """Return the stored face crops."""
        return self.faces

    def get_rotated_faces(self):
        """Return the stored rotated face crops."""
        return self.rotated_faces

    def get_normalized_rotated_faces(self):
        """Return the stored rotated and masked face crops."""
        return self.normalized_rotated_faces

    def clear(self):
        """Reset all stored intermediate results to None."""
        self.faces = None
        self.normalized_rotated_faces = None
        self.rotated_faces = None
        self.rotation_angles = None
        self.rotation_directions = None

    def store_embeddings(self, file, embeddings):
        """Pickle ``embeddings`` to ``file`` under the key ``"embeddings"``."""
        # Imported here because this cell does not import pickle itself.
        import pickle

        with open(file, "wb") as file_out:
            pickle.dump(
                {"embeddings": embeddings}, file_out, protocol=pickle.HIGHEST_PROTOCOL
            )

    def load_embeddings(self, file):
        """Load embeddings previously written by ``store_embeddings``.

        SECURITY NOTE: unpickling executes code from the file; only load files
        you created yourself.
        """
        import pickle

        with open(file, "rb") as file_in:
            stored_data = pickle.load(file_in)
            stored_embeddings = stored_data["embeddings"]

        return stored_embeddings

# Text Embedding


In [9]:
from transformers import AutoTokenizer, AutoModel, pipeline
from transformers import RobertaForSequenceClassification
import torch
import pickle


class TextEmbeddingExtractor:
    """Sentence embeddings (and sentiment labels) from a RoBERTa sequence classifier."""

    def __init__(
        self,
        model_name="pysentimiento/robertuito-sentiment-analysis",
        show_progress_bar=True,
        to_tensor=True,
        max_length=128,
        device='cuda'
    ):
        """
        Args:
            model_name: Hugging Face model id used for tokenizer and model.
            show_progress_bar: unused; kept for interface compatibility.
            to_tensor: unused; kept for interface compatibility.
            max_length: token limit applied when truncating input sentences.
            device: torch device the model and encoded inputs are moved to.
        """
        self.model_name = model_name
        self.device = device
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = RobertaForSequenceClassification.from_pretrained(
            self.model_name, num_labels=3, output_hidden_states=True
        ).to(self.device)

        self.generator = pipeline(
            task="sentiment-analysis",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def extract_embedding(
        self,
        input_batch_sentences,
    ):
        """Return the final-layer hidden state of the first token of every sentence.

        Args:
            input_batch_sentences: list of strings.

        Returns:
            Tensor with one row per sentence, taken from position 0 of the last
            entry of the model's ``hidden_states``.
        """
        encoded_input = self.tokenizer(
            input_batch_sentences,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.device)

        with torch.no_grad():
            model_output = self.model(**encoded_input)
            hidden_states = model_output["hidden_states"]
            # hidden_states holds one tensor per returned layer; index from the
            # end so that the code does not assume a fixed number of layers.
            last_layer_hidden_states = hidden_states[-1]  # (batch_size, seq_len, hidden)
            cls_hidden_state = last_layer_hidden_states[:, 0, :]

        return cls_hidden_state

    def get_labels(self, input_batch_sentences):
        """Run the sentiment-analysis pipeline on the sentences and return its output."""
        return self.generator(input_batch_sentences)


# Dataset

In [10]:
# Length of the per-sample face feature vector.
# NOTE(review): presumably the feature width of the enet_b0 backbone - confirm.
FACE_EMBEDDING_SIZE = 1280
# Length of the per-sentence text vector (hidden size of the text encoder).
TEXT_EMBEDDING_SIZE = 768
# Length of the pose vector: 17 keypoints x (x, y).
POSE_EMBEDDING_SIZE = 34
# No scene embedding is produced in this notebook yet.
SCENE_EMBEDDING_SIZE = None


In [None]:
!free

In [11]:
import os, cv2, torch, ast
import pandas as pd
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset
from tqdm import trange
from tqdm import tqdm


class MSCTDDataSet(Dataset):
    """MSCTD dataset serving face, pose and text embeddings with a sentiment label.

    Two modes are supported:

    * ``load=False``: raw text/sentiment files and images are read, and the
      embeddings are computed on the fly by the three extractor models.
    * ``load=True``: pre-computed tensors are read from
      ``<base_path>/saved_features`` and indexed directly. In this mode the
      extractor models are not instantiated (the ``*_extractor`` attributes are
      ``None``). If the cached files are missing, the dataset falls back to
      ``load=False`` behaviour.
    """

    # Fill value of the face/pose vectors of a sample whose image failed.
    ERROR_SENTINEL = -123

    def __init__(self, base_path="data/", split="train", data_size=None, load=False, device='cuda'):
        """
        Args:
            base_path (str or path): path to data folder
            split (str): split name used in file and folder names
                (e.g. train, val, test)
            data_size (int or None): keep only the first ``data_size`` samples;
                ``None`` (or 0) keeps everything
            load (bool): read cached embeddings instead of computing them
            device: torch device for the extractors and for sentinel vectors
        """
        if isinstance(base_path, str):
            base_path = Path(base_path)
        self.base_path = base_path
        self.load_path = base_path / 'saved_features'
        self.split = split
        self.text_file_path = base_path / f"english_{split}.txt"
        self.seq_file_path = base_path / f"image_index_{split}.txt"
        self.sentiment_file_path = base_path / f"sentiment_{split}.txt"
        self.image_dir = base_path / "images" / split
        self.correct_indexes_file_path = base_path / "correct_indexes" / f"correct_indexes_{split}.txt"

        self.data_size = data_size
        self.load = load
        self.device = device

        self.texts = None
        self.sentiments = None
        self.indexes = None
        self.face_embeddings = None
        self.pose_embeddings = None
        self.text_embeddings = None
        self.load_data()

        # load_data() may have switched self.load off (cache missing), so decide
        # only now whether the heavy models are needed.
        if self.load:
            self.face_embedding_extractor = None
            self.text_embedding_extractor = None
            self.pose_embedding_extractor = None
        else:
            self.face_embedding_extractor = FaceEmbeddingExtractor(device=device)
            self.text_embedding_extractor = TextEmbeddingExtractor(device=device)
            self.pose_embedding_extractor = PoseEmbeddingExtractor(device=device)

    def _load_cached_features(self):
        """Read the cached tensors of this split; return None if a file is missing."""
        try:
            real_indexes = torch.load(self.load_path / f'real_indexes_{self.split}.pt')
            correct_sentiments = torch.load(self.load_path / f'sentiments_{self.split}.pt')
            face_embeddings = torch.load(self.load_path / f'face_embeddings_{self.split}.pt')
            pose_embeddings = torch.load(self.load_path / f'pose_embeddings_{self.split}.pt')
            text_embeddings = torch.load(self.load_path / f'text_embeddings_{self.split}.pt')
        except FileNotFoundError as e:
            print(e)
            print('Warning: passed load=True but not embedding file was located. Not loading')
            return None

        # Inconsistent caches are a hard error: silently using them would pair
        # embeddings with the wrong labels.
        assert face_embeddings.shape[0] == pose_embeddings.shape[0] , 'ERROR:  face and pose list are not the same size in loading'
        assert pose_embeddings.shape[0] == text_embeddings.shape[0] , 'ERROR: text and pose list are not the same size in loading'
        assert text_embeddings.shape[0] == real_indexes.shape[0] , 'ERROR: text and real index list are not the same size in loading'
        assert real_indexes.shape[0] == correct_sentiments.shape[0] , 'ERROR: real index and sentiment list are not the same size in loading'

        print(face_embeddings.shape)
        print(pose_embeddings.shape)
        print(text_embeddings.shape)
        print(real_indexes.shape)
        print(correct_sentiments.shape)
        # Raw texts are not cached, hence None in the texts slot.
        return real_indexes, None, correct_sentiments, face_embeddings, pose_embeddings, text_embeddings

    def _load_raw_annotations(self):
        """Read texts and sentiments and keep only the rows listed in the correct-index file."""
        with open(self.text_file_path) as text_file, open(self.sentiment_file_path) as sentiment_file, open(self.correct_indexes_file_path) as correct_file:
            real_indexes = [int(line.strip()) for line in correct_file]
            texts = [line.strip() for line in text_file]
            sentiments = [int(line.strip()) for line in sentiment_file]
        correct_texts = [texts[i] for i in real_indexes]
        correct_sentiments = [sentiments[i] for i in real_indexes]
        return real_indexes, correct_texts, correct_sentiments, None, None, None

    #text is not valid
    def load_data(self):
        """Populate indexes, sentiments and either raw texts or cached embeddings."""
        loaded = self._load_cached_features() if self.load else None
        if loaded is None:
            # Either caching was not requested or the cache does not exist:
            # compute embeddings on the fly from the raw files.
            self.load = False
            loaded = self._load_raw_annotations()
        (
            real_indexes,
            correct_texts,
            correct_sentiments,
            face_embeddings,
            pose_embeddings,
            text_embeddings,
        ) = loaded

        if self.data_size:
            limit = self.data_size
            real_indexes = real_indexes[:limit]
            correct_sentiments = correct_sentiments[:limit]
            if correct_texts is not None:
                correct_texts = correct_texts[:limit]
            if face_embeddings is not None:
                face_embeddings = face_embeddings[:limit]
            if pose_embeddings is not None:
                pose_embeddings = pose_embeddings[:limit]
            if text_embeddings is not None:
                text_embeddings = text_embeddings[:limit]

        self.texts = correct_texts
        self.sentiments = correct_sentiments
        self.indexes = real_indexes
        self.face_embeddings = face_embeddings
        self.pose_embeddings = pose_embeddings
        self.text_embeddings = text_embeddings

    def __len__(self):
        if self.load:
            return self.text_embeddings.shape[0]
        return len(self.texts)

    def get_face_embedding(self, index, image):
        """Return the cached face vector, or compute it from ``image``."""
        if self.load:
            return self.face_embeddings[index]
        (
            predictions,
            scores,
            representations,
        ) = self.face_embedding_extractor.extract_embedding(image)
        return representations

    def get_pose_embedding(self, index, image):
        """Return the cached pose vector, or compute it from ``image``."""
        if self.load:
            return self.pose_embeddings[index]
        return self.pose_embedding_extractor.extract_embedding(image)

    def get_image_embeddings(self, index):
        """Return ``(face_embedding, pose_embedding)`` for sample ``index``.

        In compute mode the image ``<image_dir>/<real_index>.jpg`` is read and
        converted from BGR to RGB; any failure propagates to the caller.
        """
        image = None
        real_index = self.indexes[index]
        image_name = self.image_dir / f"{real_index}.jpg"
        if not self.load:
            image = cv2.cvtColor(cv2.imread(str(image_name)), cv2.COLOR_BGR2RGB)

        face_embedding = self.get_face_embedding(index, image)
        pose_embedding = self.get_pose_embedding(index, image)
        return face_embedding, pose_embedding

    def get_sentiment(self, index):
        """Return the sentiment label of sample ``index``."""
        return self.sentiments[index]

    def get_text(self, index):
        """Return the text embedding of sample ``index`` (cached or computed)."""
        if self.load :
            return self.text_embeddings[index]
        text = self.texts[index]
        text = self.text_embedding_extractor.extract_embedding([text])[0]
        return text

    def __getitem__(self, index):
        """Return one sample as a dict.

        Keys: ``real_index``, ``pose_embedding``, ``face_embedding``,
        ``text_embedding``, ``sentiment``. When the image embeddings cannot be
        produced, both face and pose vectors are filled with
        ``ERROR_SENTINEL`` so that the sample can be filtered out later.
        """
        if torch.is_tensor(index):
            index = index.tolist()
        try:
            face_embedding, pose_embedding = self.get_image_embeddings(index)
        except Exception as e:
            # Deliberately broad: a missing face or pose must not abort a pass
            # over the whole dataset.
            print(f'error for split:{self.split} index: {index}')
            print(e)
            face_embedding = torch.full((FACE_EMBEDDING_SIZE,), float(self.ERROR_SENTINEL), device=self.device)
            pose_embedding = torch.full((POSE_EMBEDDING_SIZE,), float(self.ERROR_SENTINEL), device=self.device)

        sentiment = self.get_sentiment(index)
        text_embedding = self.get_text(index)
        sample = {"real_index": self.indexes[index], "pose_embedding": pose_embedding, "face_embedding": face_embedding, "text_embedding": text_embedding, "sentiment": sentiment}
        return sample


# Save features

In [None]:
%cd data/

In [None]:
!ls

In [None]:
# %mkdir data/saved_features/
# %mkdir backups/

In [12]:
from torch.utils.data import DataLoader
# Set SAVE to False to skip the (slow) feature extraction and caching pass.
SAVE = True
if SAVE:
    SAVE_SPLIT = "train"
    SAVE_BATCH = 8
    # Pass the detected device explicitly: the dataset's default is 'cuda',
    # which fails on a CPU-only runtime.
    dataset = MSCTDDataSet(base_path=project_path / "data/", split = SAVE_SPLIT, load=False, device=device)
    print(len(dataset))
    dataloader = DataLoader(dataset, batch_size=SAVE_BATCH)

enet_b0_8_best_afew.pt not exists
Downloading enet_b0_8_best_afew from https://github.com/HSE-asavchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b0_8_best_afew.pt?raw=true


Downloading tokenizer_config.json:   0%|          | 0.00/334 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/838k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/925 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/415M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/keypointrcnn_resnet50_fpn_coco-fc266e95.pth" to /root/.cache/torch/hub/checkpoints/keypointrcnn_resnet50_fpn_coco-fc266e95.pth


  0%|          | 0.00/226M [00:00<?, ?B/s]

13228


In [13]:
def save_features(dataloader, split, stop_batch=None, save_path=None):
    """Compute all features of a split and cache them as ``.pt`` files.

    Every batch is written to its own set of files first (so that an
    interrupted run keeps its progress), then the per-batch files are
    concatenated into one file per feature:
    ``face_embeddings_{split}.pt``, ``pose_embeddings_{split}.pt``,
    ``text_embeddings_{split}.pt``, ``sentiments_{split}.pt`` and
    ``real_indexes_{split}.pt``.

    Samples whose pose vector consists entirely of the error sentinel ``-123``
    are dropped. Tensors are detached and moved to the CPU before saving so
    that the cache can be loaded on machines without a GPU.

    Args:
        dataloader: iterable of batches (dicts with the keys ``face_embedding``,
            ``pose_embedding``, ``text_embedding``, ``sentiment``,
            ``real_index``) that also supports ``len()``.
        split: split name used in the output file names.
        stop_batch: optional index of the last batch to process (inclusive);
            ``None`` processes the whole dataloader.
        save_path: output directory; defaults to
            ``project_path / 'data' / 'saved_features'``.

    Raises:
        AssertionError: if the fields of a batch disagree in batch size.
        ValueError: if the dataloader yields no batch.
    """
    if save_path is None:
        save_path = project_path / 'data' / 'saved_features'
    save_path = Path(save_path)

    # batch key -> file name stem, in the order the merged files are written.
    feature_files = {
        "face_embedding": "face_embeddings",
        "pose_embedding": "pose_embeddings",
        "text_embedding": "text_embeddings",
        "sentiment": "sentiments",
        "real_index": "real_indexes",
    }

    saved_batches = 0
    for batch_index, batch in enumerate(tqdm(dataloader)):
        # Validate before writing anything for this batch.
        sizes = {key: batch[key].shape[0] for key in feature_files}
        assert len(set(sizes.values())) == 1, f'batch fields are not the same size in saving: {sizes}'

        keep = ~(batch["pose_embedding"] == -123).all(dim=1)
        for key, stem in feature_files.items():
            values = batch[key]
            # The mask lives on the pose tensor's device while labels and
            # indexes are CPU tensors; align the devices before indexing.
            kept = values[keep.to(values.device)].detach().cpu()
            torch.save(kept, save_path / f'{stem}_{split}_{batch_index}.pt')
        saved_batches += 1

        if stop_batch is not None and batch_index == stop_batch:
            break

    print('----------------------')
    print(len(dataloader))
    if saved_batches == 0:
        raise ValueError("dataloader produced no batches; nothing to merge")

    # Merge exactly the batches written above (the previous code merged one
    # batch too few when stopping early).
    for stem in feature_files.values():
        parts = [
            torch.load(save_path / f'{stem}_{split}_{i}.pt')
            for i in range(saved_batches)
        ]
        merged = torch.cat(parts, dim=0)
        print(merged.shape)
        torch.save(merged, save_path / f'{stem}_{split}.pt')
        # Release the memory before handling the next feature.
        del parts, merged



In [None]:
if SAVE:
    save_features(dataloader, SAVE_SPLIT)

In [None]:
%mkdir backup

In [None]:
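# TODO: the split name `val` is hard-coded in the commands below; change it to the split that was just saved (e.g. the value of SAVE_SPLIT).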
!cp data/saved_features/face_embeddings_val.pt backup
!cp data/saved_features/pose_embeddings_val.pt backup
!cp data/saved_features/real_indexes_val.pt backup
!cp data/saved_features/text_embeddings_val.pt backup
!cp data/saved_features/sentiments_val.pt backup


In [None]:
!ls -sh backup

In [None]:
# %ls -sh data/saved_features | grep face_embeddings_test.pt
%ls -sh data/saved_features/*_test.pt

In [None]:
!du data/saved_features/text_embeddings_val.pt -h

In [None]:
del dataset
del dataloader

# Data Loader

In [None]:
class MSCTDDataLoader:
    """Thin wrapper around an iterable of batches that moves each batch to a device.

    Iterating the wrapper yields every batch of the wrapped loader after it
    has been passed through ``to_device``; ``len()`` is forwarded unchanged.
    """

    def __init__(self, dl, device):
        """Remember the wrapped loader ``dl`` and the target ``device``."""
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the length reported by the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Lazily yield each batch of the wrapped loader, moved to ``self.device``."""
        yield from (to_device(batch, self.device) for batch in self.dl)

def to_device(data, device):
    """Recursively move everything inside ``data`` that can be moved to ``device``.

    Args:
        data: A possibly nested structure of lists, tuples, dicts, strings and
            leaf objects.
        device: Target passed to each leaf's ``.to(...)`` method.

    Returns:
        A structure mirroring ``data``: lists and tuples are rebuilt as lists,
        dicts are rebuilt with the same keys, and strings are returned as-is.
        Leaves exposing a callable ``.to`` (e.g. tensors) are replaced by the
        result of ``leaf.to(device)``; any other leaf (``None``, ints, floats,
        ...) is returned unchanged instead of raising ``AttributeError``.
    """
    if isinstance(data, (list, tuple)):
        return [to_device(item, device) for item in data]
    if isinstance(data, dict):
        return {key: to_device(value, device) for key, value in data.items()}
    if isinstance(data, str):
        return data
    # Only objects that know how to move themselves are moved; everything
    # else is plain metadata and passes through untouched.
    mover = getattr(data, "to", None)
    if callable(mover):
        return mover(device)
    return data

# Example usage:
# ds = MSCTDDataSet(base_path=project_path / "data", dataset_type="val", load=True)
# dl = DataLoader(ds, batch_size=10)
# dl = MSCTDDataLoader(dl, device)
# for x in dl:
#     print(x)
#     print(x['face_embedding'].shape)
#     print(x['text_embedding'].shape)
#     print(x['real_index'])
#     break

In [None]:
import torch
from torch import nn

class SimpleDenseNetwork(nn.Module):
    """Three-layer fully connected classifier over a fixed-size embedding vector.

    Args:
        n_classes: Number of output classes; sets the width of the last layer.
        embedding_dimension: Size of each input feature vector.
    """

    def __init__(self, n_classes, embedding_dimension):
        super().__init__()

        self.n_classes = n_classes
        self.embedding_dimension = embedding_dimension

        # The Linear layers stay at Sequential indices 0, 2 and 4 so that
        # state_dict keys ("fc.0.weight", ...) keep their names.
        self.fc = nn.Sequential(
            nn.Linear(in_features=self.embedding_dimension, out_features=512),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=512, out_features=128),
            nn.ReLU(inplace=True),
            # Output width follows n_classes rather than a fixed 3.
            # No activation afterwards: forward() returns raw logits.
            nn.Linear(in_features=128, out_features=self.n_classes),
        )

    def forward(self, input_batch):
        """Map a batch of embeddings to class logits.

        Args:
            input_batch: Tensor whose last dimension equals
                ``embedding_dimension``.

        Returns:
            Tensor of unnormalised scores whose last dimension equals
            ``n_classes``.
        """
        return self.fc(input_batch)

# Train

In [None]:
# --- Training hyperparameters -------------------------------------------
BATCH_SIZE = 32
EPOCHS = 200
learning_rate = 1e-3
momentum = 1e-3
num_workers = 1
data_size = None

# --- Classifier input width ---------------------------------------------
# Sum of the face, text and pose embedding sizes; the scene embedding term
# is commented out.
# embedding_dimension = 2048 + 34
embedding_dimension = (
    FACE_EMBEDDING_SIZE
    + TEXT_EMBEDDING_SIZE
    + POSE_EMBEDDING_SIZE
    # + SCENE_EMBEDDING_SIZE
)

In [None]:
# Validation split: dataset loaded with load=True and the configured data_size,
# batched, then wrapped so each batch is moved to `device` on iteration.
val_dataset = MSCTDDataSet(project_path / "data", "val", data_size=data_size, load=True)
val_dataloader = MSCTDDataLoader(
    DataLoader(val_dataset, batch_size=BATCH_SIZE),
    device,
)

# Test split: same pipeline, without an explicit data_size.
test_dataset = MSCTDDataSet(project_path / "data", "test", load=True)
test_dataloader = MSCTDDataLoader(
    DataLoader(test_dataset, batch_size=BATCH_SIZE),
    device,
)

In [None]:
import torch.optim as optim
from datetime import datetime


def train_epoch(epoch_index, model, dataloader, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Every batch is a dict with "pose_embedding", "text_embedding",
    "face_embedding" and "sentiment" entries. Rows whose pose embedding is
    entirely -123 are dropped before the forward pass, and a batch with no
    remaining rows is skipped so that it cannot add a NaN loss to the total.

    Args:
        epoch_index: Index of the current epoch (not used by this function).
        model: Module called on the face, text and pose embeddings
            concatenated along dim 1.
        dataloader: Iterable of batch dicts as described above.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The sum of the per-batch loss values over all batches that were
        trained on (the same number that is printed).
    """
    running_loss = 0.0

    for batch in dataloader:
        # -123 presumably marks samples whose feature extraction failed — TODO confirm.
        errors = (batch["pose_embedding"] == -123).all(dim=1)
        if errors.all():
            # Nothing usable in this batch. A mean-reduced loss over zero rows
            # is NaN and would make the whole epoch total NaN.
            continue
        valid = ~errors

        text_embedding = batch["text_embedding"][valid]
        face_embedding = batch["face_embedding"][valid]
        pose_embedding = batch["pose_embedding"][valid]
        labels = batch["sentiment"][valid]

        optimizer.zero_grad()
        outputs = model(torch.cat((face_embedding, text_embedding, pose_embedding), 1))
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print('Epoch loss: ', running_loss)
    return running_loss


def train_model(model, epochs, train_dataloader, val_dataloader):
    """Train ``model`` for ``epochs`` epochs, validating after every epoch.

    A cross-entropy criterion and an Adam optimizer (step size taken from the
    module-level ``learning_rate``) are created once and reused for all
    epochs. Each epoch switches the model to training mode, runs
    ``train_epoch`` on ``train_dataloader``, then switches to evaluation mode
    and runs ``validate`` on ``val_dataloader``.
    """
    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    adam = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch_number in range(epochs):
        print("--------------epoch: ", epoch_number, "-------------")

        # Optimisation pass.
        model.train()
        train_epoch(
            epoch_index=epoch_number,
            model=model,
            dataloader=train_dataloader,
            loss_fn=criterion,
            optimizer=adam,
        )

        # Evaluation pass.
        model.eval()
        validate(model=model, dataloader=val_dataloader, loss_fn=criterion)

In [None]:
# Build the three-class classifier, then move its parameters to the selected device.
model = SimpleDenseNetwork(n_classes=3, embedding_dimension=embedding_dimension)
model = model.to(device=device)

In [None]:
# NOTE(review): val_dataloader is passed as train_dataloader and test_dataloader
# as val_dataloader, so the model is fitted on the "val" split and scored on the
# "test" split after every epoch — confirm this is intentional.
# NOTE(review): train_model() calls validate(), which is only defined in a later
# cell; that cell has to be executed before this one.
train_model(model, EPOCHS, val_dataloader, test_dataloader)

# Evaluating

In [None]:
import evaluate

# Module-level accuracy metric: validate() feeds it through add_batch() and
# reads the result with compute().
accuracy = evaluate.load("accuracy")
# precision = evaluate.load("precision")

def validate(model, dataloader, loss_fn):
    """Score ``model`` on ``dataloader`` and print its accuracy.

    Every batch is a dict with "pose_embedding", "text_embedding",
    "face_embedding" and "sentiment" entries. Rows whose pose embedding is
    entirely -123 are dropped; a batch with no remaining rows is skipped.
    Predictions (argmax over dim 1 of the model output) are accumulated in
    the module-level ``accuracy`` metric. The whole pass runs under
    ``torch.no_grad()`` so no autograd graph is built during evaluation.

    Args:
        model: Module called on the face, text and pose embeddings
            concatenated along dim 1. The caller is responsible for putting
            it in evaluation mode.
        dataloader: Iterable of batch dicts as described above.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.

    Returns:
        A dict holding the entries returned by ``accuracy.compute()`` plus a
        "loss" entry with the sum of the per-batch loss values.
    """
    running_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            # -123 presumably marks samples whose feature extraction failed — TODO confirm.
            errors = (batch["pose_embedding"] == -123).all(dim=1)
            if errors.all():
                # No usable rows: a mean-reduced loss over zero rows would be NaN.
                continue
            valid = ~errors

            text_embedding = batch["text_embedding"][valid]
            face_embedding = batch["face_embedding"][valid]
            pose_embedding = batch["pose_embedding"][valid]
            labels = batch["sentiment"][valid]

            logits = model(torch.cat((face_embedding, text_embedding, pose_embedding), 1))

            accuracy.add_batch(predictions=logits.argmax(dim=1), references=labels)
            # precision.add_batch(predictions=logits.argmax(dim=1), references=labels)
            running_loss += loss_fn(logits, labels).item()

    metrics = accuracy.compute()
    print(metrics)
    # print(precision.compute(average=None))
    return {**(metrics or {}), "loss": running_loss}

In [None]:
# Evaluate the current model on the test loader with a fresh cross-entropy loss.
validate(model, test_dataloader, nn.CrossEntropyLoss())