# Lie Recognition using LSTM + Dense Layers

### Torch Dataset Creation:

In [19]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import math

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode


def load_coord_data(sample_data_path, frame_cap):
    
    if sample_data_path.endswith(".npy"):
        load_sample = np.load(sample_data_path)[:frame_cap]
    elif sample_data_path.endswith(".csv"):
        load_sample = pd.read_csv(sample_data_path)[:frame_cap]
        return load_sample
        load_sample = load_sample.values()
    else:
        print(">>> WARNING: No support for {sample_data_path}. Returning None.")
        load_sample = None
    
    return load_sample
    
    


class DeceptionDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, mode, collection_type, data_dir, csv_dir,
                 class_to_num={"truth": 0, "lie": 1}, num_to_class={0: "truth", 1: "lie"}, transform=None,
                 seconds_input_size=3, fps_min=29, keypoints_quantity=478, coordinate_amount=3):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data_dir = data_dir
        self.data_df = pd.read_csv(os.path.join(csv_dir, f"{mode}_DARE.csv"))
        print("frame_sum:", sum(self.data_df["total_frame_count"].tolist()))
        self.transform = transform
        if collection_type == "MediaPipe":
            self.sample_postfix = "_MP_coord.npy"
        elif collection_type == "OpenFace":
            self.sample_postfix = ".csv"
        else:
            print(">>> WARNING: No such collection type was used before. Sample postfix append was set to blank.")
            self.sample_postfix = ""
        self.class_to_num = class_to_num
        self.num_to_class = num_to_class
        self.frame_cap = fps_min * seconds_input_size
        self.input_dim = (self.frame_cap*keypoints_quantity*coordinate_amount)  # torch.Size([4, 87, 478, 3])
        self.keypoints_quantity = keypoints_quantity
        self.coordinate_amount = coordinate_amount
        # 126, 87 (29FPS*3=87), *45*
        # 87
            

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row_data = self.data_df.iloc[idx]
        
        # get path to data itself:
        base_name = row_data["video_name"].split(".")[0]
        data_path = os.path.join(self.data_dir,
                                 f"{base_name}{self.sample_postfix}").replace("\\", "/")
        
        # get label:
        label = self.class_to_num[row_data["label"]]        
        data  = load_coord_data(data_path, self.frame_cap)
        
        # if FPS is lower, duplicate every frame to increase (generate slow video as workaround):
        [1,2,3] => desired size is 6/4 => 2 => [1,1,2,2,3,3]
        
        if data.shape[0] < self.frame_cap:
            repeat_for = int(math.ceil(self.frame_cap / data.shape[0]))
            data = np.repeat(data, repeat_for, axis=0)[:self.frame_cap]
            # add additional edge case carry out:
            if len(data) < self.frame_cap:
                extra_needed = self.frame_cap - len(data)
                extra_array = np.zeros((extra_needed, data.shape[1], 3), dtype=float)
                data = np.concatenate((data, extra_array))
        print("Before flatten keypoints X coord shape:", data.shape)  # (87, 478, 3)  # 87*478*3 = 124758
        data = data.reshape(self.frame_cap*self.keypoints_quantity * self.coordinate_amount)
        print("After flattening keypoints X coord shape:", data.shape)
                
        return data, label

# Model Definition:

In [20]:
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim


class LSTM_DeceptionRecognition(nn.Module):
    def __init__(self, lstm_embedding_length, lstm_size, lstm_layers, inner_linear_size, target_size):
        super(LSTM_DeceptionRecognition, self).__init__()
        
        # var definition:
        self.lstm_size = lstm_size
        self.lstm_layers = lstm_layers
        
        # defining layers:
        self.lstm = nn.LSTM(lstm_embedding_length, lstm_size, lstm_layers, batch_first=True).double()
        self.inner_layer = nn.Linear(lstm_size, inner_linear_size)
        self.label_layer = nn.Linear(inner_linear_size, target_size)
        
    def init_state(self, batch_size):
        return (torch.zeros(self.lstm_layers, batch_size, self.lstm_size),
                torch.zeros(self.lstm_layers, batch_size, self.lstm_size))

    def forward(self, padded_input):
        hs = self.init_state(self.lstm_size)
        lstm_out, lstm_hidden_embedding = self.lstm(padded_input, hs)
        inner_layer_out = self.inner_layer(lstm_out)
        label_out = self.label_layer(inner_layer_out)
        
        return label_out

# Training:

In [22]:
# setup dataset:
deception_dataset = DeceptionDataset(mode="train",
                                     collection_type="MediaPipe",
                                     data_dir=os.path.join(os.getcwd(), 'mediaPipe_keypoints_data_UPD'),
                                     csv_dir='../data',  # audio_data_UPD,
                                     class_to_num={"truth": 0, "lie": 1},
                                     num_to_class={0: "truth", 1: "lie"},
                                     transform=None,
                                     seconds_input_size=3,
                                     fps_min=29,
                                     keypoints_quantity=478,
                                     coordinate_amount=3)


# derive dataloader from deception_dataset:
caut_dataloader = DataLoader(deception_dataset, batch_size=4, shuffle=True, num_workers=0)

# get lstm input size:
input_dim = deception_dataset.input_dim
print("Input dimension set to:", input_dim)


# initializing model:
lstm_caut_model = LSTM_DeceptionRecognition(lstm_embedding_length=input_dim,
                                            lstm_size=32,
                                            lstm_layers=3,
                                            inner_linear_size=50,
                                            target_size=1)
loss_function = nn.BCELoss()
optimizer = optim.Adam(lstm_caut_model.parameters(), lr=0.1)



# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    for i_batch, batch_data in enumerate(caut_dataloader):
        tmp_data, tmp_label = batch_data
        print("Label:", tmp_label)
        print("Input data:", tmp_data.shape)
        lstm_tmp_result = lstm_caut_model(tmp_data)
        print(lstm_tmp_result)
        break

    
'''
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)
'''

frame_sum: 42201.45619380613
Input dimension set to: 124758
Before flatten keypoints X coord shape: (87, 478, 3)
After flattening keypoints X coord shape: (124758,)
Before flatten keypoints X coord shape: (87, 478, 3)
After flattening keypoints X coord shape: (124758,)
Before flatten keypoints X coord shape: (87, 478, 3)
After flattening keypoints X coord shape: (124758,)
Before flatten keypoints X coord shape: (87, 478, 3)
After flattening keypoints X coord shape: (124758,)
Label: tensor([0, 1, 0, 1])
Input data: torch.Size([4, 124758])


RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors

------------------------------------------------------------------

# Experimental Area:

In [None]:
# print(np.array([[[1,1,1],[2,2,2]], [[3,3,3],[4,4,4]]]).shape)
# np.repeat(np.array([[[1,1,1],[2,2,2]], [[3,3,3],[4,4,4]]]), 2, axis=1).shape