# X & X

In [1]:
import pandas as pd
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#from utils_3A import img_frac, find, postionning, emot_grid, data_video, flatten, cleaning, datafromto
%matplotlib inline
%autosave 60

Autosaving every 60 seconds


## Data preprocessing

In [2]:
def img_frac(img, n, m):
    """
    fractionne l'image en m lignes et n colonnes
    """
    height, width=img.shape[:2]
    x_div=[]
    y_div=[]
    i, j=int(width/n), int(height/m)
    for l in range(1,n):
        x_div.append(l*i)
    for l in range(1,m):
        y_div.append(l*j)

    return (x_div, y_div, img)

def find(x_c, x):
    """
    Position de x_c dans x
    """
    x=[0]+x
    for i in range(len(x)-1):
        if (x_c>=x[i] and x_c<x[i+1]):
            return i
    return i+1

def postionning(frame):
    """
    positionnement de chaque individu dans la grille de l'image
    """
    img_path='Analyse_Json_Stade/Frames 9674/'+str(frame)+'.jpg'
    img=cv2.imread(img_path)
    (x_div, y_div, img)=img_frac(img, n, m)
    base=pd.read_json('Analyse_Json_Stade/AS5I9674/frame_'+str(frame)+'.json')
    base=base[base.type=='person']
    pos_x=[]
    pos_y=[]
    for k in range(len(base)):
        rectangle=base.iloc[k]['data']['rectangle']
        x_c=rectangle['x']+rectangle['width']/2
        y_c=rectangle['y']+rectangle['height']/2
        pos_x.append(find(int(x_c), x_div))
        pos_y.append(find(int(y_c), y_div))
    return (pos_x, pos_y)

def emot_grid(frame, m, n):
    """
    Emotions dans chaque grille de l'image
    """
    (pos_x, pos_y)=postionning(frame)
    base=pd.read_json('Analyse_Json_Stade/AS5I9674/frame_'+str(frame)+'.json')
    base=base[base.type=='person']
    emotions=[[{'anger':0, 'fear':0, 'happiness':0, 'neutral':0, 'sadness':0, 'surprise':0} for _ in range(n)] for _ in range(m)]
    counts=np.zeros((m,n))
    for k in range(len(base)):
        emot=base.iloc[k]['data']['emotion']
        j=pos_x[k]
        i=pos_y[k]
        counts[i][j]+=1
    
        emotions[i][j]['anger']+=emot['anger']
        emotions[i][j]['fear']+=emot['fear']
        emotions[i][j]['happiness']+=emot['happiness']
        emotions[i][j]['neutral']+=emot['neutral']
        emotions[i][j]['sadness']+=emot['sadness']
        emotions[i][j]['surprise']+=emot['surprise']
    for i in range(m):
        for j in range(n):
            if counts[i][j]!=0:
                for sent in ['anger', 'fear', 'happiness', 'neutral', 'sadness', 'surprise']:
                    emotions[i][j][sent]/=counts[i][j]
    return (emotions, counts)

def data_video(video, n, m):
    emotions_video=[]
    counts_video=[]
    i=0
    while os.path.exists('Analyse_Json_Stade/Frames '+str(video)+'/%d.jpg' % i):   
        (emotions, counts)=emot_grid(i, m, n)
        emotions_video.append(emotions)
        counts_video.append(counts)
        i+=1
        if i%500==0:
            print(i)
    return (emotions_video, counts_video)

def flatten(l):
    flatten_emotions=[0 for i in range(len(l))]
    for k in range(len(l)):
        flatten_emotions[k]=[list(l[k][i][j].values()) for i in range(m) for j in range(n)] 
    return flatten_emotions

def cleaning(l):
    for k in range(len(l)):
        for m in range(len(l[k])):
            if sum(l[k][m])==0:
                l[k][m]=[0.01, 0.01, 0.01, 0.95, 0.01, 0.01]#neutral
    return l
    
def datafromto(l, fps, start, end):
    n_i=start*fps
    n_f=end*fps
    x=l[n_i:n_f]
    x=torch.tensor(np.array(x), requires_grad=False)
    x=x.type(torch.FloatTensor)
    return x

def metric(a,b):
    return int(a.index(min(a))==b.index(min(b)))

In [3]:
n, m=15, 10
video='9674'
T_enc=40
T_dec=10
fps=24

(emotions_video, counts_video)=data_video(video, n, m)
flatten_emotions=flatten(emotions_video)
flatten_emotions=cleaning(flatten_emotions)

tensor_enc=datafromto(flatten_emotions, fps, 0, T_enc)
tensor_dec=datafromto(flatten_emotions, fps, T_enc, T_enc+T_dec)

500
1000
1500


## Modélisation

### The Seq2Seq Model


A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A `Sequence to Sequence network <http://arxiv.org/abs/1409.3215>`__, or
seq2seq network, or `Encoder Decoder
network <https://arxiv.org/pdf/1406.1078v3.pdf>`, is a model
consisting of two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence.

<img src=https://pytorch.org/tutorials/_images/seq2seq.png>

### The Encoder

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence.

In [4]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size1,hidden_size2, hidden_size3):
        super(EncoderRNN, self).__init__()
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.hidden_size3 = hidden_size3

        self.ih1 = nn.Linear(input_size + hidden_size1, hidden_size1)
        self.h1h2 = nn.Linear(hidden_size1 + hidden_size2, hidden_size2)
        self.h2h3 = nn.Linear(hidden_size2 + hidden_size3, hidden_size3)
        self.h3o = nn.Linear(hidden_size3, input_size)

        
    def forward(self, input, hidden1, hidden2, hidden3):
        combined1 = torch.cat((input, hidden1), 1)
        hidden1 = self.ih1(combined1)
        hidden1=F.relu(hidden1)
        
        combined2 = torch.cat((hidden1, hidden2), 1)
        hidden2 = self.h1h2(combined2)
        hidden2=F.relu(hidden2)
        
        combined3 = torch.cat((hidden2, hidden3), 1)
        hidden3 = self.h2h3(combined3)
        hidden3=F.relu(hidden3)
        
        output = self.h3o(hidden3)
        output=F.tanh(output)
        output = F.log_softmax(output)#softmax
        return output, hidden1, hidden2, hidden3

    def initHidden(self, batch_size):
        #initialization of c(cell's state) et h(hidden state) h=Otanh(c)
        return (torch.zeros(batch_size, self.hidden_size1, device=device),
                torch.zeros(batch_size, self.hidden_size2, device=device),
                torch.zeros(batch_size, self.hidden_size3, device=device))

#### Test encodeur

input_size=6
hidden_size1=10
hidden_size2=20
hidden_size3=10

encoder=EncoderRNN(input_size, hidden_size1,hidden_size2, hidden_size3)
X_enc=tensor_enc[0]

batch_size=X_enc.shape[0]
hidden1_enc, hidden2_enc, hidden3_enc= encoder.initHidden(batch_size)
output_enc, hidden1_enc, hidden2_enc, hidden3_enc=encoder(X_enc, hidden1_enc, hidden2_enc, hidden3_enc)

output_enc[0]

### The Decoder

The decoder is another RNN that takes the encoder output vector(s) and
outputs a sequence of values.

Simple Decoder


In the simplest seq2seq decoder we use only last output of the encoder.
This last output is sometimes called the *context vector* as it encodes
context from the entire sequence. This context vector is used as the
initial hidden state of the decoder.

At every step of decoding, the decoder is given an input and hidden state, 
and the first hidden state is the context vector (the encoder's
last hidden state).

In [5]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size1, hidden_size2, hidden_size3, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.hidden_size3 = hidden_size3

        self.ih1 = nn.Linear(output_size + hidden_size1, hidden_size1)
        self.h1h2 = nn.Linear(hidden_size1 + hidden_size2, hidden_size2)
        self.h2h3 = nn.Linear(hidden_size2 + hidden_size3, hidden_size3)
        self.h3o = nn.Linear(hidden_size3, output_size)

    def forward(self, input, hidden1, hidden2, hidden3):
        combined1 = torch.cat((input, hidden1), 1)
        hidden1 = self.ih1(combined1)
        hidden1=F.relu(hidden1)
        
        combined2 = torch.cat((hidden1, hidden2), 1)
        hidden2 = self.h1h2(combined2)
        hidden2=F.relu(hidden2)
        
        combined3 = torch.cat((hidden2, hidden3), 1)
        hidden3 = self.h2h3(combined3)
        hidden3=F.relu(hidden3)
        
        output = self.h3o(hidden3)
        output=F.tanh(output)
        output = F.log_softmax(output)#softmax
        return output, hidden1, hidden2, hidden3

    def initHidden(self, batch_size):
        #initialization of c(cell's state) et h(hidden state) h=Otanh(c)
        return (torch.zeros(batch_size, self.hidden_size1, device=device),
                torch.zeros(batch_size, self.hidden_size2, device=device),
                torch.zeros(batch_size, self.hidden_size3, device=device))

#### Test Decoder

input_size=6
hidden_size1=10
hidden_size2=20
hidden_size3=10
output_size=input_size

encoder=EncoderRNN(input_size, hidden_size1, hidden_size2, hidden_size3)
X_enc=tensor_enc[0]

batch_size=X_enc.shape[0]
hidden1_enc, hidden2_enc, hidden3_enc= encoder.initHidden(batch_size)
output_enc, hidden1_enc, hidden2_enc, hidden3_enc=encoder(X_enc, hidden1_enc, hidden2_enc, hidden3_enc)

decoder=DecoderRNN(hidden_size1, hidden_size2, hidden_size3, output_size)
X_dec=tensor_dec[0]

hidden1_dec, hidden2_dec, hidden3_dec=hidden1_enc, hidden2_enc, hidden3_enc
output_dec, hidden1_dec, hidden2_dec, hidden3_dec=decoder(X_dec, hidden1_dec, hidden2_dec, hidden3_dec)

output_dec[0]

## Training the Model

To train we run the input through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given its first input, and the last hidden state of the
encoder as its first hidden state.

"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next input.
Using teacher forcing causes it to converge faster but `when the trained
network is exploited, it may exhibit
instability <http://minds.jacobs-university.de/sites/default/files/uploads/papers/ESNTutorialRev.pdf>`__.

In [6]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    batch_size=input_tensor.shape[1]
    hidden1_enc, hidden2_enc, hidden3_enc= encoder.initHidden(batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.shape[0]
    target_length = target_tensor.shape[0]

    loss = 0
    metrics=0
    for ei in range(input_length):
        output_enc, hidden1_enc, hidden2_enc, hidden3_enc=encoder(input_tensor[ei], hidden1_enc, hidden2_enc, hidden3_enc)

    hidden1_dec, hidden2_dec, hidden3_dec=hidden1_enc, hidden2_enc, hidden3_enc
    # Teacher forcing: Feed the target as the next input
    for di in range(target_length):
        output_dec, hidden1_dec, hidden2_dec, hidden3_dec=decoder(target_tensor[di], hidden1_dec, hidden2_dec, hidden3_dec)
        loss += criterion(output_dec, target_tensor[di])
        a=[metric(list(output_dec[k]), list(target_tensor[di][k])) for k in range(output_dec.shape[0])]
        metrics+=sum(a)/output_dec.shape[0]
        
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    #return loss.item()/target_length
    return (loss.item()/target_length, metrics/target_length)

In [7]:
import time
import math

def asMinutes(s):
    """
    Transforme les secondes en minutes et secondes
    """
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.




In [8]:
def trainIters(input_tensor, target_tensor, encoder, decoder, n_iters, print_every, learning_rate):
    start = time.time()
    print_loss_total = 0  # Reset every print_every
    print_metrics_total = 0
    
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.MSELoss()
    batch_size=input_tensor.shape[0]

    for iter in range(1, n_iters + 1):
        #penser à la permutation
        #loss= train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss, metrics = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        print_metrics_total += metrics

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_metrics_avg = print_metrics_total / print_every
            print_loss_total = 0
            print_metrics_total = 0
            
            print('# %s (%d %d%%) (%.4f %.2f%%)' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg, print_metrics_avg*100))
            

With all these helper functions in place, we can actually initialize a network and start training.



In [9]:
input_size=6
hidden_size1=10
hidden_size2=20
hidden_size3=10
output_size=input_size

n_iters=20000
print_every=1000
learning_rate=0.01

input_tensor=tensor_enc
target_tensor=torch.tensor(np.log(np.array(tensor_dec)), requires_grad=False).type(torch.FloatTensor)

encoder=EncoderRNN(input_size, hidden_size1, hidden_size2, hidden_size3).to(device)
decoder=DecoderRNN(hidden_size1, hidden_size2, hidden_size3, output_size).to(device)

In [10]:
trainIters(input_tensor, target_tensor, encoder, decoder, n_iters, print_every, learning_rate)



# 5m 48s (- 52m 19s) (100 10%) (0.4958 43.87%)
# 11m 49s (- 47m 17s) (200 20%) (0.2973 50.49%)
# 18m 10s (- 42m 25s) (300 30%) (0.2626 58.65%)
# 23m 45s (- 35m 38s) (400 40%) (0.2581 58.93%)
# 29m 20s (- 29m 20s) (500 50%) (0.2564 59.85%)
# 35m 23s (- 23m 35s) (600 60%) (0.2541 60.14%)
# 41m 29s (- 17m 47s) (700 70%) (0.2546 59.72%)
# 48m 5s (- 12m 1s) (800 80%) (0.2539 60.75%)
# 54m 26s (- 6m 2s) (900 90%) (0.2511 62.66%)
# 60m 36s (- 0m 0s) (1000 100%) (0.2482 62.75%)
