<a href="https://colab.research.google.com/github/MohammadForouhesh/Emotion-Detection/blob/main/EXA_CoLab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Install third-party dependencies into the kernel's own environment.
%pip install -U sentence-transformers
# -O forces the download to (over)write persian.txt. Without it, re-running the
# cell saves the new copy as persian.txt.1 and the notebook keeps reading the old file.
!wget -O persian.txt https://raw.githubusercontent.com/MohammadForouhesh/Emotion-Detection/main/persian.txt
%pip install pytorch-ignite

--2021-08-22 18:01:44--  https://raw.githubusercontent.com/MohammadForouhesh/Emotion-Detection/main/persian.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14933 (15K) [text/plain]
Saving to: ‘persian.txt.1’


2021-08-22 18:01:44 (25.6 MB/s) - ‘persian.txt.1’ saved [14933/14933]

Collecting pytorch-ignite
  Downloading pytorch_ignite-0.4.6-py3-none-any.whl (232 kB)
[K     |████████████████████████████████| 232 kB 4.1 MB/s 
Installing collected packages: pytorch-ignite
Successfully installed pytorch-ignite-0.4.6


# Imports and Parameters

In [10]:
from ignite.metrics import Precision, Recall, Accuracy
from sentence_transformers import SentenceTransformer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from termcolor import colored
from datetime import datetime
from torch import nn
import pandas as pd
import argparse
import warnings
import logging
import torch
import time
import gc
import re

In [7]:
# Default number of training epochs (used as the default of run()).
N_EPOCH = 10

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

# Persian stop-word list, one word per line. The `with` block closes the
# file handle as soon as the content has been read.
with open('persian.txt', 'r', encoding="utf8") as file:
    sw_persian = file.read().splitlines()


# Lowest validation loss seen so far; updated by run().
best_validation_loss = float('inf')

Using cuda device


# Preprocessing

In [11]:
def correction(series:pd.Series):
    """Lazily parse whitespace-separated integer strings into lists of ints.

    For every entry of ``series`` the newline and dot characters are removed,
    the remainder is split on whitespace and each token is converted with
    ``int``.

    Args:
        series: pandas Series whose items are strings such as ``"1 0 0."``.

    Yields:
        list[int]: the integers parsed from one entry, in their original order.

    Raises:
        AssertionError: if ``series`` is not a ``pd.Series``. Because this is
            a generator, the check only runs when iteration starts.
        ValueError: if a token cannot be converted by ``int``.
    """
    assert isinstance(series, pd.Series)

    for line in series:
        cleaned = line.replace('\n', '').replace('.', '')
        # split() with no argument collapses runs of whitespace and ignores
        # leading/trailing blanks; split(' ') produced empty tokens for those
        # inputs and int('') raised ValueError.
        yield [int(token) for token in cleaned.split()]


def remove_emoji(text:str) -> str:
    """Return ``text`` with every run of emoji/symbol code points deleted.

    Matching runs are removed outright (nothing is inserted in their place).

    NOTE: the ``U+24C2-U+1F251`` range is very wide and also covers many
    non-emoji code points that lie between those two values.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.

    Raises:
        AssertionError: if ``text`` is not a ``str``.
    """
    assert isinstance(text, str)

    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F1E6-\U0001F1FF"
        u"\U00002700-\U000027BF"
        u"\U0001F900-\U0001F9FF"
        u"\U0001F600-\U0001F64F"
        u"\U0001F680-\U0001F6FF"
        u"\U00002600-\U000026FF"
    )
    matcher = re.compile(pattern="[" + code_point_ranges + "]+", flags=re.UNICODE)

    stripped = matcher.sub(r'', text)
    return str(stripped)


def remove_redundent_characters(text:str) -> str:
    """Replace mentions, links, digits, Latin letters and punctuation by spaces.

    The substitutions run in a fixed order and each match is replaced by a
    single space, so the result may contain runs of spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.

    Raises:
        AssertionError: if ``text`` is not a ``str``.
    """
    assert isinstance(text, str)

    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)  # Remove @mentions
    text = re.sub(r'_[A-Za-z0-9]+', ' ', text)  # Remove underscore-prefixed tokens
    # Line breaks and tabs. The previous pattern was wrapped in JavaScript-style
    # '/' delimiters, so tabs were only matched when followed by a literal '/'.
    text = re.sub(r'(\r\n)+|\r+|\n+|\t+', ' ', text)
    text = re.sub(r'#', ' ', text)  # Remove the '#' symbol
    text = re.sub(r'RT[\s]+', ' ', text)  # Remove RT
    text = re.sub(r'https?:\/\/\S+', ' ', text)  # Remove hyperlinks
    text = re.sub(r'\([ا-ی]{1,3}\)', ' ', text)  # Remove abbreviations
    text = re.sub(r"[\(\)]", " ", text)  # Remove parentheses
    text = re.sub(r"\d|[۰-۹]", " ", text)  # Remove digits
    text = re.sub(r"&|:", " ", text)
    text = re.sub(r"[A-Za-z]", " ", text)  # Remove Latin letters
    text = re.sub(r"[0-9]", " ", text)
    text = re.sub(r"\"", " ", text)
    text = re.sub(r"\'", " ", text)
    text = re.sub(r"_", " ", text)
    text = re.sub(r"@|=", " ", text)
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    text = re.sub(r"{|}|;|\[|\]|\||؟|!|\+|\-|\*|\$", " ", text)
    # Match each superscript on its own; the previous pattern only matched
    # the two-character sequence "¹²".
    text = re.sub(r"[¹²]|\/", " ", text)
    text = re.sub(r"»|>|<|«|,|؛|،|%|؟", " ", text)
    # Raw string: "\." and "\^" are invalid escape sequences in a normal
    # string literal and trigger a warning when the source is compiled.
    text = re.sub(r"\.|\^|,", " ", text)
    return text


def remove_stop_words(text:str) -> str:
    """Drop every space-separated token of ``text`` that is in ``sw_persian``.

    The text is split on single spaces and the surviving tokens are joined
    with single spaces again, so empty tokens caused by repeated spaces are
    kept unless the empty string itself is listed as a stop word.

    Raises:
        AssertionError: if ``text`` is not a ``str``.
    """
    assert isinstance(text, str)

    kept = []
    for token in text.split(' '):
        if token in sw_persian:
            continue
        kept.append(token)
    return ' '.join(kept)


def preprocess(sentence:str) -> str:
    """Run the full cleaning pipeline on one sentence.

    Steps, in order: emoji removal, removal of redundant characters, then
    stop-word removal.
    """
    without_emoji = remove_emoji(sentence)
    without_noise = remove_redundent_characters(without_emoji)
    return remove_stop_words(without_noise)

# Metrics

In [12]:
def ir_metrics(model, iterator):
    """Print precision, recall and accuracy of ``model`` over ``iterator``.

    The model is switched to evaluation mode and gradient tracking is disabled
    while the predictions are collected, so dropout layers are inactive and no
    autograd graph is built. The model's previous train/eval mode is restored
    afterwards.

    Args:
        model: module called as ``model(seq)`` for every batch.
        iterator: iterable yielding ``(seq, label)`` pairs.

    Returns:
        None. The results are printed: the precision and recall values
        followed by their mean, then the accuracy.
    """
    precision = Precision()
    recall = Recall()
    acc = Accuracy()
    metrics = (precision, recall, acc)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Start accumulation:
            for seq, label in iterator:
                y_pred = model(seq)
                for metric in metrics:
                    metric.update((y_pred, label))
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    # compute() is evaluated once per metric and reused for the mean.
    precision_values = precision.compute()
    recall_values = recall.compute()
    print("Precision: ", precision_values, '\n', precision_values.mean(), '\n')
    print("Recall: ", recall_values, '\n', recall_values.mean(), '\n')
    print("Accuracy : ", acc.compute(), '\n')


def categorical_acc(preds, label):
    """Fraction of rows of ``preds`` whose arg-max equals ``label``.

    Args:
        preds: 2-D tensor of scores; the predicted class is the arg-max along
            dimension 1.
        label: 1-D tensor of target class indices, one per row of ``preds``.

    Returns:
        A float tensor of shape ``(1,)`` holding ``correct / label.shape[0]``,
        located on the same device as the inputs.
    """
    predicted = preds.argmax(dim=1)
    n_correct = predicted.eq(label).sum()
    # Build the denominator on the inputs' device instead of hard-coding
    # torch.cuda.FloatTensor, which fails on machines without CUDA.
    batch_size = torch.tensor([label.shape[0]], dtype=torch.float, device=n_correct.device)
    return n_correct / batch_size


def time_per_epoch(st, et):
    """Split the interval between ``st`` and ``et`` (seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as two ints, both truncated toward zero.
    """
    elapsed = et - st
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

# Models

In [13]:
def trick_f(tensor):
    """Broadcast a 2-D tensor into the 4-D view of shape ``(384, 1, 4, 100)``.

    The two dimensions are swapped, then two trailing singleton dimensions are
    added and expanded to sizes 4 and 100. ``expand`` only broadcasts singleton
    dimensions, so after the swap the tensor must be ``(384, 1)`` (or have size
    1 in the first dimension) for the call to succeed.
    """
    column = tensor.permute(1, 0).unsqueeze(-1)
    widened = column.expand(384, 1, 4)
    return widened.unsqueeze(-1).expand(384, 1, 4, 100)


class CNN(nn.Module):
    """Convolutional classifier head applied to the tensor produced by ``trick_f``.

    One ``Conv2d`` per filter size is applied, each result is max-pooled over
    its remaining length, the pooled features are concatenated, passed through
    dropout and then through two linear layers.

    Args:
        input_size: accepted for signature compatibility; not used.
        embedding_dim: width of every convolution kernel.
        n_filters: number of output channels of every convolution.
        filter_sizes: kernel heights; one convolution is created per entry.
        output_dim: output width of the first linear layer.
        drop_out: dropout probability applied to the concatenated features.
        pad_idx: accepted for signature compatibility; not used.
    """

    # filter_sizes defaults to a tuple rather than a list so the default
    # object cannot be mutated and shared between instances.
    def __init__(self, input_size=384, embedding_dim=100, n_filters=384,
                 filter_sizes=(2, 3, 4), output_dim=42, drop_out=0.5, pad_idx=2):
        super().__init__()
        self.convs = nn.ModuleList([
                     nn.Conv2d(in_channels=1,
                               out_channels=n_filters,
                               kernel_size=(fs, embedding_dim))
                     for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.linear = nn.Linear(output_dim, 1)
        self.dropout = nn.Dropout(drop_out)

    def forward(self, text):
        """Return the scores for ``text`` as a tensor of shape ``(1, N)``.

        ``N`` is the first dimension of ``trick_f(text)``.
        """
        embedded = trick_f(text)
        # Each convolution spans the full last dimension, which is then squeezed away.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-pool over the whole remaining length of every feature map.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        out_matrix = self.fc(cat)
        # The final layer gives one value per row; permute turns the column into a row.
        return self.linear(out_matrix).permute(1, 0)

In [14]:
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear layer.

    The recurrent state is kept in ``self.hidden_cell`` and is overwritten by
    every ``forward`` call; callers that want a fresh state must reset that
    attribute themselves.
    """

    def __init__(self, input_size=384, hidden_layer_size=150, output_size=10):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size, num_layers=1)
        self.linear = nn.Linear(hidden_layer_size, output_size)

        # Randomly initialised (hidden, cell) state placed on the global device.
        state_shape = (1, 1, self.hidden_layer_size)
        hidden = torch.rand(*state_shape).to(device)
        cell = torch.rand(*state_shape).to(device)
        self.hidden_cell = (hidden, cell)

    def forward(self, input_seq):
        """Feed ``input_seq`` through the LSTM as one sequence with batch size 1.

        Returns:
            Tensor with one row of ``output_size`` scores per item of ``input_seq``.
        """
        steps = len(input_seq)
        sequence = input_seq.view(steps, 1, -1)
        lstm_out, self.hidden_cell = self.lstm(sequence, self.hidden_cell)
        return self.linear(lstm_out.view(steps, -1))

# Running

In [26]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss and accuracy of ``model`` without training.

    The model is put into evaluation mode and all batches are processed with
    gradient tracking disabled.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over ``len(iterator)`` batches.
    """
    model.eval()
    batch_losses = []
    batch_accuracies = []
    with torch.no_grad():
        for seq, label in iterator:
            preds = model(seq)
            batch_losses.append(criterion(preds, label).item())
            batch_accuracies.append(categorical_acc(preds, label).item())
    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accuracies) / n_batches

In [27]:
def train(model, iterator, optimizer, criterion, if_lstm=False):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: module to optimise; put into training mode here.
        iterator: iterable of ``(seq, label)`` batches that supports ``len``.
        optimizer: optimiser stepped once per batch.
        criterion: loss called as ``criterion(preds, label)``.
        if_lstm: when true, ``model.hidden_cell`` is replaced by a fresh random
            state before every batch.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over ``len(iterator)`` batches.
    """
    model.train()
    running_loss = 0
    running_acc = 0
    for seq, label in iterator:
        optimizer.zero_grad()
        if if_lstm:
            # Fresh random (hidden, cell) state so batches do not share history.
            state_shape = (1, 1, model.hidden_layer_size)
            model.hidden_cell = (torch.rand(*state_shape).to(device),
                                 torch.rand(*state_shape).to(device))
        preds = model(seq)
        loss = criterion(preds, label)
        acc = categorical_acc(preds, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        running_acc += acc.item()
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [28]:
# Lowest validation loss seen so far; shared across calls of run().
best_validation_loss = float('inf')


def run(model, iterator, optimizer, loss_function, n_epoch=N_EPOCH, if_lstm=False,
        valid_iterator=None):
    """Train ``model`` for ``n_epoch`` epochs and checkpoint the best weights.

    After every epoch the model is evaluated; whenever the validation loss is
    lower than the module-level ``best_validation_loss`` the state dict is
    written to ``exa_emotion_classification.pt``. Progress is printed every
    tenth epoch only.

    Args:
        model: module to train.
        iterator: training batches, passed to ``train``.
        optimizer: optimiser passed to ``train``.
        loss_function: criterion used for both training and evaluation.
        n_epoch: number of epochs.
        if_lstm: forwarded to ``train``.
        valid_iterator: batches used for the validation pass. Defaults to
            ``iterator`` (the previous behaviour); pass held-out batches so the
            "Valid" figures and the checkpoint choice are not based on the
            training data.

    Returns:
        The same ``model`` object, in its state after the last epoch (not
        reloaded from the best checkpoint).
    """
    global best_validation_loss
    if valid_iterator is None:
        valid_iterator = iterator

    for epoch in range(n_epoch):
        start_time = time.time()
        train_loss, train_acc = train(model, iterator, optimizer, criterion=loss_function, if_lstm=if_lstm)

        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion=loss_function)

        end_time = time.time()
        epoch_mins, epoch_secs = time_per_epoch(start_time, end_time)

        if valid_loss < best_validation_loss:
            best_validation_loss = valid_loss
            torch.save(model.state_dict(), 'exa_emotion_classification.pt')

        # Only report every tenth epoch.
        if (epoch + 1) % 10 != 0: continue
        print(f'Epoch {epoch + 1}, Time: {epoch_mins} mins: {epoch_secs} secs')
        print(f'\t Train Loss {train_loss:.3f}, Train Acc {train_acc * 100:.3f}')
        print(f'\t Valid Loss {valid_loss:.3f}, Valid Acc {valid_acc * 100:.3f}')

    return model


# Main

In [37]:
# Make sure automatic garbage collection is switched on.
gc.enable()
# Suppress SyntaxWarning messages from here on.
warnings.filterwarnings("ignore", category=SyntaxWarning)


# Sentence-embedding model shared by the notebook; main() calls
# emb_model.encode() on the preprocessed sentences.
# NOTE(review): the 384 hard-coded in trick_f and in the model defaults
# presumably matches this model's embedding width — confirm.
emb_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


def preparation(train_path, test_path, split_index=None) -> (pd.DataFrame, pd.DataFrame, dict):
    """Load, label-encode and preprocess the train and test CSV files.

    Both files are read and stacked so that the label-to-id mapping is shared,
    then the ``label`` column is parsed with ``correction`` and a
    ``preprocessed`` column is derived from ``text`` with ``preprocess``.

    Args:
        train_path: path of the training CSV (needs ``text`` and ``label`` columns).
        test_path: path of the test CSV (same columns).
        split_index: row position at which the stacked frame is cut back into
            train and test parts. Defaults to the number of rows read from
            ``train_path``.

    Returns:
        ``(train_df, test_df, category_to_id)`` where ``category_to_id`` maps
        every original label value to its integer ``category_id``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    if split_index is None:
        # Cut exactly where the training rows end instead of at a fixed row number.
        split_index = len(train_df)

    # DataFrame.append was removed in pandas 2.0; concat stacks the frames
    # the same way and keeps the original row labels.
    df = pd.concat([train_df, test_df])
    df['category_id'] = df['label'].factorize()[0]

    # The mapping is built before the label column is rewritten below, so the
    # keys are the raw label values as read from the CSV files.
    category_id_df = df[['label', 'category_id']].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)

    df['label'] = list(correction(df['label']))
    df['preprocessed'] = df['text'].apply(preprocess)

    return df.iloc[:split_index], df.iloc[split_index:], category_to_id


def _build_dataset(frame):
    """Embed ``frame.preprocessed`` and pair the vectors with ``frame.category_id``."""
    embeddings = emb_model.encode(list(frame.preprocessed))
    inputs = torch.from_numpy(embeddings).to(device)
    # Build the targets on the selected device rather than through
    # torch.cuda.LongTensor, which fails when no GPU is present.
    target = torch.tensor(frame.category_id.to_numpy(), dtype=torch.long, device=device)
    return TensorDataset(inputs, target)


def main():
    """Train the selected model on Emotion.csv and print train and test metrics.

    Raises:
        ValueError: if ``model_name`` is neither ``'lstm'`` nor ``'cnn'``.
    """
    train_path = 'Emotion.csv'
    test_path = 'EmotionTest.csv'
    model_name = 'lstm'
    epoch = 60

    if model_name not in ('lstm', 'cnn'):
        # Fail early with a clear message instead of an unbound-variable
        # error after training would have been skipped.
        raise ValueError('Unknown model_name: ' + repr(model_name))

    train_df, test_df, category_to_id = preparation(train_path, test_path)

    train_ds = _build_dataset(train_df)
    test_ds = _build_dataset(test_df)

    batch_size = 5 if model_name == 'lstm' else 1

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size, shuffle=True)

    loss_function = nn.CrossEntropyLoss()
    loss_function = loss_function.to(device)

    print(colored('[' + str(datetime.now().hour) + ':' + str(datetime.now().minute) + ']', 'cyan'),
          colored('\n====================TRAIN='+model_name+'=====================', 'red'))
    if model_name == 'lstm':
        model = LSTM(output_size=len(category_to_id)).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), amsgrad=True)

        trained_model = run(model=model, iterator=train_dl, optimizer=optimizer,
                            loss_function=loss_function, n_epoch=epoch, if_lstm=True)
    else:
        model = CNN(output_dim=len(category_to_id)).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), amsgrad=True)

        # NOTE(review): this branch does not pass n_epoch=epoch, so it trains
        # for run()'s default number of epochs — confirm this is intended.
        trained_model = run(model=model, iterator=train_dl, optimizer=optimizer,
                            loss_function=loss_function)

    ir_metrics(model=trained_model, iterator=train_dl)

    print(colored('[' + str(datetime.now().hour) + ':' + str(datetime.now().minute) + ']', 'cyan'),
          colored('\n====================Test=='+model_name+'=====================', 'red'))

    ir_metrics(model=trained_model, iterator=test_dl)
        
# Script entry point: configure logging, silence warnings for the duration
# of the run, and print a timestamp before and after main().
if __name__ == '__main__':
    # catch_warnings restores the previous warning filters when the block exits.
    with warnings.catch_warnings():
        # INFO-level log records are written to exa_model.log.
        logging.basicConfig(filename='exa_model.log', format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        # Ignore every warning category while main() runs.
        warnings.filterwarnings("ignore")
        # Start banner with the current hour and minute.
        print(colored('[' + str(datetime.now().hour) + ':' + str(datetime.now().minute) + ']', 'cyan'),
              colored('\n===============EXA=EmotionDetection===============', 'red'))
        main()
        # Closing timestamp.
        print(colored('[' + str(datetime.now().hour) + ':' + str(datetime.now().minute) + ']', 'cyan'))


[36m[19:18][0m [31m


Batches:   0%|          | 0/188 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[36m[19:18][0m [31m
Epoch 10, Time: 0 mins: 4 secs
	 Train Loss 1.899, Train Acc 41.383
	 Valid Loss 1.787, Valid Acc 44.092
Epoch 20, Time: 0 mins: 5 secs
	 Train Loss 1.271, Train Acc 62.092
	 Valid Loss 1.113, Valid Acc 67.996
Epoch 30, Time: 0 mins: 4 secs
	 Train Loss 0.788, Train Acc 78.350
	 Valid Loss 0.683, Valid Acc 82.450
Epoch 40, Time: 0 mins: 4 secs
	 Train Loss 0.504, Train Acc 88.613
	 Valid Loss 0.429, Valid Acc 91.100
Epoch 50, Time: 0 mins: 4 secs
	 Train Loss 0.356, Train Acc 92.867
	 Valid Loss 0.297, Valid Acc 94.200
Epoch 60, Time: 0 mins: 4 secs
	 Train Loss 0.276, Train Acc 94.917
	 Valid Loss 0.225, Valid Acc 96.267
Precision:  tensor([0.9461, 0.9449, 0.9537, 0.8785, 0.9886, 1.0000, 0.9922, 0.9909, 1.0000,
        0.9695, 1.0000, 0.9706, 0.9664, 1.0000, 0.9694, 0.9951, 0.9317, 0.9907,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000,