To be ran in Google Colab with downloaded GoEmotions: data
email `elguezabala@merrimack.edu` to be added to the Shared Drive containing data.
Data is also in github repository ```/server/data``` directory.

In [32]:
!pip install flask_cors
!pip install waitress



Mount google drive to upload the GoEmotions Data

In [33]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Tensor Flow
import tensorflow as tf
import tensorflow_hub as hub

# Numpy and Pandas and torch
import numpy as np
import pandas as pd

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# JSON
import json as json

# Beautiful Soup
import requests
from bs4 import BeautifulSoup

# TOML
import toml

# System Utils
import random
import time

# Transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Sklearn
from sklearn.model_selection import train_test_split

# sys
import sys

In [84]:
# Class built for our BERT Classifier.
# Source: https://skimai.com/fine-tuning-bert-for-sentiment-analysis/
class BertClassifier(nn.Module):

    # Constructor, creates the classifier
    def __init__(self, freeze_bert = True):
        """
        @param  bert_model: BertModel object
        @param  freeze_bert (bool): Set `False` to fine-tune the BERT model (according to the resource)
        @param  classifier: torch.nn.Module classifier
            #    Note: torch.nn.Module is used to help train / build        #
            #       neural networks, so we will build on BERT with this     #
        """
        super(BertClassifier, self).__init__()
        # Specify output size of BERT model (768), hidden size of our classifier (50), and number of labels (28)
        D_in, H, D_out = 768, 50, 28


        # Instantiate the pre-trained BERT tokenizer and model
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate one-layer feed-forward classifer
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H), # Applies linear transformation
            nn.ReLU(),
            nn.Linear(H, D_out)
        )
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert_model.parameters():
                param.requires_grad = False

    # Define a feed-forward function to compute the logits
    # Logits = Output of logistic regression function (done through NN) (between 0 and 1)
    def forward(self, ids, mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask information
        @return   logits (torch.Tensor): an output tensor
        """
        # Feed input to BERT
        outputs = self.bert_model(input_ids=ids, attention_mask=mask)

        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [93]:
class Model:
    # Constructor, creates the classifier
    def __init__(self, dataframe, val_dataset):
        """
        @param  bert_classifier: BertClassifier object
        @param  tokenizer: BertTokenizer object
        """

    # TRAINING DATA
        # Load the CSV data (expected to run from folder: "./Artificial Intelligence/Spotify_NLP_Service")
        df = dataframe

        # Define the emotion labels
        self.emotion_labels = ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
                        "confusion", "curiosity", "desire", "disappointment", "disapproval",
                        "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
                        "joy", "love", "nervousness", "optimism", "pride", "realization",
                        "relief", "remorse", "sadness", "surprise", "neutral"]

        # Step 1: Tokenize inputs so BERT can read it.
            # Initialize the tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            # Tokenize the inputs
        train_input_ids, train_attention_masks, train_labels = self.tokenize_inputs(df)

        # Step 2: Load the data into a DataLoader object
            # Resource(s): https://www.youtube.com/watch?v=mw7ay38--ak
        train_data_loader = self.to_data_loader(train_input_ids, train_attention_masks, train_labels)

        # Step 3: Initialize the BertClassifier, optimizer, and the training scheduler
            # (train_data_loader is used for sizing)
            # Initialize the model for training purposes
        classifier, optimizer, scheduler = self.initialize_model(train_data_loader)

        self.bert_classifier = classifier
        self.optimizer = optimizer
        self.scheduler = scheduler

        # Step 4: Train the data
        self.train(train_data_loader=train_data_loader, validation_df=val_dataset)

    # Initializing the model, optimizer, and learning rate scheduler for training
        # Source: https://skimai.com/fine-tuning-bert-for-sentiment-analysis/
    def initialize_model(self, train_data_loader, epochs=4):
        # Initialize the classifier model, the optimizer, and the learning rate scheduler
        classifier = BertClassifier(freeze_bert=False) #Instantiate the model
        # Try to use GPU (cuda). Otherwise, we will have to use CPU
        self.device = None
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            print(f'There are {torch.cuda.device_count()} GPU(s) available.')
            print('Device name:', torch.cuda.get_device_name(0))
        else:
            print('No GPU available, using the CPU instead.')
            self.device = torch.device("cpu")

        # Assign the model to hardware
        classifier.to(self.device)

        # Create an AdamW optimizer
        optimizer = torch.optim.AdamW(classifier.parameters(),
                        lr=5e-5,   # Best learning rate (lr) described. Also default
                        eps=1e-8)  # Default epsilon value

        # Total number of training steps for the lr scheduler
        total_steps = len(train_data_loader) * epochs

        # Set up learning rate scheduler for our model
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0, #Default
                                                    num_training_steps=total_steps)

        return classifier, optimizer, scheduler


    # Define a function to tokenize the input and prepare it for fine-tuning
    def tokenize_inputs(self, data):
        input_ids = []          # Holds tensor information for the word in context
        attention_masks = []    # Holds binary data on what's important (1 for tensor data, 0 for filler)
        labels = []             # Holds binary data on emotions (1 for has emotion, 0 for doesn't)

        # Iterate through each sentence and emotion labels associated with that sentence for each row.
        for sentence, emotions in zip(data['text'], data[self.emotion_labels].values):
            # Tokenize the input sentence
            # Tokenizer understanding credit: https://towardsdatascience.com/how-to-train-a-bert-model-from-scratch-72cfce554fc6
            encoded_dict = self.tokenizer.encode_plus(
                                sentence,
                                add_special_tokens = True,
                                max_length = 128,
                                padding = 'max_length',
                                truncation=True,
                                return_attention_mask = True,
                                return_tensors = 'pt'
                        )

            input_ids.append(encoded_dict['input_ids'])

            # Add the attention mask for the encoded sentence to the list
            attention_masks.append(encoded_dict['attention_mask'])

            # Add the labels to the list
            labels.append(emotions)

        # If input_ids is empty, then return None, None, None
        if not input_ids:
            return None, None, None

        # Convert the lists to tensors
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        labels = torch.tensor(labels)

        # Return the tokenized inputs and labels
        return input_ids, attention_masks, labels

    def to_data_loader(self, input_ids, attention_masks, labels):
        # Place data in a
            # Place data in a PyTorch DataLoader (faster training / less resources)
        data = TensorDataset(input_ids, attention_masks, labels)
        #data_sampler = RandomSampler(data)
        data_loader = DataLoader(data, batch_size=64, shuffle=True) # removed sampler=data_sampler,
        return data_loader


    # Define a function to evaluate a model
    def eval(self, df):

        # Tokenize inputs from the df
        input_ids, attention_masks, labels = self.tokenize_inputs(df)

        # Do a null check (if there's no lyrics to tokenize, return an empty list)
        if(input_ids is None):
            return []


        # Convert to a data loader
        dataloader = self.to_data_loader(input_ids, attention_masks, labels)

        # Put the model into the evaluation mode. The dropout layers are disabled during
        # the test time.
        self.bert_classifier.eval()

        all_logits = []

        # For each batch in our test set...
        for batch in dataloader:
            # Load batch to GPU
            b_input_ids, b_attn_mask = tuple(t.to(self.device) for t in batch)[:2]

            # Compute logits
            with torch.no_grad():
                logits = self.bert_classifier(b_input_ids, b_attn_mask)
            all_logits.append(logits)

        # Concatenate logits from each batch
        all_logits = torch.cat(all_logits, dim=0)

        # Apply softmax to calculate probabilities
        probs = F.softmax(all_logits, dim=1).cpu().numpy()

        #print(probs.shape)
        #print(df.shape)

        # Determine the max value index in each song and append it to the eval_results array
        eval_dataframe = [x.argmax() for x in probs]

        print(eval_dataframe)

        # Create the dataframe for validation
        val_dataframe = []
        for index, row in df.iterrows():
          arr = row.values
          arr = row[-28:]
          numpy_array = arr.values
          val_dataframe.append(numpy_array.argmax())

        print(val_dataframe)

        matches = [xi == yi for xi, yi in zip(eval_dataframe, val_dataframe)]

        total_elements = len(eval_dataframe)

        print(matches)
        print(sum(matches))

        similarity_percentage = (sum(matches) / total_elements) * 100

        return similarity_percentage


    def train(self, train_data_loader, validation_df=None, epochs=4, evaluation=True):
        loss_fn = nn.CrossEntropyLoss()

        """Train the BertClassifier model."""
        # Start training loop
        print("Start training...\n")
        for epoch_i in range(epochs):
            # =======================================
            #               Training
            # =======================================
            # Print the header of the result table
            print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
            print("-"*70)

            # Measure the elapsed time of each epoch
            t0_epoch, t0_batch = time.time(), time.time()

            # Reset tracking variables at the beginning of each epoch
            total_loss, batch_loss, batch_counts = 0, 0, 0

            # Put the model into the training mode
            self.bert_classifier.train()

            # Print out the amount of steps
            print(f'Steps per epoch: {len(train_data_loader)}')

            # For each batch of training data...
            for step, batch in enumerate(train_data_loader):
                batch_counts +=1
                # Load batch to GPU
                b_input_ids, b_attn_mask, b_labels = tuple(t.to(self.device) for t in batch)

                # Zero out any previously calculated gradients
                self.bert_classifier.zero_grad()

                # Perform a forward pass. This will return logits.
                logits = self.bert_classifier(b_input_ids, b_attn_mask)

                # Compute loss and accumulate the loss values
                loss = loss_fn(logits, b_labels.float())
                batch_loss += loss.item()
                total_loss += loss.item()

                # Perform a backward pass to calculate gradients
                loss.backward()

                # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
                torch.nn.utils.clip_grad_norm_(self.bert_classifier.parameters(), 1.0)

                # Update parameters and the learning rate
                self.optimizer.step()
                self.scheduler.step()

                # Print the loss values and time elapsed for every 20 batches
                if (step % 20 == 0 and step != 0) or (step == len(train_data_loader) - 1):
                    # Calculate time elapsed for 20 batches
                    time_elapsed = time.time() - t0_batch

                    # Print training results
                    print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                    # Reset batch tracking variables
                    batch_loss, batch_counts = 0, 0
                    t0_batch = time.time()

            # Calculate the average loss over the entire training data
            avg_train_loss = total_loss / len(train_data_loader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            probs = self.eval(validation_df)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"  Epoch | {'-':^7} | {avg_train_loss:^12.6f} | Model Evlaution | Time Elapsed (s)")
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {probs} | {time_elapsed:^9.2f}")
            print(f"Testing Accuracy")
            print(f"Evaluation Result.")
            #print(probs.shape)
            #tf.print(probs)
            print("-"*70)
        print("\n")

        print("Training complete!")



Train the model


In [94]:
# Data path
path = '/content/drive/Shareddrives/Spotify NLP Service/goemotions_total.csv'

# Validation path
test = '/content/drive/Shareddrives/Spotify NLP Service/testemotions_1_fixed.csv'

# Build the model
model = Model(pd.read_csv(path), pd.read_csv(test))


There are 1 GPU(s) available.
Device name: NVIDIA A100-SXM4-40GB
Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
Steps per epoch: 3301
   1    |   20    |   3.664416   |     -      |     -     |   6.07   
   1    |   40    |   3.470966   |     -      |     -     |   5.90   
   1    |   60    |   3.325488   |     -      |     -     |   5.90   
   1    |   80    |   3.154606   |     -      |     -     |   5.91   
   1    |   100   |   2.907506   |     -      |     -     |   5.90   
   1    |   120   |   2.992801   |     -      |     -     |   5.91   
   1    |   140   |   2.895917   |     -      |     -     |   5.90   
   1    |   160   |   2.856969   |     -      |     -     |   5.90   
   1    |   180   |   2.702466   |     -      |     -     |   5.90   
   1    |   200   |   2.719476   |     -      |     -     |   5.90   
   1    |   220   |   2.630747   |     -      |     -

In [95]:
# Validation path
test = '/content/drive/Shareddrives/Spotify NLP Service/testemotions_1_fixed.csv'

print("-"*70)
# =======================================
#               Evaluation
# =======================================
# After the completion of each training epoch, measure the model's performance
# on our validation set.
probs = model.eval(pd.read_csv(test))

# Print performance over the entire training data
print(f"Evluation Accuracy: {probs}")
print("-"*70)
print("\n")

print("Training complete!")

----------------------------------------------------------------------
[4, 27, 0, 15, 9, 27, 6, 2, 7, 12, 27, 2, 27, 27, 27, 21, 27, 4, 6, 20, 0, 2, 2, 3, 0, 0, 0, 27, 27, 0, 15, 27, 27, 27, 10, 27, 27, 1, 2, 4, 18, 20, 7, 11, 5, 27, 14, 11, 27, 15, 27, 3, 3, 27, 22, 27, 0, 27, 0, 17, 27, 15, 6, 14, 11, 15, 6, 24, 7, 27, 18, 27, 22, 27, 27, 1, 18, 27, 24, 10, 27, 27, 7, 18, 24, 1, 4, 2, 4, 18, 2, 4, 27, 27, 0, 27, 27, 4, 20, 4, 0, 27, 7, 27, 1, 1, 3, 27, 27, 3, 27, 5, 15, 1, 10, 0, 27, 0, 27, 27, 10, 6, 7, 3, 0, 27, 15, 7, 27, 11, 27, 1, 0, 27, 0, 25, 15, 20, 27, 27, 8, 0, 18, 27, 8, 20, 27, 6, 27, 0, 4, 27, 7, 5, 18, 27, 18, 27, 25, 27, 22, 15, 27, 4, 27, 15, 6, 3, 22, 24, 27, 15, 3, 27, 0, 26, 3, 27, 2, 10, 7, 3, 27, 4, 27, 27, 10, 27, 27, 0, 17, 27, 1, 27, 7, 9, 0, 4, 27, 13, 17, 7, 27, 22, 0, 27, 19, 27, 24, 27, 27, 2, 1, 18, 27, 5, 18, 7, 10, 27, 22, 0, 10, 27, 27, 1, 27, 6, 27, 8, 0, 23, 4, 2, 11, 0, 10, 25, 27, 3, 0, 20, 7, 24, 27, 10, 17, 4, 7, 22, 7, 7, 0, 10, 10, 8, 1, 10, 5,

In [96]:
def construct_Data_Frame_from_Song(song):
    cols = ["text", "admiration", "amusement", "anger", "annoyance", "approval", "caring",
            "confusion", "curiosity", "desire", "disappointment", "disapproval",
            "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
            "joy", "love", "nervousness", "optimism", "pride", "realization",
            "relief", "remorse", "sadness", "surprise", "neutral"]

    df = pd.DataFrame(columns=cols)

    emotions = [x for x in cols if x != "text"]

    df["text"] = song

    df[emotions] = 0

    return df # Return the dataframe of song-lyrics.

# Optimism
df = construct_Data_Frame_from_Song(["I could never find the right way to tell you Have you noticed I've been gone? 'Cause I left behind the home that you made me But I will carry it along And it's a long way forward, so trust in me I'll give them shelter like you've done for me And I know, I'm not alone, you'll be watching over us Until you're gone When I'm older, I'll be silent beside you I know words won't be enough And they won't need to know our names or our faces But they will carry on for us And it's a long way forward, so trust in me I'll give them shelter like you've done for me And I know, I'm not alone, you'll be watching over us Until you're gone Oh, it's a long way forward, trust in me I'll give them shelter like you've done for me And I know, I'm not alone, you'll be watching over us", "This is an awesome song!"])
# Love
df_two = construct_Data_Frame_from_Song(["I wanna love somebody, love somebody like you Whoa, I wanna love you baby."])

res = loaded_model.eval(df)

print(res)

def emotions_occurences(probs):
    labels = ["admiration", "amusement", "anger", "annoyance", "approval", "caring",
        "confusion", "curiosity", "desire", "disappointment", "disapproval",
        "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
        "joy", "love", "nervousness", "optimism", "pride", "realization",
        "relief", "remorse", "sadness", "surprise", "neutral"]
    occurrences = []
    for arr in probs:
        emos = sorted(list(zip(labels, arr)), key = lambda x: x[1], reverse=True)
        occurrences.append(emos[0][0])
    return occurrences

#emotional_value = avg_emotions(probs)
emotional_value = emotions_occurences(res)
print(len(emotional_value))
print(emotional_value)

NameError: name 'loaded_model' is not defined

Saving the model localy

In [97]:
# https://pytorch.org/tutorials/beginner/saving_loading_models.html

version = '1.0.1'
torch.save(model, f"spotify-sentiment-analysis-bert-go-emotions-fine-tuned-{version}")


In [None]:
# Zip the file
from zipfile import ZipFile

version = '1.0.1'
zipf = ZipFile(f"spotify-sentiment-analysis-bert-go-emotions-fine-tuned-{version}.zip","w")
zipf.write("spotify-sentiment-analysis-bert-go-emotions-fine-tuned")
zipf.close()

In [None]:
# Move the file to the shared-drive
!cp "spotify-sentiment-analysis-bert-go-emotions-fine-tuned" "/content/drive/Shareddrives/Spotify NLP Service/"

Loading in a model from google drive


In [None]:
# File path to shared Google drive
file_path = "/content/drive/Shareddrives/Spotify NLP Service/spotify-sentiment-analysis-bert-go-emotions-fine-tuned"
loaded_model = torch.load(file_path)


