# Description

In this notebook, I will train the emojify model, which takes text as input and returns the corresponding emotion (emoji).

In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import xgboost
import joblib

from utils.utils_read_dataset import *
from utils.utils_model import *

In [2]:
# Pre-processed dataset: two arrays read with np.load in section 1.
PATH_FILE_UNICODE = r"data/processed_data/list_processed_unicode.npy"
PATH_FILE_NAME = r"data/processed_data/list_processed_name.npy"

# Pre-trained GloVe vectors, read by load_glove_embeddings.
PATH_FILE_PRETRAIN_GLOVE = r"models/glove.6B.50d.txt"

# Folder where the fitted label encoder and the trained XGBoost model are saved.
PATH_FOLDER_MODEL = r"models"
# Number of candidates requested from get_top_k_prediction in the top-k demo.
TOP_K = 5

# 1. Load dataset

In [3]:
# Read the pre-processed emoji codes and their text names from disk.
list_unicode, list_name = np.load(PATH_FILE_UNICODE), np.load(PATH_FILE_NAME)

# The two arrays are indexed in parallel (entry i of one belongs to entry i
# of the other), so they must have the same length.
assert len(list_name) == len(list_unicode)

In [4]:
# Show one randomly chosen (emoji, name) pair as a quick sanity check.
idx = np.random.randint(len(list_unicode))
unicode, name = list_unicode[idx], list_name[idx]

print(f"Emoji: {convert_unicode_2_emoji(unicode)}")
print(f"Name: {name}")

Emoji: 😡
Name: grimace faces


# 2. Process data

## 2.1. Convert text into vector embedding using GloVe

The input is `list_name`, whose elements are sentences. We need to convert each sentence into a vector using the `sentence_to_vector` function.

In [5]:
# Load the pre-trained GloVe embeddings from disk. The result is passed to
# sentence_to_vector and to the prediction helpers further down.
# load_glove_embeddings comes from the star-imported utils modules.
glove_embed = load_glove_embeddings(PATH_FILE_PRETRAIN_GLOVE)

In [6]:
# Embed every name with GloVe and collect the vectors into one feature array.
X = np.array([sentence_to_vector(sentence, glove_embed) for sentence in list_name])

print(f"Shape of X: {X.shape}")

Shape of X: (7061, 50)


## 2.2. Convert label into numerical value

In [7]:
# Count the distinct emoji codes, i.e. the number of classes to predict.
print(f"Number of target categories: {np.unique(list_unicode).size}")

Number of target categories: 690


In [8]:
# Map each distinct emoji code in list_unicode to an integer class id.
# The fitted encoder is kept around: it is handed to the prediction helpers
# and saved to disk at the end of the notebook.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(list_unicode)

In [11]:
# Hold out 1% of the samples for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified, and the test set is tiny
# (71 rows for 690 classes in the recorded run), so test metrics are noisy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=100)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (6990, 50)
Shape of y_train: (6990,)
Shape of X_test: (71, 50)
Shape of y_test: (71,)


In [12]:
# Compare how many distinct classes ended up in each side of the split.
print(f"Number of target categories on y_train: {np.unique(y_train).size}")
print(f"Number of target categories on y_test: {np.unique(y_test).size}")

Number of target categories on y_train: 690
Number of target categories on y_test: 68


# 3. Train XGBoost model

## 3.1. Grid search to find best hyper-parameter

In [13]:
%time
xgb_model = xgboost.XGBClassifier({'learning_rate': 0.1})
xgb_model.fit(X_train, y_train)

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 16.7 µs




## 3.3. Evaluation on test set

In [22]:
def calculate_top_k_accuracy(model, X_test, y_test, top_k=10):
    """Return the share of samples whose true label is among the top-k predictions.

    Args:
        model: Fitted classifier exposing ``predict_proba``. If it also exposes
            ``classes_``, that array is used to translate probability columns
            into labels; otherwise column ``j`` is taken to mean label ``j``.
        X_test: Features, passed unchanged to ``model.predict_proba``.
        y_test: True labels, one per probability row (any array-like).
        top_k: Number of highest-probability classes counted as a hit.
            Must be at least 1.

    Returns:
        The top-k accuracy as a float (``nan`` when ``y_test`` is empty).

    Raises:
        ValueError: If ``top_k`` is smaller than 1, or if the number of labels
            differs from the number of probability rows.
    """
    if top_k < 1:
        # A `-0:` slice would select every column and report 100% accuracy.
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    # Positional view of the labels, so a pandas Series with a non-default
    # index behaves the same as a plain array or list.
    y_true = np.asarray(y_test).ravel()
    if y_true.size == 0:
        return float("nan")

    y_pred_proba = np.asarray(model.predict_proba(X_test))
    if len(y_pred_proba) != len(y_true):
        raise ValueError(
            f"got {len(y_true)} labels for {len(y_pred_proba)} prediction rows"
        )

    # Column indices of the k largest probabilities in each row.
    top_k_columns = np.argsort(y_pred_proba, axis=1)[:, -top_k:]

    # Probability columns follow the order of model.classes_, which only
    # coincides with the label values when labels are exactly 0..n_classes-1.
    classes = getattr(model, "classes_", None)
    if classes is None:
        top_k_labels = top_k_columns
    else:
        top_k_labels = np.asarray(classes)[top_k_columns]

    hits = (top_k_labels == y_true[:, None]).any(axis=1)
    return float(hits.mean())

In [23]:
# Evaluation
# Top-1 accuracy: share of held-out samples whose predicted class equals the true class.
y_test_pred = xgb_model.predict(X_test)
acc = accuracy_score(y_test, y_test_pred)
print(f"Accuracy on test set: {acc}")

Accuracy on test set: 0.4647887323943662


In [24]:
# Top-k accuracy on the held-out split.
# Note: this uses a local top_k of 10, not the TOP_K constant (5) from the config cell.
top_k = 10
top_k_accuracy = calculate_top_k_accuracy(xgb_model, X_test, y_test, top_k=top_k)
print(f"Top {top_k} accuracy: {top_k_accuracy}")

Top 10 accuracy: 0.6901408450704225


# 4. Evaluation

## 4.1. Get prediction

In [40]:
# Qualitative check on one random sample.
# NOTE(review): idx is drawn from the full dataset, of which 99% was used for
# training, so a match here mostly shows memorisation rather than generalisation.
idx = np.random.randint(0, len(list_name))

true_name = list_name[idx]
true_unicode = list_unicode[idx]

print(f"True name: {true_name}")
print(f"True unicode: {convert_unicode_2_emoji(true_unicode)}")

# get_prediction_emotion comes from the star-imported utils modules; it receives
# the model, the raw text, the GloVe embeddings and the fitted label encoder.
y_pred_emotion = get_prediction_emotion(xgb_model, true_name, glove_embed, label_encoder)
print(f"predicted emotion: {y_pred_emotion}")

True name: fox look
True unicode: 😖
predicted emotion: 😖


## 4.2. Top-k prediction

Since different emotions can be very similar (e.g. happy face vs. smile face), it is reasonable to return the `top_k` predictions instead of a single one.

In [59]:
# Pick a random sample and print the TOP_K predictions returned for it.
idx = np.random.randint(len(list_name))
true_name, true_unicode = list_name[idx], list_unicode[idx]

print(f"True name: {true_name}")
print(f"True unicode: {convert_unicode_2_emoji(true_unicode)}")

top_predictions = get_top_k_prediction(xgb_model, true_name, TOP_K, glove_embed, label_encoder)
for predicted_emotion in top_predictions:
    print(predicted_emotion)

True name: check nasdaq
True unicode: 📇
📇
📠
🏁
☑
📑


# 5. Save model

In this section, we will save the trained model, including:
- Trained xgboost model.
- label_encoder.

In [52]:
# Save label encoder
# Save label encoder
# The encoder is needed at inference time to turn predicted class ids back into emoji codes.
# joblib.dump returns the list of written file names, which is shown as the cell output.
path_label_encoder_output_file = os.path.join(PATH_FOLDER_MODEL, "label_encoder.joblib")
joblib.dump(label_encoder, path_label_encoder_output_file)

['models/label_encoder.joblib']

In [53]:
# Save trained XGBoost
# Save trained XGBoost
# The fitted classifier is written to PATH_FOLDER_MODEL; joblib.dump returns the
# list of written file names, which is shown as the cell output.
path_xgboost_file = os.path.join(PATH_FOLDER_MODEL, "xgboost_model.joblib")
joblib.dump(xgb_model, path_xgboost_file)

['models/xgboost_model.joblib']