In [22]:
import os
import time
import urllib.parse
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
class EmojiTextClassifier:
    """Sentence classifier built on averaged pre-trained word vectors.

    Workflow: ``load_dataset`` -> ``load_feature_vectors`` ->
    ``vectorize_dataset`` -> ``train`` -> ``test`` / ``predict``.

    Each sentence is represented by the mean of the vectors of its known
    words and fed to a small dense network with a softmax output.
    """

    def __init__(self):
        # token -> 1-D float64 embedding; filled by load_feature_vectors().
        self.feature_vectors = {}
        # Keras model; created by train().
        self.model = None

    @staticmethod
    def _read_split(csv_path):
        """Read one CSV file and return ``(sentences, integer_labels)`` arrays."""
        frame = pd.read_csv(csv_path)
        return np.array(frame["sentence"]), np.array(frame["label"], dtype=int)

    def load_dataset(self, train_path, test_path):
        """Load the train and test CSV files.

        Both files must contain a ``sentence`` column and a ``label`` column.

        Returns:
            ``(X_train, Y_train, X_test, Y_test)`` where the ``X`` arrays hold
            the raw sentences and the ``Y`` arrays hold the labels as ints.
        """
        X_train, Y_train = self._read_split(train_path)
        X_test, Y_test = self._read_split(test_path)
        return X_train, Y_train, X_test, Y_test

    @staticmethod
    def _fetch_and_extract(link, target_dir):
        """Download the archive at ``link`` (if not cached) and unzip it into ``target_dir``."""
        # Same file name wget would have chosen for the default link.
        archive = os.path.basename(urllib.parse.urlparse(link).path) or "glove.6B.zip"

        if not os.path.exists(archive):
            print("Downloading pre-trained model...")
            # Download to a temporary name so an interrupted transfer is never
            # mistaken for a complete archive on the next run.
            partial = archive + ".part"
            try:
                urllib.request.urlretrieve(link, partial)
                os.replace(partial, archive)
            finally:
                if os.path.exists(partial):
                    os.remove(partial)

        os.makedirs(target_dir, exist_ok=True)
        with zipfile.ZipFile(archive) as bundle:
            bundle.extractall(target_dir)

    def load_feature_vectors(self, link="https://nlp.stanford.edu/data/glove.6B.zip", name="glove.6B.50d.txt"):
        """Load word vectors from ``glove/<name>`` into ``self.feature_vectors``.

        If the vectors file is not present yet, the archive at ``link`` is
        downloaded into the working directory (unless already there) and
        extracted into ``glove/``. Only the Python standard library is used,
        so no external ``wget``/``unzip`` binaries or shell are involved.

        Args:
            link: URL of the zip archive containing the vectors file.
            name: File name of the vectors file inside the archive.

        Raises:
            FileNotFoundError: If the vectors file cannot be obtained
                (download/extraction failed or ``name`` is not in the archive).
        """
        path = f"glove/{name}"

        # Key the cache check on the file we actually need, not on the
        # directory: a half-finished earlier run may have left "glove/" empty.
        if not os.path.exists(path):
            try:
                self._fetch_and_extract(link, "glove")
            except (OSError, ValueError, zipfile.BadZipFile) as exc:
                raise FileNotFoundError(f" file '{path}' didn't find! ") from exc

        if not os.path.exists(path):
            raise FileNotFoundError(f" file '{path}' didn't find! ")

        print("Pre-trained model ✅")

        with open(path, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                # Skip blank or truncated lines instead of crashing on parts[0].
                if len(parts) < 2:
                    continue
                self.feature_vectors[parts[0]] = np.array(parts[1:], dtype=np.float64)

    def sentence_to_feature_vectors_avg(self, sentence, D=50):
        """Return the mean vector of the known words in ``sentence``.

        The sentence is lower-cased and split on whitespace; words without a
        loaded vector are ignored.

        Args:
            sentence: Text to embed. Non-string values (e.g. the NaN pandas
                produces for an empty CSV cell) are treated as an empty sentence.
            D: Dimensionality of the loaded vectors.

        Returns:
            Array of shape ``(D,)``; all zeros when no word is known.

        Raises:
            ValueError: If a loaded vector does not have shape ``(D,)``.
        """
        sum_vectors = np.zeros((D,))
        if not isinstance(sentence, str):
            return sum_vectors

        count = 0
        for word in sentence.lower().split():
            vector = self.feature_vectors.get(word)
            if vector is None:
                continue
            if np.shape(vector) != (D,):
                raise ValueError(
                    f"feature vector for '{word}' has shape {np.shape(vector)}, expected ({D},); "
                    "pass the D that matches the loaded vectors"
                )
            sum_vectors += vector
            count += 1

        return sum_vectors / count if count else sum_vectors

    def vectorize_dataset(self, X_raw, D=50):
        """Embed every sentence in ``X_raw``.

        Returns:
            Array of shape ``(len(X_raw), D)``; an empty input yields shape
            ``(0, D)`` so the result is always two-dimensional.
        """
        rows = [self.sentence_to_feature_vectors_avg(sent, D) for sent in X_raw]
        if not rows:
            return np.zeros((0, D))
        return np.array(rows)

    def train(self, X_train, Y_train, epochs=10, optimizer="adam", loss="sparse_categorical_crossentropy", metrics=("accuracy",), use_dropout=(True, 0.2), num_classes=5):
        """Build, compile and fit the dense classifier; stores it in ``self.model``.

        Args:
            X_train: 2-D array of sentence vectors; the second axis sets the input size.
            Y_train: Labels, passed to ``fit`` unchanged.
            epochs: Number of training epochs.
            optimizer, loss, metrics: Forwarded to ``Model.compile``.
            use_dropout: Sequence ``(enabled, rate)``; ``rate`` is only read
                when ``enabled`` is truthy.
            num_classes: Width of the softmax output layer.

        Returns:
            Whatever ``Model.fit`` returns (the training history).
        """
        input_dim = X_train.shape[1]

        layers = [
            tf.keras.layers.Input(shape=(input_dim,)),
            tf.keras.layers.Dense(128, activation="relu"),
        ]
        if use_dropout[0]:
            layers.append(tf.keras.layers.Dropout(use_dropout[1]))
        layers.append(tf.keras.layers.Dense(num_classes, activation="softmax"))
        self.model = tf.keras.models.Sequential(layers)

        # The default is an immutable tuple (no shared mutable default);
        # hand compile() a list, as the original default did.
        compile_metrics = list(metrics) if isinstance(metrics, tuple) else metrics
        self.model.compile(optimizer=optimizer, loss=loss, metrics=compile_metrics)
        return self.model.fit(X_train, Y_train, epochs=epochs)

    def test(self, X_test, Y_test):
        """Evaluate the trained model and return the result of ``Model.evaluate``."""
        return self.model.evaluate(X_test, Y_test)

    def predict(self, sentence, D=50):
        """Return the index of the highest-scoring class for ``sentence`` as a plain ``int``."""
        vector = self.sentence_to_feature_vectors_avg(sentence, D)
        vector = np.expand_dims(vector, axis=0)  # the model expects a batch axis
        prediction = self.model.predict(vector)
        return int(np.argmax(prediction))


In [None]:
# Training is stochastic (weight initialisation, dropout, batch shuffling).
# Seed the Python, NumPy and TensorFlow RNGs so a fresh run of this cell is repeatable.
SEED = 42
# Single source of truth for the embedding size used throughout this cell.
EMBEDDING_DIM = 50
tf.keras.utils.set_random_seed(SEED)

model_50 = EmojiTextClassifier()
X_train_raw, Y_train, X_test_raw, Y_test = model_50.load_dataset("dataset/train.csv", "dataset/test.csv")
model_50.load_feature_vectors(name=f"glove.6B.{EMBEDDING_DIM}d.txt")

X_train = model_50.vectorize_dataset(X_train_raw, D=EMBEDDING_DIM)
X_test = model_50.vectorize_dataset(X_test_raw, D=EMBEDDING_DIM)

model_50.train(X_train, Y_train, epochs=250)
# With the default metrics, evaluate() yields [loss, accuracy]; keep them instead of discarding.
test_loss, test_accuracy = model_50.test(X_test, Y_test)
print(f"test loss: {test_loss:.4f} | test accuracy: {test_accuracy:.4f}")

# Time a single prediction so the "Inference Time" column of the results table can be reproduced.
inference_start = time.perf_counter()
result_test = model_50.predict("I love programming", D=EMBEDDING_DIM)
inference_ms = (time.perf_counter() - inference_start) * 1000
print("predict:", result_test)
print(f"inference time: {inference_ms:.0f}ms")


|Feature vector dimension|Train loss|Train accuracy|Test loss|Test accuracy|Inference time|
|-------------------------|----------|--------------|---------|-------------|--------------|
|50d|0.1332|0.9647|0.3029|0.8185|138ms|
|100d|0.0339|0.9962|0.4719|0.8080|177ms|
|200d|0.0070|1.0000|0.4825|0.8423|180ms|
|300d|0.0030|1.0000|0.4492|0.8185|188ms|