In [1]:
import ast

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed, to_categorical


In [None]:
# Load the full dataset from a CSV file resolved relative to the kernel's
# working directory. Later cells read the label column "targe" and one
# embedding column per experiment from this frame.
df = pd.read_csv("arabic_dataset_all_embeddings_3d.csv")
# Bare last expression: shows the first rows as the cell output.
df.head()


In [None]:
# Raw class labels, kept separately because train_test_split stratifies on
# them in the experiment cells.
# NOTE(review): "targe" looks like it could be a typo for "target" — confirm
# against the CSV header before changing it.
y = df["targe"].values
# One-hot encode into 5 columns to match the categorical_crossentropy loss.
# Assumes labels are integers in 0..4 — TODO confirm. The 5 here must stay in
# sync with the width of the softmax layer created by build_lstm.
y_cat = to_categorical(y, num_classes=5)


In [None]:
def load_embedding(col, frame=None):
    """Decode a column of stringified Python literals into one stacked array.

    Every cell of the column is parsed with ``ast.literal_eval`` (which only
    accepts literals, never arbitrary expressions) and the parsed values are
    stacked along a new leading axis, one entry per row.

    Parameters
    ----------
    col : str
        Name of the column to decode.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which is what every existing call relies on.

    Returns
    -------
    numpy.ndarray
        Array whose first axis indexes rows; the remaining axes are the
        shape of a single parsed cell.

    Raises
    ------
    KeyError
        If ``col`` is not a column of the frame.
    ValueError, SyntaxError
        If a cell is not a valid Python literal (raised by
        ``ast.literal_eval``).
    ValueError
        If the column is empty or the parsed cells do not all share one
        shape (raised by ``np.stack``).
    """
    # Fall back to the global only when no frame was passed, so the helper can
    # also be exercised on a small hand-built frame.
    source = df if frame is None else frame
    return np.stack([ast.literal_eval(cell) for cell in source[col]])


In [None]:
def build_lstm(input_shape, num_classes=5, units=128, dropout=0.3, learning_rate=0.001):
    """Build and compile a single-layer LSTM softmax classifier.

    Architecture: LSTM -> Dropout -> Dense(softmax), compiled with Adam,
    categorical cross-entropy loss and an accuracy metric. The defaults
    reproduce the configuration used by every experiment cell in this
    notebook, so ``build_lstm(shape)`` behaves as before.

    Parameters
    ----------
    input_shape : tuple of int
        Shape of one sample without the batch axis, i.e.
        ``(timesteps, features)`` as expected by the Keras LSTM layer.
    num_classes : int, default 5
        Width of the softmax output. Must match the number of columns in the
        one-hot targets passed to ``fit``.
    units : int, default 128
        Size of the LSTM hidden state.
    dropout : float, default 0.3
        Drop rate of the Dropout layer between the LSTM and the classifier.
    learning_rate : float, default 0.001
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    tensorflow.keras.Model
        The compiled, untrained model.
    """
    model = Sequential()
    # Only the final hidden state is returned (return_sequences is left at its
    # default), so the Dense layer sees one vector per sample.
    model.add(LSTM(units, input_shape=input_shape))
    model.add(Dropout(dropout))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        # Targets are one-hot vectors, hence the non-sparse loss.
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


In [None]:
# Experiment: "tfidf_isri" column (reported as "TF-IDF + ISRI").
X = load_embedding("tfidf_isri")
# Insert a length-1 axis at position 1 so each row becomes a one-step sequence
# for the LSTM. Assumes load_embedding returned a 2-D (rows, features) array —
# TODO confirm against the CSV.
X = X[:, np.newaxis, :]

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
tfidf_isri_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "tfidf_light" column (reported as "TF-IDF + Light").
X = load_embedding("tfidf_light")
# Insert a length-1 axis at position 1 so each row becomes a one-step sequence
# for the LSTM. Assumes load_embedding returned a 2-D (rows, features) array —
# TODO confirm against the CSV.
X = X[:, np.newaxis, :]

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
tfidf_light_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "tfidf_snowball" column (reported as "TF-IDF + Snowball").
X = load_embedding("tfidf_snowball")
# Insert a length-1 axis at position 1 so each row becomes a one-step sequence
# for the LSTM. Assumes load_embedding returned a 2-D (rows, features) array —
# TODO confirm against the CSV.
X = X[:, np.newaxis, :]

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
tfidf_snowball_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "fasttext_isri" column (reported as "FastText + ISRI").
# Used exactly as loaded: X.shape[1] and X.shape[2] are read below, so the
# column must already decode to an array with at least three axes.
X = load_embedding("fasttext_isri")

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
fasttext_isri_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "fasttext_light" column (reported as "FastText + Light").
# Used exactly as loaded: X.shape[1] and X.shape[2] are read below, so the
# column must already decode to an array with at least three axes.
X = load_embedding("fasttext_light")

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
fasttext_light_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "fasttext_snowball" column (reported as "FastText + Snowball").
# Used exactly as loaded: X.shape[1] and X.shape[2] are read below, so the
# column must already decode to an array with at least three axes.
X = load_embedding("fasttext_snowball")

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
fasttext_snowball_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "bert_isri" column (reported as "BERT + ISRI").
# Used exactly as loaded: X.shape[1] and X.shape[2] are read below, so the
# column must already decode to an array with at least three axes.
X = load_embedding("bert_isri")

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

# NOTE(review): batch_size is 32 here but 64 in the TF-IDF and FastText cells,
# so the BERT runs are not trained under identical settings — presumably a
# memory concession; confirm it is intentional.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
bert_isri_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "bert_light" column (reported as "BERT + Light").
# Used exactly as loaded: X.shape[1] and X.shape[2] are read below, so the
# column must already decode to an array with at least three axes.
X = load_embedding("bert_light")

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

# NOTE(review): batch_size is 32 here but 64 in the TF-IDF and FastText cells,
# so the BERT runs are not trained under identical settings — presumably a
# memory concession; confirm it is intentional.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
bert_light_acc = history.history["val_accuracy"][-1]


In [None]:
# Experiment: "bert_snowball" column (reported as "BERT + Snowball").
# Used exactly as loaded: X.shape[1] and X.shape[2] are read below, so the
# column must already decode to an array with at least three axes.
X = load_embedding("bert_snowball")

# 80/20 split with a fixed seed. Stratify on the integer labels `y`; the split
# itself is applied to the one-hot targets `y_cat`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y
)

# Reseed Python, NumPy and TensorFlow before every model so weight
# initialisation, dropout masks and batch shuffling do not depend on which
# cells ran earlier in this kernel. Some GPU ops may still be non-deterministic.
set_random_seed(42)
model = build_lstm((X.shape[1], X.shape[2]))

# NOTE(review): batch_size is 32 here but 64 in the TF-IDF and FastText cells,
# so the BERT runs are not trained under identical settings — presumably a
# memory concession; confirm it is intentional.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    verbose=1
)

# The held-out split is used as validation data, so this is the accuracy on
# X_test after the final epoch.
bert_snowball_acc = history.history["val_accuracy"][-1]


In [None]:
# Collect the final-epoch validation accuracy of every experiment, one
# (label, accuracy) row per run, in the order the experiments were executed.
results = pd.DataFrame(
    [
        ("TF-IDF + ISRI", tfidf_isri_acc),
        ("TF-IDF + Light", tfidf_light_acc),
        ("TF-IDF + Snowball", tfidf_snowball_acc),
        ("FastText + ISRI", fasttext_isri_acc),
        ("FastText + Light", fasttext_light_acc),
        ("FastText + Snowball", fasttext_snowball_acc),
        ("BERT + ISRI", bert_isri_acc),
        ("BERT + Light", bert_light_acc),
        ("BERT + Snowball", bert_snowball_acc),
    ],
    columns=["Experiment", "Accuracy"],
)

# Displayed best-first; `results` itself keeps the execution order because
# sort_values returns a new frame.
results.sort_values(by="Accuracy", ascending=False)
