In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install -q datasets numpy pandas scikit-learn tensorflow

In [3]:
import numpy as np
import pandas as pd
import pickle

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [4]:
tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Load Dataset

In [5]:
ds = load_dataset("Tobi-Bueck/customer-support-tickets")
df = pd.DataFrame(ds["train"])
df = df[["body", "queue"]].dropna()

README.md: 0.00B [00:00, ?B/s]

aa_dataset-tickets-multi-lang-5-2-50-ver(…):   0%|          | 0.00/26.0M [00:00<?, ?B/s]

(…)set-tickets-german_normalized_50_5_2.csv: 0.00B [00:00, ?B/s]

dataset-tickets-multi-lang-4-20k.csv:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61765 [00:00<?, ? examples/s]

Train / Val / Test Split

In [6]:
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["queue"]
)

train_df, val_df = train_test_split(
    train_df, test_size=0.1, random_state=42, stratify=train_df["queue"]
)

Load Tokenizer & Label Encoder

In [7]:
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

Text Cleaning + Sequencing

In [8]:
def clean_text(text):
    return text.lower().replace("\n", " ").strip()

MAX_LEN = 200

X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_df["body"].apply(clean_text)),
    maxlen=MAX_LEN, padding="post", truncating="post"
)

X_val = pad_sequences(
    tokenizer.texts_to_sequences(val_df["body"].apply(clean_text)),
    maxlen=MAX_LEN, padding="post", truncating="post"
)

X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_df["body"].apply(clean_text)),
    maxlen=MAX_LEN, padding="post", truncating="post"
)

y_train = label_encoder.transform(train_df["queue"])
y_val   = label_encoder.transform(val_df["queue"])
y_test  = label_encoder.transform(test_df["queue"])

num_classes = len(label_encoder.classes_)


Build Many-to-One LSTM Model

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input

VOCAB_SIZE = 20000
EMBED_DIM = 128
MAX_LEN = 200   # must match preprocessing

model = Sequential([
    Input(shape=(MAX_LEN,)),
    Embedding(VOCAB_SIZE, EMBED_DIM),
    LSTM(128, return_sequences=False),
    Dropout(0.4),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


Train the Model

In [10]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=2,
        restore_best_weights=True
    )
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=callbacks
)

Epoch 1/10
[1m695/695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.2152 - loss: 3.0573 - val_accuracy: 0.2297 - val_loss: 2.8610
Epoch 2/10
[1m695/695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.2279 - loss: 2.8912 - val_accuracy: 0.2297 - val_loss: 2.8573
Epoch 3/10
[1m695/695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.2300 - loss: 2.8985 - val_accuracy: 0.2297 - val_loss: 2.8564
Epoch 4/10
[1m695/695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.2310 - loss: 2.8751 - val_accuracy: 0.2297 - val_loss: 2.8571
Epoch 5/10
[1m695/695[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.2306 - loss: 2.8719 - val_accuracy: 0.2297 - val_loss: 2.8567


In [11]:
model.save("ticket_lstm_model.keras")

Quick Sanity Prediction

In [12]:
sample_text = "I am unable to login to my account"

seq = tokenizer.texts_to_sequences([sample_text.lower()])
pad = pad_sequences(seq, maxlen=200, padding="post")

pred = model.predict(pad)
predicted_queue = label_encoder.inverse_transform([np.argmax(pred)])

predicted_queue


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step


array(['Technical Support'], dtype=object)