In [1]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): presumably done to keep cell outputs readable, but it also hides
# deprecation and data-quality warnings from the libraries used below —
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [2]:
import pickle

import numpy as np
import pandas as pd

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Load Dataset

In [3]:
# Fetch the customer-support ticket corpus through `datasets.load_dataset`.
ds = load_dataset("Tobi-Bueck/customer-support-tickets")

# Materialise the "train" split as a DataFrame, keep only the ticket text
# ("body") and its target label ("queue"), and drop rows missing either one.
df = (
    pd.DataFrame(ds["train"])
    .loc[:, ["body", "queue"]]
    .dropna()
)
df.head()

README.md: 0.00B [00:00, ?B/s]

aa_dataset-tickets-multi-lang-5-2-50-ver(…):   0%|          | 0.00/26.0M [00:00<?, ?B/s]

(…)set-tickets-german_normalized_50_5_2.csv: 0.00B [00:00, ?B/s]

dataset-tickets-multi-lang-4-20k.csv:   0%|          | 0.00/18.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61765 [00:00<?, ? examples/s]

Unnamed: 0,body,queue
0,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Technical Support
1,"Dear Customer Support Team,\n\nI am writing to...",Technical Support
2,"Dear Customer Support Team,\n\nI hope this mes...",Returns and Exchanges
3,"Dear Customer Support Team,\n\nI hope this mes...",Billing and Payments
4,"Dear Support Team,\n\nI hope this message reac...",Sales and Pre-Sales


Train / Validation / Test Split

In [4]:
# Step 1: hold out 20% of all tickets as the test set. Stratifying on the queue
# label keeps the class mix approximately equal on both sides of the split.
train_df, test_df = train_test_split(
    df, stratify=df["queue"], test_size=0.2, random_state=42
)

# Step 2: carve 10% of the remaining rows off as the validation set
# (same seed, again stratified by queue).
train_df, val_df = train_test_split(
    train_df, stratify=train_df["queue"], test_size=0.1, random_state=42
)

# Row counts of the (train, validation, test) splits.
tuple(map(len, (train_df, val_df, test_df)))

(44469, 4941, 12353)

Text Cleaning

In [5]:
def clean_text(text: str) -> str:
    """Lower-case a ticket body and normalise all of its whitespace.

    Every run of whitespace characters (spaces, tabs, line feeds, carriage
    returns, ...) is collapsed into a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: Raw ticket text. Must already be a ``str``.

    Returns:
        The lower-cased text with words separated by exactly one space.
    """
    # `str.split()` with no argument splits on any whitespace run and discards
    # empty pieces. This matters for Windows line endings: a carriage return is
    # not among the Keras Tokenizer's default filter characters, so a leftover
    # one would stay glued to the neighbouring word and create spurious tokens.
    return " ".join(text.lower().split())
# Run the same cleaning over the ticket bodies of every split; the cast to `str`
# guarantees `clean_text` always receives a string.
train_texts, val_texts, test_texts = (
    split["body"].astype(str).apply(clean_text)
    for split in (train_df, val_df, test_df)
)

Tokenization

In [6]:
# Tokenisation hyper-parameters.
MAX_WORDS = 20_000  # vocabulary size handed to the tokenizer
MAX_LEN = 200       # sequence length (based on Notebook 01 analysis)

# Word-level tokenizer limited to MAX_WORDS, with "<OOV>" registered as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)

In [7]:
# Build the vocabulary from the training texts only; the validation and test
# texts are never passed to the fit, so they do not influence the word index.
tokenizer.fit_on_texts(train_texts)


In [8]:
# Encode each split's texts as lists of integer word IDs using the fitted tokenizer.
X_train_seq, X_val_seq, X_test_seq = map(
    tokenizer.texts_to_sequences,
    (train_texts, val_texts, test_texts),
)

Padding & Truncation

In [9]:
# Bring every sequence to exactly MAX_LEN IDs with identical settings for all
# three splits: shorter sequences are padded at the end, longer ones are cut
# at the end.
X_train, X_val, X_test = [
    pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
]

# Resulting array shapes for the (train, validation, test) splits.
X_train.shape, X_val.shape, X_test.shape


((44469, 200), (4941, 200), (12353, 200))

*Padding and truncation bring every input sequence to the same length (`MAX_LEN` tokens), enabling batch processing in LSTM networks.*

Label Encoding

In [10]:
label_encoder = LabelEncoder()

# Learn the queue-name -> integer mapping from the training labels and encode
# them in the same call.
y_train = label_encoder.fit_transform(train_df["queue"])

# Validation and test labels are only transformed with the mapping learned above.
y_val, y_test = (
    label_encoder.transform(frame["queue"]) for frame in (val_df, test_df)
)

In [11]:
# Show the queue names the encoder learned from the training labels.
label_encoder.classes_

array(['Arts & Entertainment/Movies', 'Arts & Entertainment/Music',
       'Autos & Vehicles/Maintenance', 'Autos & Vehicles/Sales',
       'Beauty & Fitness/Cosmetics', 'Beauty & Fitness/Fitness Training',
       'Billing and Payments', 'Books & Literature/Fiction',
       'Books & Literature/Non-Fiction',
       'Business & Industrial/Manufacturing', 'Customer Service',
       'Finance/Investments', 'Finance/Personal Finance',
       'Food & Drink/Groceries', 'Food & Drink/Restaurants', 'Games',
       'General Inquiry', 'Health/Medical Services',
       'Health/Mental Health', 'Hobbies & Leisure/Collectibles',
       'Hobbies & Leisure/Crafts', 'Home & Garden/Home Improvement',
       'Home & Garden/Landscaping', 'Human Resources',
       'IT & Technology/Hardware Support',
       'IT & Technology/Network Infrastructure',
       'IT & Technology/Security Operations',
       'IT & Technology/Software Development', 'IT Support',
       'Jobs & Education/Online Courses', 'Jobs & Educat

In [12]:
# Number of distinct queue labels known to the encoder.
num_classes = label_encoder.classes_.shape[0]
num_classes

52

*Label encoding converts categorical queue labels into numeric class IDs for classification.*

Save Preprocessing

In [13]:
# Persist the fitted tokenizer and label encoder to the working directory,
# presumably so later notebooks / inference code can reuse the exact same
# vocabulary and label mapping.
# `pickle` comes from the notebook's import cell rather than a mid-notebook import.
# NOTE(review): unpickling executes arbitrary code — only load these files
# from a trusted location.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)


In [14]:
# Sanity check on the first training example: cleaned text (first 200 chars),
# the first 20 encoded IDs, the integer label, and the label decoded back to
# its queue name.
print(f"Sample text: {train_texts.iloc[0][:200]}")
print(f"Encoded sequence: {X_train[0][:20]}")
print(f"Label: {y_train[0]}")
print(f"Decoded label: {label_encoder.inverse_transform([y_train[0]])}")


Sample text: could you please provide the documentation for the datarobot bigcommerce integration?
Encoded sequence: [  37   14   66   54    2  487   21    2 1864 3641   67    0    0    0
    0    0    0    0    0    0]
Label: 49
Decoded label: ['Technical Support']
