In [None]:
pip install nltk gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# =========================
# STAGE 0: IMPORTS & DATA LOADING
# =========================
import os
import re
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

from tensorflow.keras.layers import (
    Input, Embedding, Dense, Add, LayerNormalization,
    MultiHeadAttention, Concatenate, GlobalAveragePooling1D
)
from tensorflow.keras.models import Model


nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')

stop_english = set(stopwords.words("english"))

df = pd.read_csv("/content/aa_dataset-tickets-multi-lang-5-2-50-version.csv")
df.drop("tag_8", axis=1, inplace=True)
df = df[df["language"] == "en"]

df[["tag_1","tag_2","tag_3","tag_4","tag_5","tag_6","tag_7"]] = \
df[["tag_1","tag_2","tag_3","tag_4","tag_5","tag_6","tag_7"]].fillna("UNKNOWN")

df["text"] = df["subject"].fillna("") + " " + df["body"].fillna("")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# =========================
# STAGE 1: TEXT CLEANING
# =========================
def clean_text(t):
    if pd.isna(t):
        return ""
    t = t.lower()
    tokens = word_tokenize(t)
    tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
    t = " ".join(tokens)
    t = re.sub(r"<.*?>", " ", t)
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    t = re.sub(r"\S+@\S+", " ", t)
    t = re.sub(r"[^a-z\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["cleaned_text"] = df["text"].apply(clean_text)


In [None]:
# =========================
# STAGE 2: LABEL ENCODING
# =========================
le_type = LabelEncoder()
le_queue = LabelEncoder()

df["type_encoded"] = le_type.fit_transform(df["type"])
df["queue_encoded"] = le_queue.fit_transform(df["queue"])

y_type = to_categorical(df["type_encoded"].values)
y_queue = to_categorical(df["queue_encoded"].values)

tags_cols = ["tag_1","tag_2","tag_3","tag_4","tag_5","tag_6","tag_7"]

def clean_tags(row):
    return [str(t) for t in row if str(t).lower() != "unknown"]

df["tags_combined"] = df[tags_cols].apply(clean_tags, axis=1)

mlb = MultiLabelBinarizer()
y_tags = mlb.fit_transform(df["tags_combined"])


In [None]:
# =========================
# STAGE 3: TOKENIZATION
# =========================
MAX_NUM_WORDS = 20000
MAX_SEQ_LEN = 200
EMBEDDING_DIM = 300

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned_text"])

sequences = tokenizer.texts_to_sequences(df["cleaned_text"])
X = pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")

word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)


In [None]:
# =========================
# STAGE 4: WORD2VEC EMBEDDINGS
# =========================
sentences = [t.split() for t in df["cleaned_text"]]

w2v_model = Word2Vec(
    sentences,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=2,
    workers=4
)

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, idx in word_index.items():
    if idx < num_words and word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]


In [None]:
# =========================
# STAGE 5: TRAIN / TEST SPLIT
# =========================
X_train, X_test, y_type_train, y_type_test, y_queue_train, y_queue_test, y_tags_train, y_tags_test = train_test_split(
    X, y_type, y_queue, y_tags, test_size=0.2, random_state=42
)


In [None]:
# =========================
# STAGE 6: POSITIONAL ENCODING
# =========================
def positional_encoding(length, depth):
    depth = depth // 2
    positions = np.arange(length)[:, None]
    depths = np.arange(depth)[None, :] / depth
    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(pos_encoding[None, ...], tf.float32)

pos_enc = positional_encoding(MAX_SEQ_LEN, EMBEDDING_DIM)


In [None]:
# =========================
# STAGE 7: TRANSFORMER ENCODER
# =========================
ENC_LAYERS = 2
NUM_HEADS = 4
FF_DIM = 512

input_seq = Input(shape=(MAX_SEQ_LEN,), name="input_seq")

x = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False
)(input_seq)

x = Add()([x, pos_enc])

for _ in range(ENC_LAYERS):
    attn = MultiHeadAttention(
        num_heads=NUM_HEADS,
        key_dim=EMBEDDING_DIM // NUM_HEADS
    )(x, x)
    x = LayerNormalization()(Add()([x, attn]))

    ff = Dense(FF_DIM, activation="relu")(x)
    ff = Dense(EMBEDDING_DIM)(ff)
    x = LayerNormalization()(Add()([x, ff]))

encoder_out = x   # (batch, seq_len, embed_dim)

encoder_out = GlobalAveragePooling1D(name="encoder_pooling")(encoder_out)

# =========================
# STAGE 8: MULTI-TASK OUTPUT HEADS
# =========================
type_branch = Dense(256, activation="relu")(encoder_out)
type_out = Dense(y_type.shape[1], activation="softmax", name="type_out")(type_branch)

queue_input = Concatenate()([encoder_out, type_out])
queue_branch = Dense(256, activation="relu")(queue_input)
queue_out = Dense(y_queue.shape[1], activation="softmax", name="queue_out")(queue_branch)

tags_input = Concatenate()([encoder_out, queue_out])
tags_branch = Dense(256, activation="relu")(tags_input)
tags_out = Dense(y_tags.shape[1], activation="sigmoid", name="tags_out")(tags_branch)

# =========================
# STAGE 9: BUILD & TRAIN
# =========================
model = Model(
    inputs=input_seq,
    outputs=[type_out, queue_out, tags_out]
)

model.summary()

In [None]:
model.compile(
    optimizer= 'adam',#tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss={
        'type_out': 'categorical_crossentropy',
        'queue_out': 'categorical_crossentropy',
        'tags_out': 'binary_crossentropy'
    },
  # loss_weights={'type_out': 1.0, 'queue_out': 1.0, 'tags_out': 1.0},

)

history = model.fit(
    X_train,
    {
        "type_out": y_type_train,
        "queue_out": y_queue_train,
        "tags_out": y_tags_train
    },
    validation_split=0.1,
    epochs=2,
    batch_size=32
)

Epoch 1/2
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 59ms/step - loss: 2.2542 - queue_out_loss: 1.7655 - tags_out_loss: 0.0129 - type_out_loss: 0.4758 - val_loss: 2.2199 - val_queue_out_loss: 1.7565 - val_tags_out_loss: 0.0129 - val_type_out_loss: 0.4514
Epoch 2/2
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 31ms/step - loss: 2.2021 - queue_out_loss: 1.7471 - tags_out_loss: 0.0127 - type_out_loss: 0.4424 - val_loss: 2.2088 - val_queue_out_loss: 1.7392 - val_tags_out_loss: 0.0132 - val_type_out_loss: 0.4572


In [None]:
metrics=  {
        'type_out': ['accuracy'],
        'queue_out': ['accuracy'],
        'tags_out': ['accuracy']# [tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall')]
    }

In [None]:
import keras
keras.models.save_model(model,'/content/model_ticket.keras')

In [None]:
import pickle

with open('/content/tokenizer_ticket.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open("/content/le_type.pkl", "wb") as f:
    pickle.dump(le_type, f)

with open("/content/le_queue.pkl", "wb") as f:
    pickle.dump(le_queue, f)

with open("/content/mlb.pkl", "wb") as f:
    pickle.dump(mlb, f)


In [None]:
keras.models.save_model(model,'/content/model_ticket1.h5')



In [None]:
# ===============================
# STAGE 11: TEST / INFERENCE CODE
# ===============================
def preprocess_single_text(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    seq = pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
    return seq

def predict_ticket(text):
    X_in = preprocess_single_text(text)

    type_p, queue_p, tags_p = model.predict(X_in)

    type_label = le_type.inverse_transform([np.argmax(type_p)])[0]
    queue_label = le_queue.inverse_transform([np.argmax(queue_p)])[0]

    tag_indices = np.where(tags_p[0] > 0.5)[0]
    tag_labels = mlb.classes_[tag_indices]

    return {
        "predicted_type": type_label,
        "predicted_queue": queue_label,
        "predicted_tags": list(tag_labels),
        "type_confidence": float(np.max(type_p)),
        "queue_confidence": float(np.max(queue_p))
    }


In [None]:
# =========================
# STAGE 12: SAMPLE TEST
# =========================
sample_text = """
Unable to access my account after password reset.
The system throws authentication error.
"""

result = predict_ticket(sample_text)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
{'predicted_type': 'Incident', 'predicted_queue': 'Technical Support', 'predicted_tags': ['IT', 'Performance', 'Tech Support'], 'type_confidence': 0.6518449783325195, 'queue_confidence': 0.4231639802455902}


In [38]:
model

<Functional name=functional_6, built=True>