**Imports**

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit
import gc

**Load the Dataset**

In [None]:
# Read the preprocessed dataset from CSV into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="dataset_after_preprocessing.csv")

**Extract Labels and Summaries**

In [None]:
# Pull the target column ('Assignee') and the text column ('Summary_Stemmed')
# out of the DataFrame via `.values`, in that order.
labels, summaries = (
    dataset[column].values for column in ('Assignee', 'Summary_Stemmed')
)

In [None]:
# Remove the `dataset` name from the namespace; later cells work only with
# the arrays that were extracted from it.
del dataset

**Encode the Labels**

In [None]:
# Map every distinct label to an integer class id, then expand those ids
# into one-hot rows for use with a categorical cross-entropy loss.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)
categorical_labels = to_categorical(encoded_labels)


In [None]:
# Remove the raw `labels` name; the encoded forms are what later cells use.
del labels

**Tokenize and Pad Text Data**

In [None]:
# Fit a tokenizer on the summaries and turn each text into a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(summaries)
sequences = tokenizer.texts_to_sequences(summaries)

# One more than the number of distinct words: Keras' Tokenizer numbers words
# from 1, leaving index 0 free for padding.
vocab_size = len(tokenizer.word_index) + 1

# Pad every sequence (at the end) up to the length of the longest one.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
)


In [None]:
# Remove the raw texts and the unpadded index lists; only the padded matrix is used from here on.
del summaries, sequences

**Split Data into Training, Validation, and Test Sets**

In [None]:
# Stratified 80/20 hold-out split, stratifying on the integer class ids.
# With n_splits=1 the splitter yields exactly one (train, test) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(sss.split(padded_sequences, encoded_labels))

X_train, y_train = padded_sequences[train_index], categorical_labels[train_index]
X_test, y_test = padded_sequences[test_index], categorical_labels[test_index]

# Carve 10% of the training portion off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Remove the splitter object, then trigger a garbage-collection pass.
# gc.collect() is the cell's last expression, so its return value is displayed.
del sss
gc.collect()

0

In [None]:
# Drop the references to the full (unsplit) arrays now that the
# train/validation/test pieces exist, then run the garbage collector.
del padded_sequences
del encoded_labels
del categorical_labels
gc.collect()

0

**Build the LSTM Model**

In [None]:
# Build the LSTM model: Embedding -> LSTM -> softmax classifier.
embedding_dim = 100

model = Sequential([
    # mask_zero=True: the inputs are post-padded with index 0, so without a
    # mask the LSTM would keep stepping over the padding and its final state
    # (the only thing the Dense layer sees) would be dominated by pad steps.
    # Index 0 is never a real word here because vocab_size reserves it.
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
        mask_zero=True,
    ),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per class (width of the one-hot label rows).
    Dense(units=y_train.shape[1], activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3762, 100)         102752800 
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 2932)              378228    
                                                                 
Total params: 103248276 (393.86 MB)
Trainable params: 103248276 (393.86 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Train the LSTM Model**

In [None]:
# Train the model
epochs = 10
batch_size = 32

# Validate on the explicit hold-out (X_val, y_val) created during the split.
# Using validation_split here instead would carve a *second* validation slice
# out of X_train and leave X_val / y_val unused.
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    verbose=2,
)

In [None]:
# Clear Keras' global state and run the garbage collector.
# The backend is reached through the already-imported `tf` alias; the bare
# name `K` is never imported in this notebook and would raise NameError.
# NOTE(review): this runs before the evaluation cell; confirm the trained
# model remains usable after clearing on your TF version, or move this cell
# to the end of the notebook.
tf.keras.backend.clear_session()
gc.collect()

In [None]:
# Score the trained model on the held-out test split and report accuracy as a percentage.
loss, accuracy = model.evaluate(
    x=X_test,
    y=y_test,
    verbose=2,
)
print(f'Test Accuracy: {accuracy * 100:.2f}%')