In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, median_absolute_error, accuracy_score, confusion_matrix

In [None]:
# Data preparation: load the issue dataset, turn story points into class
# indices, split into train/validation/test, and convert the text fields into
# fixed-length integer sequences for the LSTM.

# Story-point value -> contiguous class index (0..4), the label format expected
# by sparse_categorical_crossentropy.
storypoint_mapping = {
    1: 0,
    2: 1,
    3: 2,
    5: 3,
    8: 4
}

df = pd.read_csv("/content/Dataset_with_noCols.csv")
# Keep only rows whose story point has a class index. This removes 13 (as
# before) and also any other unmapped value, which .map() would otherwise turn
# into a NaN label *after* dropna() has already run.
df = df[df["storypoint"].isin(list(storypoint_mapping))]
# .copy() so the column assignment below works on an independent frame rather
# than on a slice of the raw data (avoids SettingWithCopyWarning).
df = df.dropna().copy()
df['storypoint'] = df['storypoint'].map(storypoint_mapping).astype(int)

# 80% train, 10% validation, 10% test (the 20% hold-out is halved).
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

# Fit the encoder ONCE, on the full set of known class indices, and only
# transform each split. Re-fitting per split (fit_transform on val/test) gives
# each split its own encoding, so labels stop being comparable as soon as one
# split is missing a class, and it leaves the encoder fitted on the test split.
label_encoder = LabelEncoder()
label_encoder.fit(sorted(storypoint_mapping.values()))
for split_df in (train_df, val_df, test_df):
    split_df['storypoint'] = label_encoder.transform(split_df['storypoint'])

# Tokenize the text fields (title and description).
# The vocabulary is learned from the training split only, so no information
# from validation/test text leaks into the tokenizer.
train_texts = train_df['title'] + ' ' + train_df['description']
val_texts = val_df['title'] + ' ' + val_df['description']
test_texts = test_df['title'] + ' ' + test_df['description']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad the sequences to a fixed length
max_sequence_length = 512  # adjust as needed
train_data = pad_sequences(train_sequences, maxlen=max_sequence_length)
val_data = pad_sequences(val_sequences, maxlen=max_sequence_length)
test_data = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Prepare the target labels
# num_classes is always the number of mapped story-point values, independent
# of which classes happen to land in a particular split.
num_classes = len(label_encoder.classes_)
train_labels = train_df['storypoint']
val_labels = val_df['storypoint']
test_labels = test_df['storypoint']

In [None]:
# LSTM text classifier: embedding -> single LSTM layer (regularized with input
# and recurrent dropout) -> softmax over the story-point classes.
model = Sequential([
    # Vocabulary size is word_index + 1 to leave room for index 0, the value
    # pad_sequences fills with by default.
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=100,
        input_length=max_sequence_length,
    ),
    LSTM(units=128, dropout=0.5, recurrent_dropout=0.5),
    Dense(units=num_classes, activation='softmax'),
])

# Integer class labels are used directly, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training hyperparameters
batch_size = 128  # adjust as needed
epochs = 25  # adjust as needed

# Stop once validation loss has not improved for 3 consecutive epochs and roll
# the model back to the weights of the best epoch.
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
    ),
]

# Fit on the training split, monitoring the validation split each epoch.
model.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_data, val_labels),
    callbacks=callbacks,
)

In [None]:
# Evaluation on the held-out test split: Keras loss/accuracy first, then
# regression-style (MAE / MdAE) and classification metrics.
test_loss, test_acc = model.evaluate(test_data, test_labels, verbose=0)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

# Predict story points for the test set: class probabilities -> most likely class
test_probabilities = model.predict(test_data)
test_predictions_classes = np.argmax(test_probabilities, axis=1)

# Convert encoded labels back to their original story-point values.
# label_encoder.inverse_transform only undoes the LabelEncoder step and yields
# the indices from storypoint_mapping, so the mapping must be inverted as well.
# Otherwise MAE/MdAE are measured in class-index units, not story points.
index_to_storypoint = {class_index: points for points, class_index in storypoint_mapping.items()}
storypoint_values = sorted(storypoint_mapping)


def decode_storypoints(encoded_labels):
    """Map encoded class labels back to the story-point values they stand for."""
    mapped_indices = label_encoder.inverse_transform(np.asarray(encoded_labels))
    return np.array([index_to_storypoint[int(index)] for index in mapped_indices])


test_true = decode_storypoints(test_df['storypoint'])
test_predictions = decode_storypoints(test_predictions_classes)

# Calculate evaluation metrics (errors are expressed in story points)
test_mae = mean_absolute_error(test_true, test_predictions)
test_mdae = median_absolute_error(test_true, test_predictions)
test_accuracy = accuracy_score(test_true, test_predictions)
# Explicit labels keep the matrix at one row/column per story-point value, in
# ascending order, even if a class never occurs in this split.
confusion_mat = confusion_matrix(test_true, test_predictions, labels=storypoint_values)

# Per-class accuracy = correctly predicted samples of a class / samples of that
# class (diagonal over row sums). accuracy_score(normalize=False) only returns
# the total count of correct predictions, which is not a class-wise figure.
class_support = confusion_mat.sum(axis=1)
classwise_accuracy = np.divide(
    confusion_mat.diagonal(),
    class_support,
    out=np.full(len(storypoint_values), np.nan),  # NaN for classes absent from the test split
    where=class_support > 0,
)

print('Test MAE:', test_mae)
print('Test MdAE:', test_mdae)
print('Test overall Accuracy:', test_accuracy)
print('Test class-wise Accuracy:', dict(zip(storypoint_values, np.round(classwise_accuracy, 3).tolist())))
print('Confusion Matrix:\n', confusion_mat)