# Training Models

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer

# Load the dataset
Reg_data = pd.read_csv("training_dataset.csv")

X_body = Reg_data['Body_tokens'].apply(lambda x: eval(x)).values
X_title = Reg_data['Title_tokens'].apply(lambda x: eval(x)).values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_body, X_title, test_size=0.2, random_state=42)

# Initialize Tokenizer and fit on the combined text
Reg_tokenizer = Tokenizer()
Reg_tokenizer.fit_on_texts(X_body)
Reg_tokenizer.fit_on_texts(X_title)

# Convert tokens to sequences of integers
X_train_seq = Reg_tokenizer.texts_to_sequences(X_train)
X_test_seq = Reg_tokenizer.texts_to_sequences(X_test)
y_train_seq = Reg_tokenizer.texts_to_sequences(y_train)
y_test_seq = Reg_tokenizer.texts_to_sequences(y_test)

# Pad sequences to make them of equal length
maxlen = 100  # Maximum sequence length
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post', truncating='post')
y_train_pad = pad_sequences(y_train_seq, maxlen=maxlen, padding='post', truncating='post')
y_test_pad = pad_sequences(y_test_seq, maxlen=maxlen, padding='post', truncating='post')

# Define the LSTM model architecture
embedding_dim = 100
lstm_units = 64

Reg_model = Sequential()
Reg_model.add(Embedding(input_dim=len(Reg_tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=maxlen))
Reg_model.add(LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
Reg_model.add(Dense(len(Reg_tokenizer.word_index) + 1, activation='softmax'))  # Predicting word indices

# Compile the model
Reg_model.compile(
    optimizer='adam', 
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy']
    )

# Train the model
batch_size = 32
epochs = 2
Reg_model.fit(X_train_pad, y_train_pad, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# Evaluate the model
loss, accuracy = Reg_model.evaluate(X_test_pad, y_test_pad)
print("Test Accuracy:", accuracy)





Epoch 1/2


Epoch 2/2
Test Accuracy: 0.9397284388542175


In [15]:
# Prompt the user to enter their input
user_input = input("Enter the body text: ")

user_input_seq = Reg_tokenizer.texts_to_sequences([user_input])
user_input_pad = pad_sequences(user_input_seq, maxlen=maxlen, padding='post', truncating='post')

# Predict the title based on the user input
predicted_title_probs = Reg_model.predict(user_input_pad)

# Find the index of the word with the highest probability for each prediction
predicted_title_indices = predicted_title_probs.argmax(axis=2)

# Get the probabilities associated with each word in the vocabulary
for i, indices in enumerate(predicted_title_indices):
    print("Prediction", i+1, ":")
    for j, index in enumerate(indices):
        probability = predicted_title_probs[i, j, index]
        word = Reg_tokenizer.sequences_to_texts([[index]])[0]
        print(f"Word: {word}, Probability: {probability:.4f}")
    print()

predicted_title = Reg_tokenizer.sequences_to_texts(predicted_title_indices)[0]

print("Predicted Title:", predicted_title)

[2179  270   89  383 2458  609  904   52  586  123    2    1    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[  166   605   310 97560  1557  3647  1146    61   389   106   111    97
    46     6   328   784   820   125  2916   111  3333   134   997   182
     6  2916  1600   227  7720   703   783     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     

In [16]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Get predictions
predictions = Reg_model.predict(X_test_pad)
y_pred = (predictions > 0.5).astype(int)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

  8/322 [..............................] - ETA: 1:01:04