In [4]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# Step 1: Load the dataset
file_path = r"D:\H\works\RP\Kaushi\datsets\dataset.csv"
df = pd.read_csv(file_path, encoding='utf-8')  # Ensuring UTF-8 encoding
print("Loaded Data:")
print(df.head())

# Step 2: Clean the data (remove NaN or non-string values)
df = df.dropna(subset=['Correct Word', 'Jumbled word'])  # Remove rows with NaN values
df['Correct Word'] = df['Correct Word'].astype(str)  # Ensure all values are strings
df['Jumbled word'] = df['Jumbled word'].astype(str)  # Ensure all values are strings

# Step 3: Tokenize and pad the sentences
correct_sentences = df['Correct Word'].values
jumbled_sentences = df['Jumbled word'].values

# Tokenizer to convert words into numbers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(correct_sentences + jumbled_sentences)  # Fit on both columns

# Convert sentences into sequences (list of token IDs)
correct_sequences = tokenizer.texts_to_sequences(correct_sentences)
jumbled_sequences = tokenizer.texts_to_sequences(jumbled_sentences)

# Get the vocabulary size (the total number of unique words)
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
print("Vocabulary Size:", vocab_size)

# Find the maximum sentence length (for padding)
max_length = max([len(seq) for seq in correct_sequences])

# Pad sequences to ensure uniform length
correct_sequences = pad_sequences(correct_sequences, maxlen=max_length, padding='post')
jumbled_sequences = pad_sequences(jumbled_sequences, maxlen=max_length, padding='post')

# Step 4: Split into training and testing sets (80% train, 20% test)
X = jumbled_sequences  # Input (Jumbled sentences)
y = correct_sequences  # Output (Correct sentences)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Reshape the labels to match the model output shape
y_train = y_train[:, 1:]  # Remove the first token in each sentence (shift labels by 1)
y_test = y_test[:, 1:]  # Same for test labels (shift labels by 1)

# Step 6: Ensure labels have the same length as model output
y_train = pad_sequences(y_train, maxlen=max_length, padding='post')
y_test = pad_sequences(y_test, maxlen=max_length, padding='post')

# Check the shape of the data to ensure it's ready for training
print("y_train shape after reshaping:", y_train.shape)
print("y_test shape after reshaping:", y_test.shape)

# Step 7: Build the Seq2Seq Model
model = Sequential()

# Embedding Layer: Convert words into embeddings (size of 128)
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length))

# Encoder: Bidirectional LSTM (captures context from both directions)
model.add(Bidirectional(LSTM(256, return_sequences=True)))

# Decoder: LSTM for sentence generation (outputs the sequence)
model.add(LSTM(256, return_sequences=True))

# Output Layer: Dense layer with softmax activation to predict the next word
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model with Adam optimizer and sparse categorical crossentropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary to show the architecture
model.summary()

# Step 8: Train the model with reshaped labels
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Step 9: Save the model after training
model.save('sentence_reordering_model.h5')

# Step 10: Evaluate the model on test data
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc)


Loaded Data:
               Correct Word               Jumbled word  Word Count
0    நாங்கள் விளையாடினோம்.       விளையாடினோம் நாங்கள்.         2.0
1      நான் சாப்பிடவில்லை.         சாப்பிடவில்லை நான்.         2.0
2          இது ஒரு புத்தகம்          இது புத்தகம் ஒரு.         2.0
3       எங்கள் வீடு பெரியது       வீடு எங்கள் பெரியது.         2.0
4  நான் பழம் சாப்பிடுகிறேன்  பழம் நான் சாப்பிடுகிறேன்.         2.0
Vocabulary Size: 414
y_train shape after reshaping: (201, 5)
y_test shape after reshaping: (51, 5)
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 5, 128)            52992     
                                                                 
 bidirectional_3 (Bidirecti  (None, 5, 512)            788480    
 onal)                                                           
                                                                 
 l

  saving_api.save_model(


In [5]:
# Test the model with a new sentence
sample_sentence = "புத்தகத்தை எடு"
sample_seq = tokenizer.texts_to_sequences([sample_sentence])
sample_pad = pad_sequences(sample_seq, maxlen=max_length, padding='post')

# Predict the output
pred = model.predict(sample_pad)
predicted_word = tokenizer.index_word[pred.argmax()]
print("Predicted word:", predicted_word)




KeyError: 1656