In [1]:
import pandas as pd


# Load the English/Urdu parallel corpus and build `df` in one chained
# expression, so the frame is never mutated in place (no `inplace=True`).
df = (
    pd.read_excel('./parallel-corpus.xlsx')
    # Keep only the first two columns
    .iloc[:, :2]
    # The source header carries a trailing space; normalise it to 'SENTENCES'
    .rename(columns={'SENTENCES ': 'SENTENCES'})
)


# Preview the first rows as the cell output
df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Both text columns are cast to str before tokenising.
# NOTE(review): astype(str) turns a missing cell into the literal string
# 'nan' -- confirm the corpus has no empty rows, otherwise they become
# training pairs.
df['SENTENCES'] = df['SENTENCES'].astype(str)
df['MEANING'] = df['MEANING'].astype(str)

# English side: fit a word-level tokenizer, then map sentences to id lists
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(df['SENTENCES'])
eng_sequences = tokenizer_eng.texts_to_sequences(df['SENTENCES'])

# Urdu side: same procedure with an independent tokenizer
tokenizer_urdu = Tokenizer()
tokenizer_urdu.fit_on_texts(df['MEANING'])
urdu_sequences = tokenizer_urdu.texts_to_sequences(df['MEANING'])

# Longest sentence (in tokens) per language, and the larger of the two
max_len_eng = max(map(len, eng_sequences))
max_len_urdu = max(map(len, urdu_sequences))
max_len = max(max_len_eng, max_len_urdu)

# Zero-pad at the end so both languages share the common length `max_len`
eng_sequences = pad_sequences(eng_sequences, maxlen=max_len, padding='post')
urdu_sequences = pad_sequences(urdu_sequences, maxlen=max_len, padding='post')

# Vocabulary sizes: number of known words plus one extra id (0), which is
# the value used for padding above
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_urdu = 1 + len(tokenizer_urdu.word_index)

2024-10-12 17:56:10.514685: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-12 17:56:10.515062: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-12 17:56:10.517525: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-12 17:56:10.524711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:476] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1728737770.534673   66811 cuda_dnn.cc:8312] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1728737770.53

In [3]:
# Sequential (unshuffled) split: the first 70% of rows are training data,
# the next 15% are test data, and whatever remains is validation data.
train_size = int(len(eng_sequences) * 0.7)
test_size = int(len(eng_sequences) * 0.15)

# English sequences (model inputs)
x_train = eng_sequences[:train_size]
x_temp = eng_sequences[train_size:]
x_test = x_temp[:test_size]
x_val = x_temp[test_size:]

# Urdu sequences (model targets), cut at the same row boundaries
y_train = urdu_sequences[:train_size]
y_temp = urdu_sequences[train_size:]
y_test = y_temp[:test_size]
y_val = y_temp[test_size:]


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Build Model: a three-layer stack assembled layer by layer
model = Sequential()
# English token ids -> 64-dimensional embedding vectors
model.add(Embedding(input_dim=vocab_size_eng, output_dim=64))
# return_sequences=True keeps one output vector per timestep
model.add(LSTM(64, return_sequences=True))
# Per-timestep softmax over the Urdu vocabulary
model.add(Dense(vocab_size_urdu, activation='softmax'))


In [10]:
# Sanity check: show the shapes of the training input and target arrays.
print(x_train.shape, y_train.shape)


(21114, 938) (21114, 938)


In [None]:
# Configure training: integer targets, so the sparse cross-entropy loss is used
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


# Train Model for 25 epochs, reporting metrics on the validation split
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=25,
)


# Last expression of the cell: evaluation result on the test split
model.evaluate(x_test, y_test)

In [None]:
# Translate function
def translate(text):
    """Translate one English sentence into Urdu with the trained model.

    Args:
        text: English sentence as a single string.

    Returns:
        The predicted Urdu words joined by single spaces. Positions where
        the model predicts id 0 (the padding value) are skipped, so the
        result may be an empty string.
    """
    # Wrap in a list: the tokenizer works on a batch of texts
    sequence = tokenizer_eng.texts_to_sequences([text])
    # Pad to `max_len`, the length the training tensors were padded to.
    # Using `max_len_eng` here would feed a different length than training
    # whenever the longest Urdu sentence exceeds the longest English one.
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model.predict(sequence)
    # Most probable Urdu word id at every timestep
    predicted_sequence = np.argmax(prediction, axis=-1)
    translated_text = ' '.join(
        tokenizer_urdu.index_word[idx]
        for idx in predicted_sequence[0]
        if idx != 0  # 0 is padding and has no entry in index_word
    )
    return translated_text

# Example translation: print the string returned by translate() for the
# single-word input 'hello'.
print(translate('hello'))