In [67]:
import pandas as pd
import numpy as np

In [68]:
data = pd.read_csv("eng_-french.csv")

In [69]:
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [70]:
data['English words/sentences count'] = data['English words/sentences'].str.split().str.len()

In [71]:
data['French words/sentences count'] = data['French words/sentences'].str.split().str.len()

In [72]:
data.head()

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3


In [73]:
data = data[(data['English words/sentences count']<6)&(data['French words/sentences count']<6)]

In [74]:
data.head()

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3


In [75]:
data = data.drop_duplicates()
data.head()

Unnamed: 0,English words/sentences,French words/sentences,English words/sentences count,French words/sentences count
0,Hi.,Salut!,1,1
1,Run!,Cours !,1,2
2,Run!,Courez !,1,2
3,Who?,Qui ?,1,2
4,Wow!,Ça alors !,1,3


# **Dividing Data Into Train Test Vaildate**

In [76]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2)

In [77]:
train["English words/sentences count"].max()

5

In [78]:
train["French words/sentences count"].max()

5

In [79]:
train, valid = train_test_split(data, test_size=0.2)

# **Word Frequency**

In [80]:
freq_x = train['English words/sentences'].str.split(expand=True).stack().value_counts().reset_index()
freq_y = train['French words/sentences'].str.split(expand=True).stack().value_counts().reset_index()

freq_x.to_csv('English_freq.csv', index=False)
freq_y.to_csv('French_freq.csv', index=False)

In [81]:
freq_x = pd.read_csv('English_freq.csv')
freq_y = pd.read_csv('French_freq.csv')

In [82]:
freq_x.head()

Unnamed: 0,index,count
0,I,7497
1,a,4206
2,you,4145
3,is,3664
4,the,2848


In [83]:
freq_y.head()

Unnamed: 0,index,count
0,?,6395
1,Je,5737
2,est,2912
3,Il,2807
4,pas,2709


# **Data Preaparation / PreProcessing**

In [84]:
def get_data(raw_lines) :
  text=[]
  for raw_line in raw_lines:
    text.append('<start>' + raw_line + '<end>')
  return text

In [85]:
english_train = get_data(list(train['English words/sentences']))

In [86]:
english_train

['<start>It was raining.<end>',
 '<start>Tom certainly enjoys his wine.<end>',
 '<start>He interpreted for me.<end>',
 '<start>I messed up.<end>',
 "<start>You're skinny.<end>",
 '<start>He will be here soon.<end>',
 '<start>How is your wife?<end>',
 "<start>I'm from America.<end>",
 "<start>Where've you been?<end>",
 '<start>Tom was nowhere in sight.<end>',
 '<start>Tom is dead.<end>',
 '<start>I like their house.<end>',
 "<start>I'm methodical.<end>",
 '<start>Who are you all?<end>',
 "<start>It's true!<end>",
 "<start>Here's the address.<end>",
 '<start>Stop worrying.<end>',
 '<start>Everyone needs to stay focused.<end>',
 "<start>The baby's name was Tom.<end>",
 '<start>Tom is being very cooperative.<end>',
 '<start>Control your fear.<end>',
 '<start>I need some water.<end>',
 '<start>Is anybody hurt?<end>',
 '<start>He held his breath.<end>',
 '<start>Do you have Time magazine?<end>',
 '<start>He predicted she would win.<end>',
 "<start>This isn't an easy task.<end>",
 '<start>I k

In [87]:
french_train = get_data(list(train['French words/sentences']))

In [88]:
french_train

['<start>Il pleuvait.<end>',
 '<start>Tom apprécie certainement son vin.<end>',
 '<start>Il interpréta pour moi.<end>',
 "<start>J'ai dégueulassé.<end>",
 '<start>Vous êtes maigrichonnes.<end>',
 '<start>Il sera bientôt là.<end>',
 '<start>Comment va ta femme\u202f?<end>',
 "<start>Je viens d'Amérique.<end>",
 '<start>Où as-tu été ?<end>',
 '<start>Tom était introuvable.<end>',
 '<start>Tom est mort.<end>',
 "<start>J'aime leur maison.<end>",
 '<start>Je suis méthodique.<end>',
 '<start>Qui êtes-vous tous\u202f?<end>',
 "<start>C'est vrai\u202f!<end>",
 "<start>Voici l'adresse.<end>",
 "<start>Arrête de t'inquiéter.<end>",
 '<start>Chacun doit rester concentré.<end>',
 "<start>Le bébé s'appelait Tom.<end>",
 '<start>Tom se montre très coopératif.<end>',
 '<start>Contrôle ta peur.<end>',
 "<start>J'ai besoin d'eau.<end>",
 '<start>Quiconque est-il blessé ?<end>',
 '<start>Il a retenu son souffle.<end>',
 '<start>As-tu le magazine Time\xa0?<end>',
 "<start>Il a prédit qu'elle gagnerait.<

In [89]:
french_valid = get_data(list(valid['French words/sentences']))
english_valid = get_data(list(valid['English words/sentences']))

In [90]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizing and padding input language (French)
fre_token = Tokenizer(filters='', lower=False)
fre_token.fit_on_texts(french_train)
fre_tokenized = fre_token.texts_to_sequences(french_train)
fre_padded = pad_sequences(fre_tokenized, padding='post')

# Tokenizing and padding target language (English)
eng_token = Tokenizer(filters='', lower=False)
eng_token.fit_on_texts(english_train)
eng_tokenized = eng_token.texts_to_sequences(english_train)
eng_padded = pad_sequences(eng_tokenized, padding='post')


In [91]:
# Assuming fre_token and end_token are Tokenizer objects that have been fit on the respective datasets
# fre_token.word_index is a dictionary mapping words to their integer index in French
# end_token.word_index is a dictionary mapping words to their integer index in English

# Number of unique tokens in the output (French) language
num_op_tokens = len(fre_token.word_index)

# Number of unique tokens in the input (English) language
num_ip_tokens = len(eng_token.word_index)

# Assuming fre_padded and eng_padded are padded sequences of your tokenized text data
# fre_padded.shape[1] gives the maximum length of the padded sequences in French
# eng_padded.shape[1] gives the maximum length of the padded sequences in English

# Maximum length of sentence in the output (French) language
max_len_op = fre_padded.shape[1]

# Maximum length of sentence in the input (English) language
max_len_ip = eng_padded.shape[1]


In [92]:
num_op_tokens

19469

In [93]:
num_ip_tokens

11436

In [94]:
max_len_op

5

In [95]:
max_len_ip

5

In [96]:
french_train[0]

'<start>Il pleuvait.<end>'

In [97]:
fre_padded[0]

array([   4, 3688,    0,    0,    0], dtype=int32)

# **K-Text**

In [98]:
pip install ktext



In [99]:
from ktext.preprocess import processor

In [100]:
# Initialize the processor with specific parameters
# keep_n: the maximum number of unique words to keep
# padding_maxlen: the maximum length of sequences to pad or truncate
english_pp = processor(keep_n=10407, padding_maxlen=7)

# Fit the processor on the training data and transform the data into vectors
# This step learns the vocabulary from the training data and converts the text into numerical vectors
english_train_vecs = english_pp.fit_transform(english_train)

# The result is a list of sequences where each sequence represents a processed and vectorized version of the input text
print(english_train_vecs)

  self.pid = os.fork()


[[   0    0    2 ...   20  646    3]
 [   2   11 1772 ...   52  415    3]
 [   0    2   15 ...   43   23    3]
 ...
 [   0    2    9 ...   31  350    3]
 [   2   30   10 ...   37  456    3]
 [   0    0    2 ...   19  202    3]]


In [101]:
# Initialize the processor with specific parameters for French text data
# append_indicators: whether to add start and end indicators to the sequences
# keep_n: the maximum number of unique words to keep
# padding_maxlen: the maximum length of sequences to pad or truncate
# padding: whether padding is added 'pre' (at the beginning) or 'post' (at the end) of sequences
french_pp = processor(append_indicators=True, keep_n=17228, padding_maxlen=7, padding='post')

# Fit the processor on the French training data and transform the data into vectors
# This step learns the vocabulary from the training data and converts the text into numerical vectors
french_train_vecs = french_pp.fit_transform(french_train)



In [102]:
import dill as dpickle
import numpy as np

#save the perprocessor
with open('english_pp.dpkl', 'wb') as f:
  dpickle.dump(english_pp, f)

with open('french_pp.dpkl', 'wb') as f:
  dpickle.dump(french_pp, f)

#save the processed data

np.save('french_train_vecs.npy', french_train_vecs)
np.save('english_train_vecs.npy', english_train_vecs)

In [103]:
def load_decoder_inputs(decoder_np_vecs='french_train_vecs.npy'):
    # Load the vectorized title data from a .npy file
    vectorized_title = np.load(decoder_np_vecs)

    # Decoder input data: all rows and all columns except the last one
    decoder_input_data = vectorized_title[:, :-1]

    # Decoder target data: all rows and all columns except the first one
    decoder_target_data = vectorized_title[:, 1:]

    # Print the shapes of the input and target data
    print(f'Shape of decoder input: {decoder_input_data.shape}')
    print(f'Shape of decoder target: {decoder_target_data.shape}')
    return decoder_input_data, decoder_target_data


def load_encoder_inputs(encoder_np_vecs='english_train_vecs.npy'):
    # Load the vectorized body data from a .npy file
    vectorized_body = np.load(encoder_np_vecs)

    # Encoder input data is the entire loaded array
    encoder_input_data = vectorized_body

    # Document length is the number of columns in the array
    doc_length = encoder_input_data.shape[1]

    # Print the shape of the encoder input data
    print(f'Shape of encoder input: {encoder_input_data.shape}')

    return encoder_input_data, doc_length

In [104]:
def load_text_processor(fname='english_pp.dpkl'):
    # Load files from disk
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    print(f'Size of vocabulary for {fname}: {num_tokens:,}')
    return num_tokens, pp

In [105]:
import dill as dpickle

In [106]:
num_encoder_tokens, english_pp = load_text_processor('english_pp.dpkl')
num_decoder_tokens, french_pp = load_text_processor('french_pp.dpkl')

Size of vocabulary for english_pp.dpkl: 6,773
Size of vocabulary for french_pp.dpkl: 10,971


In [107]:
encoder_input_data, doc_length = load_encoder_inputs('english_train_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('french_train_vecs.npy')

Shape of encoder input: (43369, 7)
Shape of decoder input: (43369, 6)
Shape of decoder target: (43369, 6)


In [108]:
print('Original String: ', french_train[0])
print('After Pre-Processing: ', decoder_input_data[0])

Original String:  <start>Il pleuvait.<end>
After Pre-Processing:  [   3    4   11 2247    2    5]


In [109]:
print('Original String: ', english_train[0])
print('After Pre-Processing: ', encoder_input_data[0])

Original String:  <start>It was raining.<end>
After Pre-Processing:  [  0   0   2   9  20 646   3]


# **Define Model Architecture**

In [110]:
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, BatchNormalization
from keras import optimizers

In [111]:
# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300
doc_length = 100  # Assuming doc_length is predefined
num_encoder_tokens = 10000  # Assuming num_encoder_tokens is predefined
num_decoder_tokens = 10000  # Assuming num_decoder_tokens is predefined

#######################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU Layer (optional)
# x = GRU(Latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True) (x)
# x = BatchNormalization (name='Encoder-Batchnorm-2')(x)

# We do not need the `encoder_output` just the hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just
# encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

#######################
#### Decoder Model ####
decoder_inputs = Input(shape=(None,), name='Decoder-Input')

# Word Embedding For Decoder (ex: Issue Titles)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using decoder_state_input as initial state.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense Layer for prediction
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

#######################
#### Seq2Seq Model ####

seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

seq2seq_Model.compile(optimizer=optimizers.Nadam(learning_rate=0.001), loss='sparse_categorical_crossentropy')


In [112]:
seq2seq_Model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Decoder-Input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 Decoder-Word-Embedding (Em  (None, None, 300)            3000000   ['Decoder-Input[0][0]']       
 bedding)                                                                                         
                                                                                                  
 Encoder-Input (InputLayer)  [(None, 100)]                0         []                            
                                                                                                  
 Decoder-Batchnorm-1 (Batch  (None, None, 300)            1200      ['Decoder-Word-Embedding

In [113]:
from keras.callbacks import CSVLogger, ModelCheckpoint
# import numpy as np

# Script name base
script_name_base = 'tutorial_seq2seq'

# Callbacks
csv_logger = CSVLogger(f'{script_name_base}.log')
model_checkpoint = ModelCheckpoint(f'{script_name_base}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5', save_best_only=True)

# Parameters
batch_size = 1200
epochs = 70

# # Ensure that the decoder target data has the correct shape
decoder_target_data_exp = np.expand_dims(decoder_target_data, axis=-1)

# Model fitting
history = seq2seq_Model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data_exp,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.12,
    callbacks=[csv_logger, model_checkpoint]
)


Epoch 1/70


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_5" is incompatible with the layer: expected shape=(None, 100), found shape=(None, 7)


In [114]:
del seq2seq_Model

In [115]:
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

seq2seq_Model.compile(optimizer = optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')



In [131]:
# Import necessary libraries
from keras.models import load_model

# Provide the correct path to the weights file
weights_file_path = 'tutorial_seq2seq.epoch23-val1.78077.hdf5'

# Check if the file exists
import os
if os.path.exists(weights_file_path):
    # Load the weights
    seq2seq_Model.load_weights(weights_file_path)
    print("Weights loaded successfully.")
else:
    print(f"Error: The file '{weights_file_path}' does not exist.")


Error: The file 'tutorial_seq2seq.epoch23-val1.78077.hdf5' does not exist.


In [132]:
def extract_encoder_model(model):
    """
    Extracts the encoder model from the given seq2seq model.

    Args:
        model (keras.Model): The seq2seq model containing the encoder.

    Returns:
        keras.Model: The encoder model.
    """
    encoder_model = model.get_layer('Encoder-Model')
    return encoder_model

def extract_decoder_model(model):
    """
    Extracts the decoder model from the given seq2seq model.

    Args:
        model (keras.Model): The seq2seq model containing the decoder.

    Returns:
        keras.Model: The decoder model.
    """
    # The latent dimension is the same throughout the architecture so we are going to
    # cheat and grab the latent dimension of the embedding because that is the same as what is
    # output from the decoder
    latent_dim = model.get_layer('Decoder-Word-Embedding').output_shape[-1]

    # Reconstruct the input into the decoder
    decoder_inputs = model.get_layer('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

    # Instead of setting the initial state from the encoder and forgetting about it during
    # inference, we are not doing teacher forcing, so we will have to have a feedback loop from predictions
    # to the GRU, thus we define this input layer for the state so we can add this capability
    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

    # We need to reuse the weights, which is why we are getting this.
    # If you inspect the decoder GRU that we created for training, it will take as input
    # 2 tensors:
    #   (1) the embedding Layer output for the teacher forcing
    #   (which will now be the last step's prediction, and will be _start_ on the first step)
    #   (2) the state, which we will initialize with the encoder on the first time step,
    #   and then grab the state after the first prediction and feed that back in again.
    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

    # Reconstruct dense Layers
    dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
    decoder_model = Model([decoder_inputs, gru_inference_state_input], [dense_out, gru_state_out])

    return decoder_model

In [133]:
class Seq2Seq_Inference(object):
    def __init__(self, encoder_preprocessor, decoder_preprocessor, seq2seq_model):
        self.bp_english = encoder_preprocessor
        self.pp_french = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_french = self.pp_french.padding_maxlen
        self.nn = None
        self.rec_df = None

    def generate_french(self, raw_input_text, max_len_french=None):
        """
        Use the seq2seq model to generate a title given the body of an issue.
        Inputs:
            raw_input_text: str
                The body of the issue text as an input string.
            max_len_french: int (optional)
                The maximum length of the title the model will generate.
        Returns:
            tuple: A tuple containing the original body encoding and the generated French text.
        """
        if max_len_french is None:
            max_len_french = self.default_max_len_french

        # Get the encoder's features for the decoder
        raw_tokenized = self.bp_english.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)

        # We want to save the encoder's embedding before it's updated by the decoder
        # because we can use that as an embedding for other tasks.
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_french.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        stop_condition = False
        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # We are going to ignore indices (padding) and indices 1 (unknown)
            # Argmax will return the integer index corresponding to the prediction + 2 b/c we chopped off first two
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # Retrieve word from index prediction
            pred_word_str = self.pp_french.id2token[pred_idx]
            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_french:
                stop_condition = True
                break

            decoded_sentence.append(pred_word_str)

            # Update the decoder for the next word
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)

    def print_example(self, i, english_text, french_text, threshold):
        if i:
            print("\n\n===========================================")
            print(f'=============== Example # {i} =============\n')

        print(f"English Text:\n {english_text} \n")

        if french_text:
            print(f"Original French text: \n {french_text}")

        emb, gen_french = self.generate_french(english_text)
        print("\n****** Machine Generated Title (Prediction) ******: \n", gen_french)

        if self.nn:
            # Return neighbors and distances
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4, include_distances=True)

            if min(d) <= threshold:
                cols = ['French', 'English']
                dfcopy = self.rec_df.iloc[n][cols].copy(deep=True)
                dfcopy['dist'] = d
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')

                display(similar_issues_df)

    def demo_model_predictions(self, n, df, threshold=1):
        # Extract body and title from DF
        english_text_list = df['English words/sentences'].tolist()
        french_text_list = df['French words/sentences'].tolist()

        demo_list = np.random.randint(low=0, high=len(english_text_list), size=n)

        for i in demo_list:
            self.print_example(i, english_text=english_text_list[i], french_text=french_text_list[i], threshold=threshold)


In [134]:
# Instantiate Seq2Seq_Inference class
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=english_pp, decoder_preprocessor=french_pp, seq2seq_model=seq2seq_Model)

In [141]:
# Call the demo_model_predictions method to display predictions on random rows of the holdout set
seq2seq_inf.demo_model_predictions(n=5, df=test)




English Text:
 I'm soaked to the bone. 

Original French text: 
 Je suis trempé jusqu'à l'os.

****** Machine Generated Title (Prediction) ******: 
 dodo processus impatients tatouage liberte devenu legal



English Text:
 I went over the report. 

Original French text: 
 J'ai parcouru le rapport.

****** Machine Generated Title (Prediction) ******: 
 voudrait sacrifices assister taquiner eloigna echouera indescriptible



English Text:
 He's too drunk. 

Original French text: 
 Il est trop saoul.

****** Machine Generated Title (Prediction) ******: 
 mobilier ballon talentueuse dyslexique frappez passames souhait



English Text:
 You are drunk! 

Original French text: 
 Tu es saoul !

****** Machine Generated Title (Prediction) ******: 
 decue repliez autorisera acceptons scandaleusement scandaleusement montons



English Text:
 He is a dramatist. 

Original French text: 
 Il est dramaturge.

****** Machine Generated Title (Prediction) ******: 
 voudrait sacrifices assister taquin

In [136]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

# **Machine Learning Model**

**MarianMT Model:**
- The MarianMTModel used in this script is a machine translation model based on the Transformer architecture, developed by the Marian team at the University of Edinburgh.

- These models are pre-trained on large parallel corpora of text in multiple languages and can translate between various language pairs.

- The specific models used here are Helsinki-NLP/opus-mt-fr-en for French to English translation and Helsinki-NLP/opus-mt-en-ur for English to Urdu translation. These models are available through the Hugging Face Model Hub and have been trained on large datasets to perform high-quality translations.

**How It Works**
1. **Input Handling:** The script takes user input to determine the translation direction and the text to be translated.

2. **Preprocessing:** The text is tokenized into a format suitable for the model using MarianTokenizer.

3. **Translation:** The model generates translated text in the target language.
Postprocessing: The tokenized output is decoded back into a human-readable string.

4. **Output:** The translated text is displayed to the user.

In [137]:
from transformers import MarianMTModel, MarianTokenizer

# French to English
def translate_fr_en(text):
    model_name = 'Helsinki-NLP/opus-mt-fr-en'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated][0]

# English to Urdu
def translate_en_ur(text):
    model_name = 'Helsinki-NLP/opus-mt-en-ur'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated][0]

In [138]:
def user_interface():
    print("Press 1 for French to English translation")
    print("Press 2 for English to Urdu translation")
    choice = input("Enter your choice: ")

    if choice == '1':
        text = input("Enter French text: ")
        translation = translate_fr_en(text)
        print("Translation:", translation)
    elif choice == '2':
        text = input("Enter English text: ")
        translation = translate_en_ur(text)
        print("Translation:", translation)
    else:
        print("Invalid choice. Please enter 1 or 2.")


In [144]:
if __name__ == "__main__":
    user_interface()

Press 1 for French to English translation
Press 2 for English to Urdu translation
Enter your choice: 2
Enter English text: Hi we are mustafa and ammad
Translation: خوش آمدید ہم کر رہے ہیں وراور کر رہے ہیں
