In [135]:
import pandas as pd
import numpy as np
import re
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Input, Dense, SimpleRNN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler

In [136]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [137]:
df = pd.read_csv('/content/drive/MyDrive/Inteli/Módulo 11 - Brastel/Ponderada Hayashi Tradução/Sentence pairs in Portuguese-English - 2024-09-09.tsv', delimiter='\t')
df = df.sample(frac = 1)
print(df.head())
print(df.shape)

         146680                Uma menina chorando abriu a porta.     267881  \
18018   1047028            Quando você começou a aprender alemão?   427847.0   
195378  7297179                        Detestei estudar esloveno.  7985459.0   
55594   2695021                            A minha opinião mudou.  2693752.0   
201779  8165254           Tom e Mary são membros da mesma igreja.  8093131.0   
53895   2623428  Exercícios regulares são benéficos para a saúde.    20214.0   

                        A crying girl opened the door.  
18018              When did you start learning German?  
195378                     I hated studying Slovenian.  
55594                          My opinion has changed.  
201779    Tom and Mary are members of the same church.  
53895   Regular exercise is beneficial to good health.  
(292877, 4)


In [138]:
# Leave only the columns with sentences
df = pd.DataFrame(df.iloc[:, [1, 3]].values, columns=['Original (PT)', 'Tradução (EN)'])

# Ensure all values in 'Tradução (EN)' are strings
df['Tradução (EN)'] = df['Tradução (EN)'].astype(str)

# Calculate the number of words in each sentence for both columns
df['PT_word_count'] = df['Original (PT)'].str.split().str.len()
df['EN_word_count'] = df['Tradução (EN)'].str.split().str.len()

# Filter out sentences with more than 10 words in either language
df = df[(df['PT_word_count'] <= 10) & (df['EN_word_count'] <= 10)]

# Drop the temporary word count columns
df = df.drop(['PT_word_count', 'EN_word_count'], axis=1)

In [139]:
tokenizer_pt = Tokenizer()
tokenizer_en = Tokenizer()

tokenizer_pt.fit_on_texts(df['Original (PT)'])
tokenizer_en.fit_on_texts(df['Tradução (EN)'])

# Get the sequences padded
sequences_pt = tokenizer_pt.texts_to_sequences(df['Original (PT)'])
sequences_en = tokenizer_en.texts_to_sequences(df['Tradução (EN)'])

padded_sequences_pt = pad_sequences(sequences_pt, padding='post')
padded_sequences_en = pad_sequences(sequences_en, padding='post')

In [140]:
print(padded_sequences_pt[0])
print(padded_sequences_en[0])

[ 70   9 285   7 238 611   0   0   0   0   0   0]
[ 82  36   5 425 483 580   0   0   0   0   0   0   0   0]


In [141]:
# Get the first sentence of padded_sequences_pt and padded_senquences_en, revert the indexing to get the original sentence
print(tokenizer_pt.sequences_to_texts([padded_sequences_pt[0]])[0])
print(tokenizer_en.sequences_to_texts([padded_sequences_en[0]])[0])

quando você começou a aprender alemão
when did you start learning german


In [142]:
scaler_pt = MinMaxScaler()
scaler_en = MinMaxScaler()
padded_sequences_pt = scaler_pt.fit_transform(padded_sequences_pt)
padded_sequences_en = scaler_en.fit_transform(padded_sequences_en)

In [143]:
print(padded_sequences_pt.shape)
print(padded_sequences_en.shape)

(254711, 12)
(254711, 14)


In [144]:
train_data = padded_sequences_pt[:int(0.8 * len(padded_sequences_pt))]
test_data = padded_sequences_pt[int(0.8 * len(padded_sequences_pt)):]
train_labels = padded_sequences_en[:int(0.8 * len(padded_sequences_en))]
test_labels = padded_sequences_en[int(0.8 * len(padded_sequences_en)):]

In [145]:
RNNmodel = Sequential([
    Input(shape=(padded_sequences_pt.shape[1],1), batch_size=2),
    SimpleRNN(units=(16)),
    Dense(padded_sequences_en.shape[1])
])

RNNmodel.compile(loss='mean_squared_error', optimizer='adam')

loss = RNNmodel.fit(train_data, train_labels, epochs=1)
loss = loss.history['loss']

predicted_sentences = RNNmodel.predict(test_data)

[1m6368/6368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 4ms/step - loss: 0.0028
[1m1592/1592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


In [146]:
# Transform the predicted sentences back to actual sentences
predicted_sentences = scaler_en.inverse_transform(predicted_sentences)
predicted_sentences = predicted_sentences.astype(int)
predicted_sentences = tokenizer_en.sequences_to_texts(predicted_sentences)

test_labels = scaler_en.inverse_transform(test_labels)
test_labels = test_labels.astype(int)
test_labels = tokenizer_en.sequences_to_texts(test_labels)

In [147]:
print(test_labels[0])
print(predicted_sentences[0])

i promise you i won't stay out too late
once use went wouldn't big because talk come an like i
