In [1]:
import pandas as pd
import numpy as np



In [4]:
# load dataset
df = pd.read_parquet('../data/train-00000-of-00001-3d14582ea46e1b17.parquet')



In [5]:
# print first 5 rows of dataset
df.head()


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [6]:
# transform dataset to ascii
from unidecode import unidecode

df_ascii = df.applymap(unidecode)


df_ascii.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ca alors !


In [7]:
# save the ascii dataset
df_ascii.to_parquet('../data/train-00000-of-00001-3d14582ea46e1b17-ascii.parquet')


In [8]:
# max length of a cell:
print(df_ascii.applymap(len).max().max())


325


In [11]:
# melted dataset
df_melted = df_ascii.melt(var_name='language', value_name='text')

df_melted.head()


Unnamed: 0,language,text
0,English words/sentences,Hi.
1,English words/sentences,Run!
2,English words/sentences,Run!
3,English words/sentences,Who?
4,English words/sentences,Wow!


In [12]:
# save the melted dataset
df_melted.to_parquet('../data/train-00000-of-00001-3d14582ea46e1b17-ascii-melted.parquet')


In [15]:
# length of the melted dataset
print(len(df_melted))


350932


In [22]:
# tokenize the text, character to integers
chars = sorted(list(set(''.join(df_melted['text']))))
# add unknown character
chars.append('UNK')


print(''.join(chars))


 !"$%&'()+,-./0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzUNK


In [23]:
def encode_text(text: str) -> list:
    return [chars.index(c) if c in chars else chars.index('UNK') for c in text]

def decode_text(encoded_text: list) -> str:
    return ''.join([chars[c] for c in encoded_text])

print(encode_text('Salut tout le monde. '))
print(decode_text(encode_text('hello world')))


[47, 55, 66, 75, 74, 0, 74, 69, 75, 74, 0, 66, 59, 0, 67, 69, 68, 58, 59, 12, 0]
hello world


In [35]:
block_size = 8 # 8 characters per block


# define the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


model = Sequential([
    Dense(128, activation='relu', input_shape=(block_size,)),
    Dense(128, activation='relu'),
    Dense(2, activation='softmax')
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 128)               1152      
                                                                 
 dense_11 (Dense)            (None, 128)               16512     
                                                                 
 dense_12 (Dense)            (None, 2)                 258       
                                                                 
Total params: 17,922
Trainable params: 17,922
Non-trainable params: 0
_________________________________________________________________


In [36]:
cost = 'categorical_crossentropy'
optimizer = 'adam'
metrics = ['accuracy']

model.compile(loss=cost, optimizer=optimizer, metrics=metrics)


In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert to list first
x = df_melted['text'].apply(encode_text).apply(lambda x: x[:block_size]).to_list()

# Pad sequences
x = pad_sequences(x, maxlen=block_size, padding='post', truncating='post')

y = pd.get_dummies(df_melted['language']).values  # convert DataFrame into a NumPy array

model.fit(
    x=x,
    y=y,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16c21fc70>

In [46]:

# test the model on my own text
my_text = 'youpi'
my_text_encoded = encode_text(my_text)
my_text_encoded = pad_sequences([my_text_encoded], maxlen=block_size, padding='post', truncating='post')

print(my_text_encoded)

if np.argmax(model.predict(my_text_encoded)) == 0:
    print('French')
else:
    print('English')

[[79 69 75 70 63  0  0  0]]
English
