Load packages

In [50]:
import tensorflow as tf

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder


In [51]:
# Print TensorFlow version
tf.__version__

'2.18.0'

In [52]:
train_df = pd.read_csv("train_languages.csv")
train_df.head(10)

Unnamed: 0,sentence,language
0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",italian
1,Il pinguino saltarocce (Eudyptes chrysocome (F...,italian
2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,italian
3,La mia città è un singolo della cantante itali...,italian
4,L'Armata Rossa dei Lavoratori e dei Contadini ...,italian
5,Selezione dal Reader's Digest è stata una riv...,italian
6,La cultura Deverel-Rimbury è il nome dato a un...,italian
7,"Matías Ezequiel Dituro (Bigand (Santa Fe), 8 m...",italian
8,Tomorrow Never Knows è un brano musicale del g...,italian
9,"Berit Elisabeth Andersson (Stoccolma, 11 novem...",italian


In [53]:
len(train_df)

3633

Encode target variable from text to number


In [54]:
Y = train_df['language']
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)
Y = tf.keras.utils.to_categorical(
    Y,
    num_classes=4 # Number of languages

)

Text processing

In [55]:
train_df['sentence_lower'] = train_df["sentence"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_lower'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")

In [56]:
max_features=5000 # Set maximum number of words to 5000
maxlen=400 # Set maximum sequence length to 400

In [57]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) # Tokenization

In [58]:
tok.fit_on_texts(list(train_df['sentence_no_punctuation']))  # Fit to cleaned text

In [59]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1
# Represents the number of words that are tokenized different from max_features but necessary for the definition of the dimension of the embedding space

49274


In [60]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) # How sequences are created
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) # Execute pad step

In [61]:

from sklearn.model_selection import train_test_split # Divide into train and test set

In [62]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [63]:
embedding_dim = 50 # Final dimension of the embedding space.


Let's write down the model

In [64]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, # Embedding input
                           output_dim=embedding_dim), # Embedding output
  tf.keras.layers.Flatten(), # Flatten layer

  tf.keras.layers.Dense(4, activation=tf.nn.softmax) # Ouput layer a Dense layer with 4 probabilities
  # Final activation function also defined which is the softmax function typical for multiclass classifiction problems
])

In [65]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy', # Recommendeded loss function
              metrics=['accuracy'])

In [66]:
model.summary() # Show architecture

In [67]:
model.fit(np.array(X_train), np.array(y_train), epochs=3) # Fit model

Epoch 1/3
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.5509 - loss: 1.0933
Epoch 2/3
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.9958 - loss: 0.0646
Epoch 3/3
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.9982 - loss: 0.0197


<keras.src.callbacks.history.History at 0x7f4880617af0>

Use test to evaluate model

In [68]:
model.evaluate(np.array(X_test), np.array(y_test))

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0099  


[0.010375813581049442, 1.0]

In [69]:
from sklearn.metrics import confusion_matrix
predictions = model.predict(X_test)
cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1)) # Generate the confusion matrix

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [70]:
cm

array([[ 91,   0,   0,   0],
       [  0,  88,   0,   0],
       [  0,   0, 102,   0],
       [  0,   0,   0,  83]])

Try brand new text

In [72]:
# Codes for each language in order to evaluate properly
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]


Language prediction

In [73]:
new_text = ["tensorflow is a great tool you can find a lot of tutorials from packt"]
#new_text = ["tensorflow est un excellent outil vous pouvez trouver beaucoup de tutoriels de packt"]
#new_text = ["tensorflow è un ottimo strumento puoi trovare molti tutorial di packt"]
#new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de packt"]


In [74]:
test_text = tok.texts_to_sequences(new_text) # Create sequences
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen) # Execute pad step

In [75]:
# Create a mapping of index to language
index_to_language = {0: 'english', 1: 'french', 2: 'italian', 3: 'spanish'}

# Generate the predictions
np.set_printoptions(suppress=True)
predictions = model.predict(test_text)

# Get the index of the highest predicted probability
predicted_index = predictions.argmax()

# Map the index to the language
predicted_language = index_to_language[predicted_index]

# Display the predicted language and the prediction probabilities
print(f"Predicted Language: {predicted_language}")
print("Prediction Probabilities:", predictions)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Predicted Language: english
Prediction Probabilities: [[0.9255124  0.00878484 0.02849957 0.03720314]]
