In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 
import seaborn as sns
import matplotlib.pyplot as plt

In [30]:
df = pd.read_csv('https://media.githubusercontent.com/media/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/master/section_1_notebooks/train_languages.csv')

In [22]:
df.head()

Unnamed: 0,sentence,language
0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",2
1,Il pinguino saltarocce (Eudyptes chrysocome (F...,2
2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,2
3,La mia città è un singolo della cantante itali...,2
4,L'Armata Rossa dei Lavoratori e dei Contadini ...,2


In [25]:
df.isnull().sum()

sentence    11
language     0
dtype: int64

In [26]:
df.shape

(3633, 2)

In [31]:
df = df.dropna()
df.isnull().sum()

sentence    0
language    0
dtype: int64

In [32]:
le = LabelEncoder()
le.fit(df.language.unique())
df['language'] = le.transform(df['language']) 

In [33]:
le.classes_

array(['english', 'french', 'italian', 'spanish'], dtype=object)

In [34]:
y = tf.keras.utils.to_categorical(df['language'],num_classes=4)

In [35]:
import string
def clean(text):
  text = text.lower()
  return "".join([c for c in text if c not in string.punctuation])
df['clean_sentence'] = df["sentence"].apply(lambda x:clean(x))

In [42]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(df['clean_sentence']))
print(len(tokenizer.word_index))
vocab_size = len(tokenizer.word_index) + 1

51979


In [38]:
train_df = tokenizer.texts_to_sequences(list(df['clean_sentence'])) #creating sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=400)

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)
embedding_dim = 50

In [43]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=50,#embedding output
                           input_length=400), #maximum length of an input sequence
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(4, activation=tf.nn.softmax)])

In [44]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 50)           2599000   
                                                                 
 flatten (Flatten)           (None, 20000)             0         
                                                                 
 dense (Dense)               (None, 4)                 80004     
                                                                 
Total params: 2,679,004
Trainable params: 2,679,004
Non-trainable params: 0
_________________________________________________________________


In [45]:
model.fit(np.array(X_train), np.array(y_train), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f57e0055410>

In [46]:
model.evaluate(np.array(X_test), np.array(y_test)) 



[0.007024282123893499, 1.0]

In [48]:
from sklearn.metrics import confusion_matrix
predictions = model.predict(X_test) 
cm = confusion_matrix(y_test.argmax(axis=1),predictions.argmax(axis=1))
cm

array([[ 73,   0,   0,   0],
       [  0, 105,   0,   0],
       [  0,   0, 110,   0],
       [  0,   0,   0,  75]])