In [1]:
import tensorflow as tf

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 


In [20]:
train_df = pd.read_csv('training.csv')#here should read the dataset
train_df

Unnamed: 0,sentence,language
0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",italian
1,Il pinguino saltarocce (Eudyptes chrysocome (F...,italian
2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,italian
3,La mia città è un singolo della cantante itali...,italian
4,L'Armata Rossa dei Lavoratori e dei Contadini ...,italian
...,...,...
3628,El Premio internacional de novela «Emilio Alar...,spanish
3629,La mujer más fea del mundo es una película esp...,spanish
3630,"Bacuag, también conocido como Bacnag, es un m...",spanish
3631,Violent Femmes es una banda de rock alternativ...,spanish


In [21]:
len(train_df) 

3633

Encode the target variable ('language') from text to number


In [24]:
Y = train_df['language']

#Convert Y to number 
#note 1: You may find LabelEncoder and  tf.keras.utils.to_categorical handy!
#note 2: there are 4 languages

train_df['language'].unique()
train_df['language'].replace(to_replace=['italian', 'french', 'english', 'spanish'], value=[0,1, 2, 3], inplace=True)

In [25]:
Y

0       0
1       0
2       0
3       0
4       0
       ..
3628    3
3629    3
3630    3
3631    3
3632    3
Name: language, Length: 3633, dtype: int64

It is be expected to convert the **train_df["sentence"]**

*  get rid of pucntuations, 
*  convert to lower case
* get rid of the null values

let's call it :   **train_df['sentence_no_punctuation']**


In [26]:
train_df

Unnamed: 0,sentence,language
0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",0
1,Il pinguino saltarocce (Eudyptes chrysocome (F...,0
2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,0
3,La mia città è un singolo della cantante itali...,0
4,L'Armata Rossa dei Lavoratori e dei Contadini ...,0
...,...,...
3628,El Premio internacional de novela «Emilio Alar...,3
3629,La mujer más fea del mundo es una película esp...,3
3630,"Bacuag, también conocido como Bacnag, es un m...",3
3631,Violent Femmes es una banda de rock alternativ...,3


In [27]:
train_df['sentence_no_punctuation'] = train_df["sentence"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_no_punctuation'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("0")

train_df

Unnamed: 0,sentence,language,sentence_no_punctuation
0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",0,jean beauverie fontainessursaône 18 febbraio 1...
1,Il pinguino saltarocce (Eudyptes chrysocome (F...,0,il pinguino saltarocce eudyptes chrysocome for...
2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,0,maison ikkoku cara dolce kyoko めぞん一刻 mezon ik...
3,La mia città è un singolo della cantante itali...,0,la mia città è un singolo della cantante itali...
4,L'Armata Rossa dei Lavoratori e dei Contadini ...,0,larmata rossa dei lavoratori e dei contadini i...
...,...,...,...
3628,El Premio internacional de novela «Emilio Alar...,3,el premio internacional de novela emilio alarc...
3629,La mujer más fea del mundo es una película esp...,3,la mujer más fea del mundo es una película esp...
3630,"Bacuag, también conocido como Bacnag, es un m...",3,bacuag también conocido como bacnag es un mun...
3631,Violent Femmes es una banda de rock alternativ...,3,violent femmes es una banda de rock alternativ...


In [28]:
max_features=5000 #we set maximum number of words to 5000
maxlen=400 #we set maximum sequence length to 400

In [29]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step

In [30]:
tok.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text

In [31]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

51629


In [12]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step

In [14]:
X_train, X_test, y_train, y_test =  #Split the data into train an test

Note: you may find "train_test_split" function from sklearn 

In [16]:
embedding_dim = 50 #this is the final dimension of the embedding space.


Let's write down the model using embedding layer

Note: you may need Embedding, flatten and Dense layers

In [17]:
model = tf.keras.models.Sequential([
  ....

])

In [18]:
model.compile(...)



In [None]:
model.summary() #here we show the architecture 

In [None]:
model.fit(np.array(X_train), np.array(y_train), epochs=3) #let's fit the model

Remember the train_test_split? now we use the test to evaluate our model

In [None]:
model.evaluate(...) 


Let's evaluate the model

In [22]:
from sklearn.metrics import confusion_matrix #we import this package from sklearn and output it
predictions = model.predict(X_test) #here we make predictions
cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1))#we generate the confusion matrix

In [None]:
cm 

Let's try brand new text

In [24]:
#these are the codes for each language in order to evaluate properly
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]


In this experiment we will predict the language of the same sentence in the different languages

In [25]:
#new_text = ["tensorflow is a great tool you can find a lot of tutorials from packt"]
#new_text = ["tensorflow est un excellent outil vous pouvez trouver beaucoup de tutoriels de packt"]
#new_text = ["tensorflow è un ottimo strumento puoi trovare molti tutorial di packt"]
new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de packt"]


In [26]:
test_text = tok.texts_to_sequences(new_text) #this is how we create sequences
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen) #let's execute pad step

In [None]:
np.set_printoptions(suppress=True)
predictions = model.predict(test_text)
print(predictions.argmax())
print(predictions) #spanish you can get confused with italian which makes sense since they are more similar languages

In [None]:
!pip install wikipedia
import wikipedia

Let's build a brand new data set with only spanish and let's see if we recognize it ...

In [None]:
#language codes
#english: en
#italian: it
#french: fr
#spanish: es
new_wiki_text = []
wikipedia.set_lang('es')
for i in range(0, 5):
    print(i)
    random = wikipedia.random(1)
       
    try:
        new_wiki_text.append([wikipedia.page(random).summary])
    except wikipedia.exceptions.DisambiguationError as e:
        random = wikipedia.random(1)

In [None]:
new_wiki_text = pd.DataFrame(new_wiki_text)
new_wiki_text.columns = ['sentence']
new_wiki_text

In [33]:
new_wiki_text['sentence_lower'] = new_wiki_text["sentence"].str.lower()
new_wiki_text['sentence_no_punctuation'] = new_wiki_text['sentence_lower'].str.replace('[^\w\s]','')
new_wiki_text['sentence_no_punctuation'] = new_wiki_text["sentence_no_punctuation"].fillna("fillna")

In [34]:
np.set_printoptions(suppress=True)
test_wiki_text = tok.texts_to_sequences(list(new_wiki_text['sentence_no_punctuation'] )) #this is how we create sequences
test_wiki_text = tf.keras.preprocessing.sequence.pad_sequences(test_wiki_text, maxlen=maxlen) #let's execute pad step

In [None]:
predictions = model.predict(test_wiki_text)
print(predictions)