In [1]:
import tensorflow as tf

In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 

In [3]:
tf.__version__

'2.7.0'

In [5]:
!unzip comments.zip

Archive:  comments.zip
  inflating: comments.csv            


In [6]:
train_df = pd.read_csv("comments.csv", header=None)
train_df.columns = ['sentence', 'language']
train_df.tail()

Unnamed: 0,sentence,language
58086,وفی المفید ولا یستنجی فی حیاض علی طریق المسلم...,arabic
58087,قوله کبیع الخیار اعترض بان المعتمد ان ال...,arabic
58088,وقال محمد یحنث قاسه علی صدیق فلان وزوجه فلان ...,arabic
58089,والثانی انه لو کان فی مال زوجه تبسط لسقط عنه ...,arabic
58090,الا تری انه بعد السبی قبل العتق کان الحکم هکذا...,arabic


In [7]:
train_df.language.value_counts()

arabic     52736
persian     1722
spanish      966
italian      963
french       868
english      836
Name: language, dtype: int64

In [8]:
train_df.shape

(58091, 2)

In [9]:
train_df['sentence_lower'] = train_df["sentence"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_lower'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")

In [10]:
len(train_df) #we print the length, not a big one but sufficient

58091

In [11]:
num_classes = len(train_df.language.unique())
num_classes

6

In [12]:
Y = train_df['language']
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)
Y

array([3, 3, 3, ..., 0, 0, 0])

In [13]:
Y = tf.keras.utils.to_categorical(
    Y,
    num_classes = num_classes
)
Y

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)

In [14]:
max_features = 5000 #we set maximum number of words to 5000
maxlen = 400 #we set maximum sequence length to 400

In [15]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words = max_features)

In [16]:
tok.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text

In [17]:
len(tok.word_index)

183763

In [18]:
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

In [19]:
train_df.loc[0, 'sentence']

'Jean Beauverie (Fontaines-sur-Saône, 18 febbraio 1874 – Lione, 22 febbraio 1938) è stato un botanico e micologo francese.'

In [20]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) 

In [21]:
train_df[0]

[639, 1110, 168, 1025, 87, 173, 3572]

In [22]:
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df,
                                                         maxlen = maxlen,
                                                         padding='pre') 

In [None]:
train_df[0]

In [24]:
from sklearn.model_selection import train_test_split #divide into train and test set

In [25]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [26]:
embedding_dim = 50

Let's write down the model

+ input_dim: Integer. Size of the vocabulary, i.e. maximum integer index + 1.
+ output_dim: Integer. Dimension of the dense embedding.
+ input_length: Length of input sequences, when it is constant. This argument is required if you are going to connect Flatten then Dense layers upstream (without it, the shape of the dense outputs cannot be computed).

In [27]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim = vocab_size,
                            output_dim = embedding_dim,
                            input_length = maxlen),
  tf.keras.layers.Flatten(), 
  tf.keras.layers.Dense(num_classes, activation = 'softmax')
])

In [28]:
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 50)           9188200   
                                                                 
 flatten (Flatten)           (None, 20000)             0         
                                                                 
 dense (Dense)               (None, 6)                 120006    
                                                                 
Total params: 9,308,206
Trainable params: 9,308,206
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.fit(X_train, y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7efe27612090>

In [31]:
model.evaluate(X_test, y_test, verbose = 0) 

[0.007759384345263243, 0.9982788562774658]

In [32]:
from sklearn.metrics import confusion_matrix 

In [33]:
predictions = model.predict(X_test) 
cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1))

In [None]:
cm

In [36]:
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('persian', encoder.transform(['persian']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))
print('arabic', encoder.transform(['arabic']))

english [1]
french [2]
persian [4]
italian [3]
spanish [5]
arabic [0]


In this experiment we will predict the language of the same sentence in the different languages

In [37]:
new_text = ["tensorflow is a great tool you can find a lot of tutorials from"]
#new_text = ["tensorflow est un excellent outil vous pouvez trouver beaucoup de tutoriels de"]
#new_text = ["tensorflow è un ottimo strumento puoi trovare molti tutorial di"]
#new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de"]
# new_text = ["توی یک آموزش تقریبا ده ساعته تلاش کردم 6تا سناریو واقعی تعریف کنم و بعد پیاده سازی انجام بشه. لیست پروژه هایی که توی این دوره پیاده سازی شدند"]

In [38]:
test_text = tok.texts_to_sequences(new_text)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen = maxlen)

In [39]:
predictions = model.predict(test_text)
print(predictions.argmax())
print(predictions)

1
[[1.45731028e-05 9.99758303e-01 2.00727700e-05 1.11691275e-04
  5.70292941e-05 3.83737824e-05]]


In [40]:
def lang_detector(txt):
    tmp = tok.texts_to_sequences(txt)
    tmp = tf.keras.preprocessing.sequence.pad_sequences(tmp, maxlen = maxlen)
    prediction = model.predict(tmp)
    label = prediction.argmax()
    prediction = encoder.inverse_transform([label])
    print(txt, prediction)    

In [43]:
# tmp_txt = ['در این ویدئو با ضرورت یادگیری پردازش متون فارسی با پایتون آشنا شویم']
tmp_txt = ['الحمدالله رب العالمین و صلی الله علی سیدنا محمد و آله الطاهرین و اعنه الله علی اعدئهم اجمعین']
lang_detector(tmp_txt)

['الحمدالله رب العالمین و صلی الله علی سیدنا محمد و آله الطاهرین و اعنه الله علی اعدئهم اجمعین'] ['arabic']


In [44]:
tmp_txt = ['النّاس عَبيدُ الدّنيا، و الدّين لَعِق (7) على اَلسِنَتِهم، يحوطُونَهُ ما درَّت معايشُهُم، فاذا مُحِّصوا بالبلأ قَلَّ الدَّيَّانون .']
lang_detector(tmp_txt)

['النّاس عَبيدُ الدّنيا، و الدّين لَعِق (7) على اَلسِنَتِهم، يحوطُونَهُ ما درَّت معايشُهُم، فاذا مُحِّصوا بالبلأ قَلَّ الدَّيَّانون .'] ['arabic']


In [46]:
tmp_txt = ['«ما در روزهاى آغازين ظهور دين همراه با رسول اللّه صلى اللّه عليه و آله و سلم، دست به شمشير مى‌برديم و پدر، فرزند، برادر و عموهايمان را مى‌كشتيم، اما اين قتل و كشتار ارحام و نزديكان تنها باعث تقويت ايمان و ثبات قدم، در اعتقاداتمان مى‌شد».']
lang_detector(tmp_txt)

['«ما در روزهاى آغازين ظهور دين همراه با رسول اللّه صلى اللّه عليه و آله و سلم، دست به شمشير مى\u200cبرديم و پدر، فرزند، برادر و عموهايمان را مى\u200cكشتيم، اما اين قتل و كشتار ارحام و نزديكان تنها باعث تقويت ايمان و ثبات قدم، در اعتقاداتمان مى\u200cشد».'] ['persian']
