In [1]:

# Colab-only line magic: selects TensorFlow 1.x for this runtime.
# Keep it in the first cell so it runs before Keras/TensorFlow are imported.
%tensorflow_version 1.x


TensorFlow 1.x selected.


In [2]:
# It makes sense to give the data-reading code a little structure
import random

def read_data_one_lang(lang,part):
    """Reads one file for one language.

    lang: language code used in the file name, e.g. "en"
    part: data split used in the file name, e.g. "train"

    Returns a list of (lang,line) pairs, one per line of
    language-identification/{lang}_{part}.txt, in file order. Each line has
    its leading/trailing whitespace stripped; blank lines are kept as "".

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened and
    UnicodeDecodeError if it is not valid UTF-8.
    """
    filename="language-identification/{}_{}.txt".format(lang,part)
    #explicit encoding: the corpus contains non-ASCII characters, so decoding
    #must not depend on the locale default of the machine running the notebook
    with open(filename,encoding="utf-8") as f:
        return [(lang,line.strip()) for line in f]


def read_data_all_langs(part,langs=("en","es","et","fi","pt"),seed=None):
    """Reads one data split for several languages and shuffles the examples.

    part:  which split to read: train, test, or devel
    langs: language codes to read; defaults to the five languages of the dataset
    seed:  optional seed for the shuffle. With the default None the global
           `random` state is used (same behaviour as before); with a value the
           shuffle is reproducible and the global state is left untouched.

    Returns (labels,lines): two parallel lists where labels[i] is the language
    code of lines[i].
    """
    data=[]
    for lang in langs:
        data.extend(read_data_one_lang(lang,part)) #just add these lines to the end
    #the examples now come grouped by language, which is a bad order to train
    #on, so they really must be scrambled
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    #separate the labels and the lines, they are needed separately later
    labels=[label for label,_ in data]
    lines=[line for _,line in data]
    return labels,lines

# Load the shuffled training and development splits
labels_train, lines_train = read_data_all_langs("train")
labels_dev, lines_dev = read_data_all_langs("devel")

# Sanity check: label and first 30 characters of the first five training examples
for label, line in zip(labels_train[:5], lines_train[:5]):
    print(" ".join((label, "   ", line[:30], "...")))
# from here on the data is just two parallel lists: labels and text lines

pt     A altura de o solo foi ampliad ...
en     And what is there to show for  ...
et     Kuulsusega käib juba paraku ka ...
et     Alles siis, kui mees ametist p ...
et     Padari mitmete konkurentide se ...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

#character 3-grams only: ngram_range=(3,3); analyzer="char_wb" builds them
#inside word boundaries, binary=True records presence instead of counts and
#max_features caps the vocabulary size at 100000
vectorizer=CountVectorizer(max_features=100000,binary=True,ngram_range=(3,3),analyzer="char_wb")
feature_matrix_train=vectorizer.fit_transform(lines_train) #vocabulary is learned from train only
feature_matrix_dev=vectorizer.transform(lines_dev) #dev reuses the train vocabulary

label_encoder=LabelEncoder() #Turns class labels into integers
class_numbers_train=label_encoder.fit_transform(labels_train)
#transform, not fit_transform: the dev labels must use the mapping learned on
#train. Re-fitting on dev could silently change the label->integer mapping
#(and classes_) whenever dev does not contain exactly the same label set.
#transform raises ValueError if dev has a label that was never seen in train.
class_numbers_dev=label_encoder.transform(labels_dev)

print("class_numbers shape=",class_numbers_train.shape)
print("class labels",label_encoder.classes_) #this will let us translate back from indices to labels



class_numbers shape= (5000,)
class labels ['en' 'es' 'et' 'fi' 'pt']


The default version of TensorFlow in Colab will switch to TensorFlow 2.x on the 27th of March, 2020.
We recommend you upgrade now or ensure your notebook will continue to use TensorFlow 1.x via the %tensorflow_version 1.x magic: more info.

https://www.tensorflow.org/guide/migrate
https://colab.research.google.com/notebooks/tensorflow_version.ipynb


In [4]:
 
import keras
from keras.models import Model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Problem dimensions: rows x columns of the train feature matrix, and number of labels
example_count, feature_count = feature_matrix_train.shape
class_count = len(label_encoder.classes_)

# Network: feature vector -> 20 tanh units -> softmax over the classes
inp = Input(shape=(feature_count,))
hidden = Dense(20, activation="tanh")(inp)
outp = Dense(class_count, activation="softmax")(hidden)
model = Model(inputs=[inp], outputs=[outp])

# Stop once validation accuracy has not improved for 5 epochs and roll back
# to the best weights seen.
# NOTE(review): the logged metric name depends on the Keras version (some
# releases report it as 'val_acc') - confirm 'val_accuracy' exists in the logs.
stop_cb = EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# Integer class numbers as targets -> sparse categorical cross-entropy
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)
hist = model.fit(
    feature_matrix_train,
    class_numbers_train,
    batch_size=100,
    verbose=1,
    epochs=25,
    validation_data=(feature_matrix_dev, class_numbers_dev),
    callbacks=[stop_cb],
)

Using TensorFlow backend.







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 5000 samples, validate on 5000 samples
Epoch 1/25





Epoch 2/25



Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


* Let's try to identify misclassified documents

In [5]:
import numpy

# One row of class probabilities per dev example; the arg-max over the last
# axis is the index of the predicted class
predictions = model.predict(feature_matrix_dev)
pred_classes = numpy.argmax(predictions, axis=-1)

# Walk predictions, gold labels and texts in parallel and report the mismatches
for pred, correct, txt_line in zip(pred_classes, labels_dev, lines_dev):
    pred_label = label_encoder.classes_[pred]  # class index -> label string
    if pred_label == correct:
        continue  # correctly classified, nothing to report
    print("Prediction:", pred_label, "Correct:", correct, "Text:", txt_line)



Prediction: en Correct: fi Text: ANNEX 16
Prediction: fi Correct: et Text: Parima uustulnuka kategoorias on Bomfunki rivaalideks Blink 182, Melanie C, Sonique ja Anastasia.
Prediction: en Correct: et Text: 1.1.2..
Prediction: en Correct: pt Text: Medidas para as PME
Prediction: pt Correct: en Text: For decades.
Prediction: en Correct: pt Text: Voando Baixo
Prediction: et Correct: fi Text: Talovahtina
Prediction: en Correct: et Text: AT.
Prediction: en Correct: es Text: Apareció en un artículo de The Alternate View:" Boomerang and the Sound of the Big Bang"( January 2001).
Prediction: pt Correct: es Text: Se opone a un sistema económico, social o político estratificado.
Prediction: en Correct: et Text: Just.
Prediction: et Correct: en Text: 2 Peels:
Prediction: en Correct: fi Text: Yritysesittely: Accenture oy
Prediction: pt Correct: en Text: christmas cake for christmas day.
Prediction: en Correct: pt Text: Com fé
Prediction: en Correct: et Text: BRISTOL MYERS 36,9
Prediction: es Corre

In [6]:
# Two hand-written inputs: keyboard mash and a plain English sentence,
# vectorized with the already-fitted vectorizer
data_in = vectorizer.transform([
    "sdfjfj fsdjfoj fsjofs fjskf fjsklf",
    "I really think this should be classified as English",
])
# Column order of the probabilities shown below
print(label_encoder.classes_)
# Last expression of the cell: one row of class probabilities per input
model.predict(data_in)

['en' 'es' 'et' 'fi' 'pt']


array([[5.2468520e-01, 4.5902524e-02, 1.6088000e-01, 1.5496165e-01,
        1.1357068e-01],
       [9.9990404e-01, 1.8294686e-05, 3.0967985e-05, 2.9542984e-05,
        1.7214745e-05]], dtype=float32)