In [1]:
# It makes sense to give the data-reading code a bit of structure
import random

def read_data_one_lang(lang,part,encoding="utf-8"):
    """Read one data file for one language.

    The file is looked up at ``language-identification/{lang}_{part}.txt``
    relative to the current working directory.

    Args:
        lang: Language code; used in the file name and as the label of every
            returned pair (e.g. "fi").
        part: Name of the data split; used in the file name (e.g. "train").
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the non-ASCII text is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list of ``(lang, line)`` pairs, one per line of the file, with
        leading and trailing whitespace stripped from each line. Empty lines
        are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    filename="language-identification/{}_{}.txt".format(lang,part)
    with open(filename,encoding=encoding) as f:
        # One (label, text) pair per line; strip() also removes the newline.
        return [(lang,line.strip()) for line in f]


def read_data_all_langs(part,langs=("en","es","et","fi","pt"),seed=None):
    """Read one data split for several languages and shuffle the examples.

    Args:
        part: Name of the data split: "train", "test" or "devel".
        langs: Iterable of language codes to read. Defaults to the five
            languages originally hard-coded here.
        seed: Optional seed for the shuffle. With the default ``None`` the
            module-level ``random`` generator is used, so the order differs
            between runs unless it was seeded elsewhere. Pass a value to get
            a reproducible order.

    Returns:
        A pair ``(labels, lines)`` of equally long lists in which
        ``labels[i]`` is the language code of ``lines[i]``.
    """
    data=[]
    for lang in langs:
        data.extend(read_data_one_lang(lang,part))
    # The pairs arrive grouped by language, so they must be scrambled
    # before they are used for training.
    if seed is None:
        random.shuffle(data)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(data)

    # Separate labels and lines; downstream code needs them as parallel lists.
    labels=[label for label,_ in data]
    lines=[line for _,line in data]
    return labels,lines

# Load the training and development splits as parallel (labels, lines) lists.
labels_train, lines_train = read_data_all_langs("train")
labels_dev, lines_dev = read_data_all_langs("devel")

# Sanity check: print the label and the first 30 characters of the first
# five training examples.
for label, line in list(zip(labels_train, lines_train))[:5]:
    print(label, "   ", line[:30], "...")
# From this point on, exactly the same code as before is applicable.

fi     EU:n talousjärjestelmä joutui  ...
es     La misma fue latinizada a Reti ...
en     bd ...
pt     Os« contratos de gaveta», vist ...
en     But one should not go here exp ...


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Binary (present/absent) features over character trigrams: ngram_range=(3,3)
# extracts only 3-grams, and "char_wb" builds them within word boundaries.
vectorizer=CountVectorizer(max_features=100000,binary=True,ngram_range=(3,3),analyzer="char_wb")
feature_matrix_train=vectorizer.fit_transform(lines_train) #vocabulary is learned from the training lines
feature_matrix_dev=vectorizer.transform(lines_dev) #dev reuses that vocabulary

label_encoder=LabelEncoder() #Turns class labels into integers
class_numbers_train=label_encoder.fit_transform(labels_train)
# Only transform the dev labels: refitting here would replace the mapping
# learned on the training labels (and label_encoder.classes_ with it), which
# silently shifts class numbers if dev does not contain the same label set.
class_numbers_dev=label_encoder.transform(labels_dev)

print("class_numbers shape=",class_numbers_train.shape)
print("class labels",label_encoder.classes_) #this will let us translate back from indices to labels



class_numbers shape= (5000,)
class labels ['en' 'es' 'et' 'fi' 'pt']


In [11]:
import keras
from keras.models import Model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping

# The input width is the number of feature columns; the output width is the
# number of distinct class labels.
example_count,feature_count=feature_matrix_train.shape
class_count=len(label_encoder.classes_)

# Feed-forward network: features -> 20 tanh units -> softmax over the classes.
inp=Input(shape=(feature_count,))
hidden=Dense(20,activation="tanh")(inp)
outp=Dense(class_count,activation="softmax")(hidden)
model=Model(inputs=[inp], outputs=[outp])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

# Stop once validation accuracy has not improved for 5 epochs and roll the
# model back to the weights of the best epoch.
stop_cb=EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

hist=model.fit(
    feature_matrix_train,
    class_numbers_train,
    batch_size=100,
    verbose=1,
    epochs=25,
    validation_data=(feature_matrix_dev,class_numbers_dev),
    callbacks=[stop_cb],
)

Train on 5000 samples, validate on 5000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Restoring model weights from the end of the best epoch
Epoch 00010: early stopping


* Let's try to identify misclassified documents

In [12]:
import numpy

# One row of class probabilities per dev example; the column with the highest
# probability is taken as the predicted class index.
predictions=model.predict(feature_matrix_dev)
pred_classes=predictions.argmax(axis=-1)
for pred,correct,txt_line in zip(pred_classes,labels_dev,lines_dev):
    pred_label=label_encoder.classes_[pred] #class index -> label string
    if pred_label==correct:
        continue #correctly classified, nothing to report
    print("Prediction:",pred_label,"Correct:",correct,"Text:",txt_line)



Prediction: en Correct: et Text: Loe ka:, ML, 14. november
Prediction: et Correct: en Text: junkie lube?!
Prediction: en Correct: pt Text: Nervosas!
Prediction: es Correct: pt Text: Lambari d' Oeste( 5.000 habitantes) virou município há dois anos.
Prediction: en Correct: fi Text: I know someone you don’t know... zzzztsts...
Prediction: en Correct: pt Text: Em um único ano, 1937, eles compuseram« They Can't Take That Away From Me»,« Let's Call the Whole Thing Off»,« A Foggy Day»,« Nice Work if You Can Get it»,« They All Laughed»,« Love Walked In»' e« Love Is Here to Stay»', e essas são apenas as que ficaram universalmente conhecidas.
Prediction: et Correct: fi Text: Se oli kamalaa.
Prediction: et Correct: fi Text: Valinta miljoonien lauseiden joukosta on minun tekoni.
Prediction: et Correct: en Text: 2 Peels:
Prediction: et Correct: fi Text: Tulet sä ja?
Prediction: en Correct: et Text: Üle tuhande meedialogo
Prediction: fi Correct: et Text: Vala viina!
Prediction: pt Correct: es Text: 

In [16]:
# Probe the trained model with one gibberish string and one English sentence.
data_in=vectorizer.transform([
    "sdfjfj fsdjfoj fsjofs fjskf fjsklf",
    "I really think this should be classified as English",
])
print(label_encoder.classes_) #column order of the probabilities shown below
model.predict(data_in)

['en' 'es' 'et' 'fi' 'pt']


array([[2.9287121e-01, 9.9074520e-02, 2.2043586e-01, 2.0575805e-01,
        1.8186036e-01],
       [9.8750848e-01, 5.1748008e-04, 2.0715129e-03, 2.0498259e-03,
        7.8526288e-03]], dtype=float32)