In [None]:
# Import libraries

# core libs
import random
from collections import Counter
from itertools import islice

# numpy
import numpy as np

# Sklearn
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# keras
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Conv1D, MaxPool1D, Flatten, Dense
from keras.models import Model

In [None]:
# Report the versions of the main libraries this notebook was run with
print(
    f'keras= {keras.__version__}',
    f'sklearn= {sklearn.__version__}',
    f'numpy= {np.__version__}',
    sep='\n',
)

keras= 2.4.3
sklearn= 0.22.2.post1
numpy= 1.18.5


In [None]:
# Basic configuration
# Directory that holds the per-language wiki.<lang>.text files
data_dir = '/content/drive/My Drive/Colab Notebooks/data_dir'
# Number of leading lines (articles) to read from each language file
num_of_articles = 10000
# Fixed sequence length fed to the model; longer samples are truncated,
# shorter ones are padded
sentense_len = 150
# Shingle (substring) lengths, in characters, sampled from every line
shingles_range = (70, 100, 130)
# Number of shingles generated per line for each shingle length
shingle_per_line = 10
# Out-of-vocabulary token
oov_str = 'oov'
# Seed for Python's `random` module so that the randomly sampled shingles
# are the same on every "Restart & Run All".
# NOTE(review): this does not seed the Keras/TensorFlow weight initialisation.
random_seed = 42
random.seed(random_seed)

In [None]:
# Mount Google Drive at /content/drive (Colab only); `data_dir` points to a
# folder below this mount point, so this must run before the data is loaded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Mapping from language code to the language's full name
lang_code_dict = dict(
    en='english',
    de='german',
    fr='french',
    it='italian',
    es='spanish',
)

In [None]:
# Mapping from language code to the path of that language's data file
data_info = dict([
    ('en', data_dir + '/wiki.en.text'),
    ('de', data_dir + '/wiki.de.text'),
    ('fr', data_dir + '/wiki.fr.text'),
    ('it', data_dir + '/wiki.it.text'),
    ('es', data_dir + '/wiki.es.text'),
])

In [None]:
# Show, for every configured language, its code, full name and data file
for code in data_info:
    print(code, lang_code_dict[code], data_info[code])

en english /content/drive/My Drive/Colab Notebooks/data_dir/wiki.en.text
de german /content/drive/My Drive/Colab Notebooks/data_dir/wiki.de.text
fr french /content/drive/My Drive/Colab Notebooks/data_dir/wiki.fr.text
it italian /content/drive/My Drive/Colab Notebooks/data_dir/wiki.it.text
es spanish /content/drive/My Drive/Colab Notebooks/data_dir/wiki.es.text


In [None]:
# data loading: read the first `num_of_articles` lines of every language file
data_dict = {}
for lang_code, file_path in data_info.items():
    # NOTE(review): files are decoded as ISO-8859-1 - confirm this matches
    # the actual encoding of the dumps.
    with open(file_path, encoding='ISO-8859-1') as file:
        # islice stops reading after `num_of_articles` lines, so the rest of
        # the file is never loaded into memory.
        # Each line is lower-cased and stripped of surrounding whitespace.
        lines = [raw.lower().strip() for raw in islice(file, num_of_articles)]
    data_dict[lang_code] = lines
    print(lang_code, len(lines))

en 1
de 7
fr 1
it 16
es 24


In [None]:
def generate_shingles(line, length, total):
    """
    Sample random fixed-length substrings ("shingles") from one line.

    When the line is longer than `length`, `total` substrings of exactly
    `length` characters are returned, each starting at a random offset
    (duplicates are possible). Otherwise the whole line is returned as the
    single element of the list.
    """
    # TODO: de-duplicate the sampled shingles (e.g. with a set) to remove
    # redundancy
    last_start = len(line) - length
    if last_start <= 0:
        # the line is not longer than one shingle: keep it unchanged
        return [line]
    starts = (random.randint(0, last_start) for _ in range(total))
    return [line[begin:begin + length] for begin in starts]

In [None]:
def generate_shingles_lines(line, length, total):
    """
    Generate shingles from a list of lines.

    Parameters:
        line: iterable of text lines (the parameter name is singular for
            backward compatibility, but a collection of lines is expected).
        length: shingle length, forwarded to `generate_shingles`.
        total: number of shingles requested per line, forwarded to
            `generate_shingles`.

    Returns:
        A flat list with the shingles of all lines, in input order.
    """
    shingle_list = []
    # Iterate over the argument itself rather than a same-named notebook
    # global, so the result depends only on what the caller passes in.
    for text in line:
        shingle_list.extend(generate_shingles(line=text, length=length, total=total))
    return shingle_list

In [None]:
# For every language, pool the shingles of all configured shingle lengths
shingle_data_dict = {}
for lang, lines in data_dict.items():
    shingle_data_dict[lang] = [
        shingle
        for s_range in shingles_range
        for shingle in generate_shingles_lines(lines, s_range, shingle_per_line)
    ]
    print(lang, len(shingle_data_dict[lang]))

en 30
de 210
fr 30
it 480
es 720


In [None]:
# Flatten the per-language shingles into one list of samples and build the
# parallel list of language-code labels (one label per sample)
data_lines = [
    sample
    for samples in shingle_data_dict.values()
    for sample in samples
]
labels = [
    lang_key
    for lang_key, samples in shingle_data_dict.items()
    for _ in samples
]
print(len(data_lines), len(labels))

1470 1470


In [None]:
# split every sample into its list of characters
data_char_ls = [list(line) for line in data_lines]

# For every character, count the number of samples it occurs in.
# set() collapses repeated characters inside one sample, so this is a
# per-sample frequency, not a total occurrence count.
char_counts = Counter(ch for chars in data_char_ls for ch in set(chars))

# upper bound on the number of distinct characters kept in the vocabulary
max_vocab_chars = 76

# create vocabulary: most frequent characters first, OOV token last
char_vocab = [ch for ch, _ in char_counts.most_common(max_vocab_chars)] + [oov_str]
print(char_vocab)

[' ', 'e', 'n', 'a', 'r', 'i', 's', 'l', 'd', 't', 'o', 'c', 'u', 'm', 'p', 'g', 'b', 'v', 'h', 'f', 'z', 'q', 'í', 'j', 'ó', 'y', 'é', 'á', 'w', 'k', 'ñ', 'x', 'ò', 'ú', 'ü', 'à', 'ä', 'ö', 'è', 'ù', 'ß', 'ì', 'â', 'î', '²', 'ç', '³', 'ê', 'ô', 'oov']


In [None]:
# Map every vocabulary entry to an integer id.
# Ids start at 1 because index 0 is reserved for padding.
ch2int = {char: position for position, char in enumerate(char_vocab, start=1)}
print(ch2int)
print()
# Inverse mapping: integer id back to vocabulary entry
int2ch = dict(zip(ch2int.values(), ch2int.keys()))
print(int2ch)

{' ': 1, 'e': 2, 'n': 3, 'a': 4, 'r': 5, 'i': 6, 's': 7, 'l': 8, 'd': 9, 't': 10, 'o': 11, 'c': 12, 'u': 13, 'm': 14, 'p': 15, 'g': 16, 'b': 17, 'v': 18, 'h': 19, 'f': 20, 'z': 21, 'q': 22, 'í': 23, 'j': 24, 'ó': 25, 'y': 26, 'é': 27, 'á': 28, 'w': 29, 'k': 30, 'ñ': 31, 'x': 32, 'ò': 33, 'ú': 34, 'ü': 35, 'à': 36, 'ä': 37, 'ö': 38, 'è': 39, 'ù': 40, 'ß': 41, 'ì': 42, 'â': 43, 'î': 44, '²': 45, 'ç': 46, '³': 47, 'ê': 48, 'ô': 49, 'oov': 50}

{1: ' ', 2: 'e', 3: 'n', 4: 'a', 5: 'r', 6: 'i', 7: 's', 8: 'l', 9: 'd', 10: 't', 11: 'o', 12: 'c', 13: 'u', 14: 'm', 15: 'p', 16: 'g', 17: 'b', 18: 'v', 19: 'h', 20: 'f', 21: 'z', 22: 'q', 23: 'í', 24: 'j', 25: 'ó', 26: 'y', 27: 'é', 28: 'á', 29: 'w', 30: 'k', 31: 'ñ', 32: 'x', 33: 'ò', 34: 'ú', 35: 'ü', 36: 'à', 37: 'ä', 38: 'ö', 39: 'è', 40: 'ù', 41: 'ß', 42: 'ì', 43: 'â', 44: 'î', 45: '²', 46: 'ç', 47: '³', 48: 'ê', 49: 'ô', 50: 'oov'}


In [None]:
def encode(in_ls, key):
    """
    Translate a sequence of characters into their indices.

    `key` is the character-to-index mapping. A character that has no entry
    in `key` is replaced by the index stored under the module-level
    out-of-vocabulary token `oov_str`.
    """
    def lookup(ch):
        found = key.get(ch)
        # fall back to the out-of-vocabulary index for unknown characters
        return key.get(oov_str) if found is None else found

    return [lookup(ch) for ch in in_ls]

In [None]:
# Encode every sample as a list of vocabulary indices
encoded_ls = [encode(sample_text, ch2int) for sample_text in data_lines]
print(len(encoded_ls))

1470


In [None]:
# Number of encoded samples that contain at least one out-of-vocabulary index
counts = sum(1 for enc in encoded_ls if ch2int[oov_str] in enc)
counts

0

In [None]:
# Pad or truncate every encoded sequence at its end so that all rows have
# exactly `sentense_len` entries
X = pad_sequences(
    encoded_ls,
    maxlen=sentense_len,
    padding='post',
    truncating='post',
)

In [None]:
# Encode the language-code labels as integer class ids
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)
print(label_encoder.classes_)

['de' 'en' 'es' 'fr' 'it']


In [None]:
# one-hot encode the integer class ids produced by the label encoder
y = to_categorical(encoded_labels)

In [None]:
# sanity check: X and y must have the same number of rows
print(X.shape, y.shape)

(1470, 150) (1470, 5)


In [None]:
# Train & test split (70:30) of the full data.
# The split is stratified on the integer class ids so that every language
# keeps its share of samples in both parts despite the class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=encoded_labels
)

In [None]:
# sanity check: shapes of the train/test features and targets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1029, 150) (441, 150) (1029, 5) (441, 5)


In [None]:
# Build the neural network:
# character embedding -> two Conv1D/MaxPool stages -> dense softmax classifier

# one output unit per class, taken from the width of the one-hot targets so
# the model follows the number of configured languages
num_classes = y.shape[1]

inp = Input(shape=(sentense_len, ))
# +1 on the vocabulary size because index 0 is reserved for padding
x = Embedding(input_dim=len(char_vocab) + 1, output_dim=64)(inp)
x = Conv1D(64, 5, activation='relu')(x)
x = MaxPool1D(5)(x)
x = Conv1D(64, 5, activation='relu')(x)
x = MaxPool1D(20)(x)
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
x = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 150, 64)           3264      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 146, 64)           20544     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 29, 64)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 25, 64)            20544     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 1, 64)             0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 64)               

In [None]:
# Fit the model; the test split is also passed as validation data
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd592d4bf98>

In [None]:
# run the trained model on the test split; `pred` holds the softmax output
# (one row of class scores per test sample)
pred = model.predict(X_test)

In [None]:
# Collapse the predicted scores and the one-hot targets to class indices
pred_y = np.argmax(pred, axis=1).ravel()
actual_y = np.argmax(y_test, axis=1).ravel()

In [None]:
# Per-language precision / recall / F1 on the test split
report = classification_report(
    actual_y,
    pred_y,
    target_names=label_encoder.classes_,
)
print(report)

              precision    recall  f1-score   support

          de       0.00      0.00      0.00        61
          en       0.00      0.00      0.00         4
          es       0.86      0.65      0.74       217
          fr       0.00      0.00      0.00         9
          it       0.52      0.95      0.67       150

    accuracy                           0.65       441
   macro avg       0.28      0.32      0.28       441
weighted avg       0.60      0.65      0.59       441



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def predict(line):
    """
    Predict the language of a single piece of text.

    The text is lower-cased, encoded character by character with `ch2int`,
    padded/truncated to `sentense_len` and passed through the model.

    Returns a tuple (language_code, score), where score is the model output
    for the highest-scoring class.
    """
    indices = encode(list(line.lower()), ch2int)
    batch = pad_sequences(
        [indices],
        maxlen=sentense_len,
        truncating='post',
        padding='post',
    )
    # the batch holds a single sample, so only the first row is of interest
    class_scores = model.predict(batch)[0]
    best = class_scores.argmax()
    return label_encoder.classes_[best], class_scores[best]

In [None]:
# sample prediction: prints the (language code, score) tuple returned by predict()
print(predict('this is my sample text'))

('es', 0.4009181)


In [None]:
# Real-world sample sentences taken from Google News.
# Each entry is a tuple of (expected language code, sentence).
test_data = [
    ('en', 'Today rural India and its villages have declared themselves'),
    ('de', 'Es ist einer dieser Momente, bei denen man dabei gewesen sein will'),
    ('fr', 'Mais rien ne permet pour l’instant de confirmer ces propos.'),
    ('it', 'Il peso della compartecipazione dei cittadini (il ticket appunto) sarà cacolato'),
    ('es', 'Después de la evaluación y las pruebas médicas, se descubrió que tenía un')
]

In [None]:
# Classify every real-world sample and show the prediction next to the
# expected language code
for actual_lang, data in test_data:
    print('-----------------', f'Data:{data}', sep='\n')
    print(f'Predicted:{predict(data)}, Actual:{actual_lang}')

-----------------
Data:Today rural India and its villages have declared themselves
Predicted:('it', 0.4233026), Actual:en
-----------------
Data:Es ist einer dieser Momente, bei denen man dabei gewesen sein will
Predicted:('it', 0.42256892), Actual:de
-----------------
Data:Mais rien ne permet pour l’instant de confirmer ces propos.
Predicted:('it', 0.42087787), Actual:fr
-----------------
Data:Il peso della compartecipazione dei cittadini (il ticket appunto) sarà cacolato
Predicted:('it', 0.43131608), Actual:it
-----------------
Data:Después de la evaluación y las pruebas médicas, se descubrió que tenía un
Predicted:('es', 0.42752376), Actual:es
