In [None]:
# !pip install -U -q segmentation-models
# !pip install -q tensorflow==2.1
# !pip install -q keras==2.3.1
# !pip install -q tensorflow-estimator==2.1.

# ## Imports libs
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ["SM_FRAMEWORK"] = "tf.keras"

# from tensorflow import keras
# import segmentation_models as sm

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
from keras.preprocessing import sequence
import random

Using TensorFlow backend.


In [2]:
def create_letter_mapping(all_words):
    letter_li = []
    
    # Get unique letters in the data
    for word in all_words:
        for letter in word:
            letter_li.append(letter)
    
    # order letters by most common
    most_common = Counter(letter_li).most_common()
    letter_li = []
    for elem in most_common:
        letter_li.append(elem[0])
    
    # Map each letter to an integer
    letter_dict = {}
    for idx, letter in enumerate(letter_li):
        if letter_dict != '':
            letter_dict[letter] = idx + 1
        else:
            letter_dict[letter] = 0
    
    return letter_dict

def letter_to_number(word_li, letter_dict):
    return_li = []
    for letter in word_li:
        return_li.append(letter_dict[letter])
        
    return return_li

In [3]:
lan_df = pd.read_csv('language_data.csv')

In [4]:
group_dict = {}
for idx, value in enumerate(lan_df.Group.unique()):
    group_dict[value] = idx
    
lan_df['group_int'] = lan_df['Group'].apply(lambda x: group_dict[x])

In [5]:
lan_df['letter_li'] = lan_df['word'].apply(lambda x: list(x))

In [6]:
map_dict = create_letter_mapping(lan_df['word'].values)

In [7]:
lan_df['letter_ints'] = lan_df['letter_li'].apply(lambda x: letter_to_number(x, map_dict))

In [8]:
lan_df.iloc[0, :]

word                                 abs brake
origin                                     deu
full_name                               German
Group                                 Germanic
word_len                                     9
group_int                                    0
letter_li          [a, b, s,  , b, r, a, k, e]
letter_ints    [2, 16, 8, 27, 16, 7, 2, 21, 1]
Name: 0, dtype: object

In [9]:
lan_df.shape

(25203, 8)

In [10]:
word_arr = np.empty(25203, dtype=list)
origin_arr = np.empty(25203, dtype=int)

In [11]:
for i in range(lan_df.shape[0]):
    word_arr[i] = lan_df.loc[i, 'letter_ints']
    origin_arr[i] = lan_df.loc[i, 'group_int']

In [12]:
# pad array so that all words have length of 25
word_arr = sequence.pad_sequences(word_arr, 25)

In [13]:
# Develop random index for train test split
random.seed(0)
rng = np.arange(25203)
random.shuffle(rng)

In [14]:
train_len = int(25203 * 0.8)
train_idx = rng[:train_len]
test_idx = rng[train_len:]

In [15]:
X_train, y_train = word_arr[train_idx], origin_arr[train_idx]
X_test, y_test = word_arr[test_idx], origin_arr[test_idx]

In [16]:
len(map_dict)

59

In [37]:
Counter(y_train).most_common()

[(3, 16084), (0, 1942), (1, 1489), (5, 342), (2, 262), (4, 43)]

In [26]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(map_dict) + 1, 10),
    tf.keras.layers.LSTM(10),
    tf.keras.layers.Dense(1, activation="softmax")
])

In [27]:
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=['acc'])

history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Train on 16129 samples, validate on 4033 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
word_arr[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
       16,  8, 27, 16,  7,  2, 21,  1])

In [34]:
model.predict(np.array(word_arr[0]))

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]], dtype=float32)