In [454]:
# !pip install -U -q segmentation-models
# !pip install -q tensorflow==2.1
# !pip install -q keras==2.3.1
# !pip install -q tensorflow-estimator==2.1.

# ## Imports libs
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ["SM_FRAMEWORK"] = "tf.keras"

# from tensorflow import keras
# import segmentation_models as sm

In [455]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
from keras.preprocessing import sequence
import random

In [456]:
def create_letter_mapping(all_words):
    """Build a character -> integer id lookup from a collection of words.

    Characters are ranked by how often they occur across ``all_words``; the
    most frequent character receives id 1, the next id 2, and so on. Ties keep
    the order in which the characters were first seen. Id 0 is never assigned,
    which leaves it free to be used as a padding value.

    Parameters
    ----------
    all_words : iterable of str
        The words to scan. Every character (including spaces) is counted.

    Returns
    -------
    dict
        Maps each distinct character to a positive integer id.
    """
    # Count every character occurrence across all words.
    letter_counts = Counter()
    for word in all_words:
        letter_counts.update(word)

    # Rank by descending frequency; ids start at 1 so that 0 stays unused.
    return {
        letter: rank
        for rank, (letter, _count) in enumerate(letter_counts.most_common(), start=1)
    }

def letter_to_number(word_li, letter_dict):
    """Translate a sequence of characters into their integer ids.

    Parameters
    ----------
    word_li : iterable
        Characters of one word, in order.
    letter_dict : dict
        Lookup from character to integer id.

    Returns
    -------
    list
        The id of every character, in the same order as ``word_li``.

    Raises
    ------
    KeyError
        If a character is not a key of ``letter_dict``.
    """
    return [letter_dict[symbol] for symbol in word_li]

def pad_list(letter_li, pad_num=25):
    """Left-pad a list of ids with zeros up to ``pad_num`` entries.

    Parameters
    ----------
    letter_li : sequence
        The ids of one word.
    pad_num : int, optional
        Target length of the result (default 25).

    Returns
    -------
    list
        A new list: the zeros come first, followed by the items of
        ``letter_li``. Input that already has ``pad_num`` or more items is
        returned as an unpadded copy; it is never truncated.
    """
    # Number of zeros still needed; never negative for over-length input.
    missing = max(pad_num - len(letter_li), 0)
    return [0] * missing + list(letter_li)

In [457]:
# Load the word list; later cells read its 'word', 'full_name' and 'Group' columns.
lan_df = pd.read_csv('language_data.csv')

In [458]:
# Class balance: number of rows per language group, largest group first.
Counter(lan_df['Group'].values).most_common()

[('Latin', 20118),
 ('Germanic', 2177),
 ('Other', 2167),
 ('Japanese', 411),
 ('Arabic', 330)]

In [459]:
# Move the Sanskrit rows into their own 'Indo-Aryan' group.
lan_df.loc[lan_df['full_name'] == 'Sanskrit', 'Group'] = 'Indo-Aryan'

In [460]:
# Move the Hindi rows into the 'Indo-Aryan' group as well.
lan_df.loc[lan_df['full_name'] == 'Hindi', 'Group'] = 'Indo-Aryan'

In [461]:
# Keep only the rows whose group is not 'Indo-Aryan'.
lan_df = lan_df.loc[lan_df['Group'] != 'Indo-Aryan']

In [462]:
#Counter(lan_df.loc[lan_df['Group'] == 'Other', 'full_name'].values).most_common()

In [463]:
# Exclude the 'Other' group from the data.
lan_df = lan_df.loc[lan_df['Group'] != 'Other']

In [464]:
# Renumber the rows 0..n-1 after the filters above, discarding the old index.
# The frame is rebound rather than modified with inplace=True.
lan_df = lan_df.reset_index(drop=True)

In [465]:
# Give every language group an integer class id, in order of first appearance.
group_dict = {
    group: class_id
    for class_id, group in enumerate(lan_df['Group'].unique())
}

# Integer label column derived from the group name.
lan_df['group_int'] = lan_df['Group'].apply(lambda name: group_dict[name])

In [466]:
# Split every word into a list of its individual characters.
lan_df['letter_li'] = lan_df['word'].apply(list)

In [467]:
# Character -> integer id lookup built from every word in the data set.
map_dict = create_letter_mapping(lan_df['word'].values)

In [468]:
# Encode each word as integer ids and left-pad the result with zeros.
lan_df['letter_ints'] = lan_df['letter_li'].apply(
    lambda letters: pad_list(letter_to_number(letters, map_dict))
)

In [469]:
# Inspect the first row, including the derived columns.
lan_df.iloc[0]

word                                                   abs brake
origin                                                       deu
full_name                                                 German
Group                                                   Germanic
word_len                                                       9
group_int                                                      0
letter_li                            [a, b, s,  , b, r, a, k, e]
letter_ints    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object

In [470]:
# Number of rows in the filtered data set.
SHAPE = len(lan_df)

In [471]:
# Pre-allocate one slot per row: an object array for the encoded words
# and an integer array for the group labels.
word_arr = np.empty(shape=SHAPE, dtype=object)
origin_arr = np.empty(shape=SHAPE, dtype=int)

In [472]:
# Extract the encoded words and the integer labels as NumPy arrays, in row order.
word_arr = lan_df['letter_ints'].values              # object array, one id list per word
origin_arr = lan_df['group_int'].values.astype(int)  # integer class label per word

In [473]:
# Turn the per-word id lists into a single 2-D integer array with 25 columns.
# Shorter sequences are zero-padded and longer ones cut to 25 entries
# (pad_sequences does both at the front by default).
word_arr = sequence.pad_sequences(word_arr, maxlen=25)

In [474]:
# Develop random index for train test split
# Seed Python's RNG so the shuffled order is the same on every run.
random.seed(0)
# One integer position per row (0 .. SHAPE-1).
rng = np.arange(SHAPE)
# Shuffle the positions in place.
random.shuffle(rng)

In [475]:
# 80/20 split of the shuffled row positions.
train_len = int(SHAPE * 0.8)
train_idx, test_idx = rng[:train_len], rng[train_len:]

In [476]:
# Select the encoded words (X) and group labels (y) for each partition.
X_train = word_arr[train_idx]
y_train = origin_arr[train_idx]
X_test = word_arr[test_idx]
y_test = origin_arr[test_idx]

In [477]:
# Number of distinct characters that received an integer id.
len(map_dict)

53

In [478]:
# Number of language groups, i.e. target classes.
len(group_dict)

4

In [479]:
# Character-level classifier: embedding -> LSTM -> softmax over the language groups.
model = keras.Sequential(
    [
        # +1 because character ids start at 1; id 0 is the padding value.
        layers.Embedding(input_dim=len(map_dict) + 1, output_dim=32),
        layers.LSTM(units=32),
        layers.Dense(units=len(group_dict), activation="softmax"),
    ]
)

In [480]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=['acc'],
)

# Train for 10 epochs, holding out 20% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.2,
)

Train on 14742 samples, validate on 3686 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [481]:
# Inspect the encoded word matrix.
word_arr

array([[ 0,  0,  0, ...,  3, 21,  1],
       [ 0,  0,  0, ...,  2, 19, 14],
       [ 0,  0,  0, ..., 18,  1,  4],
       ...,
       [ 0,  0,  0, ...,  5,  2, 10],
       [ 0,  0,  0, ...,  7,  6,  9],
       [ 0,  0,  0, ..., 18, 11, 12]])

In [482]:
# Preview the first rows, including the derived columns.
lan_df.head()

Unnamed: 0,word,origin,full_name,Group,word_len,group_int,letter_li,letter_ints
0,abs brake,deu,German,Germanic,9,0,"[a, b, s, , b, r, a, k, e]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,aivd,nld,Dutch;Flemish,Germanic,4,0,"[a, i, v, d]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,aachen,deu,German,Germanic,6,0,"[a, a, c, h, e, n]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,aarhus,dan,Danish,Germanic,6,0,"[a, a, r, h, u, s]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,ababil,ara,Arabic,Arabic,6,1,"[a, b, a, b, i, l]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [483]:
# Placeholder value; no visible cell reads this global
# (get_word_prediction takes its own `test_word` parameter).
test_word = ''

In [484]:
def get_word_prediction(test_word):
    """Return the model's probability for each language group, in percent.

    The word is encoded with the notebook-level ``map_dict``, left-padded by
    ``pad_list`` and passed to the notebook-level ``model`` as a batch of one.

    Parameters
    ----------
    test_word : str
        The word to classify.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``group_dict`` (in its key order) with the
        columns 'group' and 'probability'; probabilities are scaled by 100.

    Raises
    ------
    KeyError
        If ``test_word`` contains a character that is not in ``map_dict``.
    """
    encoded = pad_list(letter_to_number(list(test_word), map_dict))
    batch = np.array([encoded])  # the model expects a 2-D batch
    probabilities = model.predict(batch)[0]
    return pd.DataFrame(
        data={
            'group': list(group_dict.keys()),
            'probability': probabilities * 100,
        }
    )

In [556]:
# Example query: per-group probabilities for a single word.
get_word_prediction('fakaworada')

Unnamed: 0,group,probability
0,Germanic,18.12472
1,Arabic,15.594498
2,Latin,11.296209
3,Japanese,54.984581


In [524]:
# Browse the rows whose word begins with "al".
lan_df.loc[lan_df['word'].str.startswith('al')]

Unnamed: 0,word,origin,full_name,Group,word_len,group_int,letter_li,letter_ints
47,al jazeera,ara,Arabic,Arabic,10,1,"[a, l, , j, a, z, e, e, r, a]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
48,aladdin,ara,Arabic,Arabic,7,1,"[a, l, a, d, d, i, n]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
49,alamo,spa,Spanish; Castilian,Latin,5,2,"[a, l, a, m, o]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
50,alban,lat,Latin,Latin,5,2,"[a, l, b, a, n]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
51,albina,lat,Latin,Latin,6,2,"[a, l, b, i, n, a]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
2002,alumna,lat,Latin,Latin,6,2,"[a, l, u, m, n, a]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2003,alumni,lat,Latin,Latin,6,2,"[a, l, u, m, n, i]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2004,alutaceous,lat,Latin,Latin,10,2,"[a, l, u, t, a, c, e, o, u, s]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2005,alveolus,lat,Latin,Latin,8,2,"[a, l, v, e, o, l, u, s]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
