In [1]:
# !pip install -U -q segmentation-models
# !pip install -q tensorflow==2.1
# !pip install -q keras==2.3.1
# !pip install -q tensorflow-estimator==2.1.

# ## Imports libs
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ["SM_FRAMEWORK"] = "tf.keras"

# from tensorflow import keras
# import segmentation_models as sm

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from collections import Counter
from keras.preprocessing import sequence
import random

Using TensorFlow backend.


In [3]:
def create_letter_mapping(all_words):
    """Map every character found in ``all_words`` to a positive integer id.

    Characters are ranked by how often they occur across all words: the most
    frequent character gets id 1, the next one 2, and so on. Characters with
    equal counts keep the order in which they were first seen. Id 0 is never
    assigned, which leaves it free for callers to use as a padding value.

    Args:
        all_words: Iterable of strings.

    Returns:
        dict mapping each distinct character to its 1-based frequency rank.
        An empty input yields an empty dict.
    """
    # Count characters word by word; Counter.update iterates over each string.
    letter_counts = Counter()
    for word in all_words:
        letter_counts.update(word)

    # The earlier version branched on `letter_dict != ''`, which compares the
    # dict itself to a string and is therefore always true, so every id was
    # already idx + 1. Iterating a string never yields '', so no special case
    # for an empty character is needed.
    return {
        letter: rank
        for rank, (letter, _count) in enumerate(letter_counts.most_common(), start=1)
    }

def letter_to_number(word_li, letter_dict):
    """Translate a sequence of letters into their integer ids.

    Every element of ``word_li`` is looked up in ``letter_dict``; a letter
    that is not a key of the dict raises ``KeyError``.

    Returns:
        A new list with one id per input letter, in the same order.
    """
    return [letter_dict[symbol] for symbol in word_li]

def pad_list(letter_li, pad_num=25):
    """Left-pad a list with zeros until it has ``pad_num`` elements.

    Args:
        letter_li: List of integer ids to pad.
        pad_num: Target length. Previously this argument was ignored and the
            length was hard-coded to 25; the default keeps that behaviour.

    Returns:
        A new list of zeros followed by the items of ``letter_li``. Inputs
        that already have ``pad_num`` or more elements are returned unchanged
        (as a copy); nothing is truncated.
    """
    missing = pad_num - len(letter_li)
    # A non-positive count makes `[0] * missing` empty, so long inputs pass through.
    return [0] * missing + list(letter_li)



In [115]:
# Load the first language dataset.
# NOTE(review): `lan_df` is rebound with `lan_df = test` a few cells further
# down, and the visible cells in between never use this frame, so this read
# looks redundant — confirm before removing.
lan_df = pd.read_csv('language_data.csv')

In [116]:
# Load the dataset that the rest of the notebook works on. The cells below
# read its 'word', 'full_name' and 'Group' columns.
test = pd.read_csv('language_data2.csv')

In [117]:
# Replace every missing value, in every column, with the string 'Other'.
test = test.fillna('Other')

In [118]:
# Put every row whose language name contains one of these markers into the
# 'Germanic' group. Matching is by substring, so 'Germ' covers any name that
# contains it.
for germanic_marker in ('English', 'Dutch', 'Norse', 'Germ'):
    test.loc[test['full_name'].str.contains(germanic_marker), 'Group'] = 'Germanic'

In [119]:
# Rows whose language name mentions French are grouped as 'Latin'.
french_rows = test['full_name'].str.contains('French')
test.loc[french_rows, 'Group'] = 'Latin'

In [120]:
# Rows whose language name mentions Hebrew are moved to the catch-all 'Other' group.
hebrew_rows = test['full_name'].str.contains('Hebrew')
test.loc[hebrew_rows, 'Group'] = 'Other'

In [121]:
# Rows whose language name mentions Greek get their own 'Greek' group.
greek_rows = test['full_name'].str.contains('Greek')
test.loc[greek_rows, 'Group'] = 'Greek'

In [122]:
# Counter(test.loc[test['Group'] == 'Other', 'full_name'].values).most_common()

In [123]:
# Continue under the name `lan_df`. This binds a second name to the same
# DataFrame object (no copy is made), so in-place `.loc` assignments through
# `lan_df` are visible through `test` as well until `lan_df` is rebound.
lan_df = test

In [124]:
# Sanskrit and Hindi (exact name match) form the 'Indo-Aryan' group.
indo_aryan_rows = lan_df['full_name'].isin(['Sanskrit', 'Hindi'])
lan_df.loc[indo_aryan_rows, 'Group'] = 'Indo-Aryan'

In [125]:
# Discard the 'Arabic' and 'Turkish' groups.
excluded_groups = lan_df['Group'].isin(['Arabic', 'Turkish'])
lan_df = lan_df[~excluded_groups]

In [126]:
# Discard the catch-all 'Other' group.
labelled_rows = lan_df['Group'].ne('Other')
lan_df = lan_df[labelled_rows]

In [127]:
# Renumber the rows 0..n-1 after filtering and discard the old index; the
# row-by-row fill further down looks rows up by these integer labels.
lan_df = lan_df.reset_index(drop=True)

In [128]:
# Give each group name an integer label, numbered in order of first
# appearance in the 'Group' column.
group_dict = {
    group_name: label
    for label, group_name in enumerate(lan_df.Group.unique())
}

# Integer class label for every row.
lan_df['group_int'] = lan_df['Group'].map(group_dict)

In [130]:
# Display the group name -> integer label mapping.
group_dict

{'Germanic': 0, 'Indo-Aryan': 1, 'Greek': 2, 'Latin': 3, 'Japanese': 4}

In [131]:
# Split each word into a list of its individual characters.
lan_df['letter_li'] = lan_df['word'].apply(list)

In [132]:
# Build the character -> integer id table from every word that is left.
map_dict = create_letter_mapping(all_words=lan_df['word'].values)

In [133]:
def _encode_letters(letters):
    """Convert a list of characters to integer ids and zero-pad it on the left."""
    return pad_list(letter_to_number(letters, map_dict))


lan_df['letter_ints'] = lan_df['letter_li'].apply(_encode_letters)

In [134]:
# Number of rows (words) in the prepared frame.
SHAPE = len(lan_df)

In [135]:
# Uninitialised containers, one slot per word: an object array that will hold
# the encoded letter lists, and an integer array for the class labels.
word_arr = np.empty(shape=SHAPE, dtype=object)
origin_arr = np.empty(shape=SHAPE, dtype=int)

In [136]:
# Copy the encoded words and their labels into the arrays, row by row.
# Rows are visited in frame order, which equals label order because the index
# was reset to 0..n-1 earlier.
for row_pos, (encoded_word, group_label) in enumerate(
    zip(lan_df['letter_ints'], lan_df['group_int'])
):
    word_arr[row_pos] = encoded_word
    origin_arr[row_pos] = group_label

In [137]:
# Force every encoded word to exactly 25 ids. The words were already
# left-padded by pad_list, so what this step adds is cutting longer words
# down to 25 and returning one 2-D array.
word_arr = sequence.pad_sequences(word_arr, maxlen=25)

In [138]:
# Build a reproducibly shuffled row order for the train/test split.
rng = np.arange(SHAPE)
# Only Python's `random` module is seeded here; it is also what does the shuffle.
random.seed(0)
random.shuffle(rng)

In [139]:
# First 80% of the shuffled positions go to training, the rest to testing.
train_len = int(0.8 * SHAPE)
train_idx, test_idx = rng[:train_len], rng[train_len:]

In [140]:
# Select the encoded words (X) and integer labels (y) for each split.
X_train, X_test = word_arr[train_idx], word_arr[test_idx]
y_train, y_test = origin_arr[train_idx], origin_arr[test_idx]

In [141]:
# Number of distinct characters that received an id.
len(map_dict)

55

In [142]:
# Number of language groups (classes).
len(group_dict)

5

In [147]:
# Character-level classifier: embedding -> LSTM -> softmax over the groups.
embedding_model = tf.keras.Sequential()
# Input size is len(map_dict) + 1 because character ids start at 1 and 0 is
# the padding value.
embedding_model.add(tf.keras.layers.Embedding(len(map_dict) + 1, 32))
embedding_model.add(tf.keras.layers.LSTM(32))
# One output probability per language group.
embedding_model.add(tf.keras.layers.Dense(len(group_dict), activation="softmax"))

In [148]:
# classes = len(group_dict)
# embed_dims = 32
# stack_model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(len(map_dict) + 1, embed_dims),
#     tf.keras.layers.LSTM(64, return_sequences=True),
#     tf.keras.layers.LSTM(32, return_sequences=False),
#     tf.keras.layers.Dense(classes, activation="softmax")
# ])

In [149]:
# Labels are plain integers, hence the sparse categorical cross-entropy loss.
embedding_model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=['acc'],
)

# A fraction of 0.2 of the training data is held out for validation.
# NOTE(review): the saved output under this cell shows 8 epochs, so it appears
# to come from an earlier run of this cell — re-run to refresh it.
history = embedding_model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_split=0.2,
)

Train on 35983 samples, validate on 8996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [151]:
def get_word_prediction(test_word, model):
    """Return the model's per-group probabilities (in percent) for one word.

    The word is split into characters, encoded with the module-level
    ``map_dict`` (a character missing from it raises ``KeyError``), left-padded
    with zeros and passed to ``model.predict`` as a batch of one.

    Args:
        test_word: The word to classify.
        model: Object whose ``predict`` returns one row of scores per input.

    Returns:
        DataFrame with a 'group' column (keys of the module-level
        ``group_dict`` in iteration order) and a 'probability' column holding
        the first prediction row multiplied by 100.
    """
    encoded_word = pad_list(letter_to_number(list(test_word), map_dict))
    batch_scores = model.predict(np.array([encoded_word]))
    group_names = list(group_dict.keys())
    return pd.DataFrame(data={'group': group_names, 'probability': batch_scores[0] * 100})

In [168]:
# test[test['Group'] == 'Greek'].tail(5)

Unnamed: 0.1,Unnamed: 0,word,origin,full_name,Group
58644,98176,zoophytology,grc,"Greek, Ancient (to 1453)",Greek
58648,98195,zootherapy,grc,"Greek, Ancient (to 1453)",Greek
58649,98196,zootic,grc,"Greek, Ancient (to 1453)",Greek
58658,98223,zoöphagy,grc,"Greek, Ancient (to 1453)",Greek
58659,98225,zoöphyte,grc,"Greek, Ancient (to 1453)",Greek
58670,98242,zygoma,grc,"Greek, Ancient (to 1453)",Greek
58672,98246,zygote,grc,"Greek, Ancient (to 1453)",Greek
58673,98247,zygotic,grc,"Greek, Ancient (to 1453)",Greek
58674,98248,zyme,grc,"Greek, Ancient (to 1453)",Greek
58683,98257,zymurgy,grc,"Greek, Ancient (to 1453)",Greek


In [183]:
# test[test['Group'] == 'Indo-Aryan'].tail(5)

In [182]:
# Example: group probabilities for the word 'smith'.
get_word_prediction(test_word='smith', model=embedding_model)

Unnamed: 0,group,probability
0,Germanic,95.833519
1,Indo-Aryan,0.043775
2,Greek,0.124938
3,Latin,3.984998
4,Japanese,0.01277


In [52]:
# Share of the first count in the total of the four counts.
# NOTE(review): the numbers are hard-coded and their source is not visible in
# this notebook; presumably per-class row counts (largest class vs. total) —
# confirm and derive them from the data instead.
20118 / (20118 + 2177 + 411 + 382)

0.8713617463617463