## Import Libraries

In [29]:
import gensim
from gensim.models import FastText
from gensim.models import Word2Vec
import textdistance
import ast
import random
import pandas as pd
import math

## Standard Words

In [30]:
### Read the KBBI word list, collected from KBBI Kemdikbud (not every word on the site was scraped successfully)

# A context manager closes the file handle as soon as the words are read;
# the set gives fast membership tests for the `in kbbi` checks further down.
with open("data/kbbi.txt") as kbbiFile:
    kbbi = set(kbbiFile.read().splitlines())
print(len(kbbi))

72281


In [31]:
# Hand-written mapping: non-standard spelling -> standard form.
# NOTE: this literal is replaced right below by the mapping read from data/fixWords.csv.
fixWords = {
    'begimana': 'bagaimana',
    'kalo': 'kalau',
    'telpon': 'telepon',
    'kos': 'indekos',
    'cek': 'mengecek',
    'toples': 'stoples',
    'asik': 'asyik',
    'silahkan': 'silakan',
    'rame': 'ramai',
    'brewok': 'berewok',
    'pepet': 'memepet',
    'duh': 'aduh',
    'setel': 'menyetel',
    'kebut': 'mengebut',
    'umpet': 'mengumpet',
    'cabut': 'mencabut',
    'rumpi': 'merumpi',
    'detil': 'detail',
    'angus': 'hangus',
    'berantak': 'berantakan',
    'tropi': 'trofi',
    'hapal': 'hafal',
    'keprek': 'mengeprek',
    'mantol': 'mantel',
    'nasehat': 'nasihat',
    'ustad': 'ustaz',
    'teraweh': 'tarawih',
    'sukur': 'syukur',
    'emak': 'mak',
    'adzan': 'azan',
    'puteri': 'putri',
    'kecoa': 'kecoak',
    'perduli': 'peduli',
    'impi': 'mengimpikan',
    'gue': 'aku',
    'kaco': 'kacau',
    'adhan': 'azan',
}

### A few "ragam tak baku" (non-standard register) words are listed in KBBI, so fixWords is used to map them to their standard forms
fixWords = pd.read_csv('data/fixWords.csv')
# Column `ragamTB` supplies the keys and column `B` the values of the lookup table.
fixWords = {
    nonStandard: standard
    for nonStandard, standard in zip(fixWords.ragamTB, fixWords.B)
}

def fixdWord(inKbbiNotBaku):
    '''
    Swap the word of a prediction for its entry in fixWords, when it has one
    inKbbiNotBaku : prediction whose item 0 is the indicator and item 1 the word --list
    return        : [indicator, fixWords[word]] when the word is a key of fixWords,
                    otherwise the input object itself, unchanged
    '''
    try:
        indicator, word = inKbbiNotBaku[0], inKbbiNotBaku[1]
        return [indicator, fixWords[word]]
    except (KeyError, IndexError, TypeError):
        # KeyError   : the word has no entry in fixWords
        # IndexError : the input holds fewer than two items
        # TypeError  : the input is not indexable, or the word is unhashable
        # Anything else (e.g. KeyboardInterrupt, a missing fixWords table) is a
        # real problem and is deliberately allowed to propagate.
        return inKbbiNotBaku

## Functions

In [32]:
### Find up to t1 candidates that are closest to the NSW (non-standard word)

def findCandidates(word, model, t1):
    '''
    Ask the model for the t1 entries most similar to the nsw word and keep
    only those that are KBBI words longer than one character
    word    : nsw word --string
    model   : loaded word2vec or fasttext model exposing .wv.most_similar
    t1      : how many neighbours to request from the model --int
    return  : surviving neighbours, in the order the model returned them
              -- list of items whose element 0 is the word and element 1 its similarity;
              empty list when the model raises KeyError for the word
    '''
    try:
        neighbours = model.wv.most_similar(word, topn = t1)
    except KeyError:
        # The model cannot build a vector for this word (out of vocabulary):
        # there is nothing to propose. Other errors point at a broken setup
        # (e.g. a path passed instead of a model) and must stay visible.
        return []
    return [pair for pair in neighbours if pair[0] in kbbi and len(pair[0]) > 1]

In [33]:
### find closest candidates based on Jaro-Winkler and Levenshtein distance

def emb_JWdist_LevDist(listofwords, referenceWord, t3):
    '''
    Pick the candidate whose combined score against referenceWord is highest
    combined score  : 0.5 * model similarity + 0.25 * Jaro-Winkler + 0.25 * Levenshtein
                      (both string measures are textdistance normalized similarities)
    listofwords     : candidates -- list of items with the word at index 0 and
                      its model similarity at index 1
    referenceWord   : non standard word -- string
    t3              : a candidate only qualifies when its combined score is strictly above t3
    output          : [3, standard word] for the best qualifying candidate
                      (the earliest one wins a tie), otherwise [0, referenceWord]
    '''
    if listofwords == []:
        return [0, referenceWord]

    jaroWinkler = textdistance.jaro_winkler.normalized_similarity
    levenshtein = textdistance.levenshtein.normalized_similarity

    qualified = []
    for candidate in listofwords:
        text, cosine = candidate[0], candidate[1]
        score = 0.5*cosine + 0.25*jaroWinkler(text, referenceWord) + 0.25*levenshtein(text, referenceWord)
        if score > t3:
            qualified.append((text, score))

    if qualified == []:
        return [0, referenceWord]

    # max() keeps the first of several equally scored candidates
    bestText, _ = max(qualified, key=lambda entry: entry[1])
    return [3, bestText]

In [34]:

### Find the most similar word to nsw in candidates
def modelSim(nsw,t2,t3,candidates):
    '''
    Find the most similar word to nsw in candidates
    nsw        : non standard word -- string
    t2         : the first candidate is accepted outright when its model
                 similarity is strictly above t2 -- number (e.g. 0.85)
    t3         : threshold handed on to emb_JWdist_LevDist -- number (e.g. 0.55)
    candidates : list of items with the word at index 0 and its model
                 similarity at index 1; only the first item is checked against t2
    output     : [0, nsw] when there are no candidates,
                 [2, standard word] when the first candidate beats t2
                 (2 represents that the output is based on the model similarity only),
                 otherwise whatever emb_JWdist_LevDist returns -- list
    '''
    # Guard: indexing candidates[0] on an empty list would raise IndexError.
    if not candidates:
        return [0, nsw]

    topWord, topScore = candidates[0][0], candidates[0][1]
    if topScore > t2:
        return [2, topWord]
    return emb_JWdist_LevDist(candidates, nsw, t3)

    
def normalize(nsw,t2,t3,candidates):
    '''
    Decide whether the nsw should be normalized at all, and delegate if so
    nsw        : non standard word -- string
    t2         : threshold passed through to modelSim (model similarity only)
    t3         : threshold passed through to modelSim (combined string/model score)
    candidates : candidates taken from findCandidates function
    output     : [0, nsw] when nsw is already a KBBI word, is a single
                 character, or has no candidates; otherwise the result of modelSim
    '''
    # Words already in KBBI are left alone.
    if nsw in kbbi:
        return [0, nsw]
    # Single characters are never normalized.
    if len(nsw) == 1:
        return [0, nsw]
    # Nothing to choose from.
    if candidates == []:
        return [0, nsw]
    return modelSim(nsw, t2, t3, candidates)
    

In [36]:
def NormalizeWord(nsw,t1,t2,t3,model,modelType):
    '''
    Combine all the functions to get the result
    nsw       : non standard word -- string
    t1        : how many candidates to request from the model
    t2        : threshold for accepting the top candidate on model similarity alone
    t3        : threshold for the combined string/model score
    model     : either the file name of a saved model, or an already-loaded
                model (any object with a .wv attribute). Loading from disk
                happens on every call, so pass a loaded model when
                normalizing many words.
    modelType : 'ft' (FastText) or 'wtv' (Word2Vec); only consulted when
                model still has to be loaded
    result    : List of indicator number and the standar word (sw).
    raises    : ValueError when model is not loaded yet and modelType is
                neither 'ft' nor 'wtv'
    '''
    # findCandidates only needs model.wv, so anything exposing .wv is treated
    # as ready to use and is not loaded again.
    if not hasattr(model, 'wv'):
        if modelType == 'ft':
            model = FastText.load(model)
        elif modelType == 'wtv':
            model = Word2Vec.load(model)
        else:
            # Previously the unloaded path fell through and silently produced
            # "no candidates" for every word.
            raise ValueError(
                "modelType must be 'ft' or 'wtv' when model is not an already-loaded model, got %r"
                % (modelType,)
            )
    candidates = findCandidates(nsw, model, t1)
    sw_predicted = normalize(nsw,t2,t3,candidates)
    sw_predicted = fixdWord(sw_predicted)
    return sw_predicted


## Example

In [39]:

# Forward slashes are accepted on Windows as well as Linux/macOS, and match
# the other data/ paths used in this notebook.
model = "data/fastTextmodelJktnonEnglish_alpha025_window10_epoch300_size300.model"
NormalizeWord('buanyak',30,0.85,0.55,model,modelType='ft')

### Disclaimer: the model file is too large to be included here; to obtain it, contact me by email at hardianarafik@gmail.com


Wall time: 7.97 s


[3, 'banyak']