In [None]:
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../code'))
import config
from translate import Translate

# Columns kept from the combined corpus (original and "-simple" variants of each language).
languages = ["pap", "pap-simple", "nl", "nl-simple"]

# na_filter=False disables pandas' missing-value detection for this file, so
# empty cells are not converted to NaN. The three reads below use the default
# detection.
hny_pap_nl = pd.read_csv("../data/hny/pap-nl.csv", na_filter=False)
# The stparkpap file name is the configured scrape languages joined with "-".
stparkpap_pap_nl_name = "-".join(config.scrapeTarget["languages"])
stparkpap_pap_nl = pd.read_csv(f"../data/stparkpap/{stparkpap_pap_nl_name}.csv")  # optional: index_col=0 (currently disabled)
# The crse directory name comes from the project config.
crse = pd.read_csv(f"../data/{config.crse['name']}/nl-pap.csv")
tatoeba = pd.read_csv(f"../data/tatoeba/pap-nl.csv")

# Alternative (disabled): build the corpus from the hny source only.
# corpus = pd.concat([hny_pap_nl], ignore_index=True)
# Stack all four sources row-wise; ignore_index=True renumbers the rows 0..n-1.
corpus = pd.concat([hny_pap_nl, stparkpap_pap_nl, crse, tatoeba], ignore_index=True)
# Keep only the four language columns, in the order listed above.
corpus = corpus[languages]
# NOTE(review): presumably adds a "type" column derived from "pap-simple" —
# confirm against Translate.attachType in ../code/translate.py.
corpus = Translate.attachType(corpus, "pap-simple")
# Displayed as the cell output.
corpus

# TODO: separate punctuation (! , ? .) from the words.
# TODO: "pap-simple" still contains numbers.
# TODO: stparkpap entries contain slashes and similar artifacts; consider
#       inserting the alternatives as separate rows.

In [None]:
# Translator instance from the project-local translate module.
trans = Translate()

# Short sample sentences. NOTE(review): presumably Papiamento inputs for the
# translator, matching the "pap" columns — confirm against the cells that use them.
short_sentences = [
    "ami ta bai kas",
    "mi stima mi yu-homber",
    "ami ta kòre outo",
    "mi no kier bai skol",
    "mi no kier kumpra bo kacho",
    "e ta hopi grandi",
    ]
# Longer sample sentences; only the last entry is active. Every disabled
# candidate carries a trailing comma so that re-enabling one cannot trigger
# implicit concatenation with the next string literal.
long_sentences = [
    # "mi no ta guste pasobra e ta parse un kabritu",
    # "nan no tabata tin ningun idea pakiko e no a hasie",
    # "mi kier duna bo un kos aki",
    # "Mi no kier tende loke bo ta bisa",
    # "Mi no kier tende loke bo ta bisa, pasobra tur biaha bo ta gaña",
    # "Mi kier kumpra un grandi i un chiki",
    # "Mi tin hopi gana di bai landa i kome na mi mama su kas",
    # "Mi no kier kita mi karson",
    # "antiano no sa biba den sushi asina aki",
    # "Mi no gusta mi ruman pasobra e ta dal mi kada bia ku e wak mi.",
    # "Mi tin gana di kome un bon hamburger",
    # "E programa aki ta hopi malu",
    # "Pa konklui , un hende mester bisa ami si esun aki ta mas miho of mas malu.",
    # "Pa konklui mi kier purba un otro zin.",
    "Mi kier kome pan"
]
# Single stand-alone test sentence.
test_sentence = "mi yu di kriansa a pidi ami pa un pida bolo"

In [None]:
def getWordCorrections(sentence, words_corpus):
    """Collect up to three "pap-simple" candidates for each word of ``sentence``.

    The sentence is split on whitespace and every word is lower-cased. For each
    word the current frame is passed through ``Translate.attachClosest`` and the
    returned frame is carried over to the next word. A word is recorded only
    when the first row's ``"closest"`` value is greater than zero; its entry is
    the ``"pap-simple"`` values of the first three rows.

    NOTE(review): presumably ``attachClosest`` scores the rows against the word
    and orders them best-first — confirm in ../code/translate.py.

    Returns:
        dict mapping each recorded lower-cased word to a list of candidates.
        An empty or whitespace-only sentence yields ``{}``.
    """
    suggestions = {}
    ranked = words_corpus
    for token in (raw.lower() for raw in sentence.split()):
        ranked = Translate.attachClosest(ranked, token, "pap-simple")
        top_score = ranked["closest"].iloc[0]
        if not top_score > 0:
            # First row did not score above zero: record nothing for this word.
            continue
        top_rows = ranked.head(3)
        suggestions[token] = top_rows["pap-simple"].to_list()
    return suggestions

# Quick check against the rows of the hny frame whose "type" is "word".
# NOTE(review): this filters the raw CSV frame on a "type" column — confirm the
# file itself provides that column.
print(getWordCorrections("many can", hny_pap_nl[hny_pap_nl["type"]=="word"]))

In [None]:
class Node:
    """A trie node; the root node is used as the whole trie.

    Attributes:
        children: dict mapping a single character to the child ``Node``.
        lastNode: True when the path from the root to this node spells a
            complete inserted word.
        accented: class-level table mapping an unaccented letter to the
            accented variants that ``lenientFind`` may substitute for it.
    """

    # Shared by every node. Kept as a class attribute so that building a large
    # trie does not allocate a fresh dict and six lists for each node; it is
    # still reachable as ``node.accented``.
    accented = {
        "a" : ["à","á"],
        "e" : ["è","é"],
        "i" : ["ì","í"],
        "o" : ["ò","ó"],
        "u" : ["ù","ú"],
        "n" : ["ñ"],
    }

    def __init__(self, lastNode=False):
        self.children = {}
        self.lastNode = lastNode

    def __str__(self):
        """Return a recursive debug dump: child count, then each child's key,
        end-of-word flag and own dump. A node without children yields ""."""
        if not self.children:
            return ""
        ret = f"{len(self.children)}"
        for k, v in self.children.items():
            ret += f" {k}: {v.lastNode} {v}"
        return ret

    def insert(self, word):
        """Add ``word`` to the trie. An empty word is ignored."""
        if not word:
            return
        current = self
        for l in word:
            child = current.children.get(l)
            if child is None:
                child = Node()
                current.children[l] = child
            current = child
        # Only the node reached by the final character marks a complete word.
        current.lastNode = True

    def find(self, word):
        """Return True if exactly ``word`` was inserted, otherwise False.

        Prefixes of stored words and the empty string return False.
        """
        if not word:
            return False
        current = self
        for l in word:
            if l not in current.children:
                return False
            current = current.children[l]
        return current.lastNode

    def lenientFind(self, word):
        """Find a stored word equal to ``word`` up to missing accents.

        At each position whose letter has entries in ``accented``, the
        accented variants are tried first (searching the remainder of the word
        from the current node); only if none of them leads to a stored word is
        the plain letter followed. Consequently, when both an accented and an
        unaccented spelling are stored, the accented one is returned.

        Returns:
            The stored spelling, or "" when nothing matches (including for an
            empty ``word``).
        """
        if not word:
            return ""
        found_word = ""
        current = self
        for i, l in enumerate(word):
            for accented_l in self.accented.get(l, ()):
                if accented_l in current.children:
                    # Retry the rest of the word with the accented letter in
                    # place of the plain one, starting at the current node.
                    found_accented_word = current.lenientFind(accented_l + word[i+1:])
                    if found_accented_word:
                        return found_word + found_accented_word

            if l not in current.children:
                return ""
            current = current.children[l]
            found_word += l

        return found_word if current.lastNode else ""

    def populate(self, words):
        """Insert every word of the iterable ``words``."""
        for word in words:
            self.insert(word)

In [None]:
import time
# Keep only the hny rows whose "type" is "word".
# NOTE: this rebinds `corpus`, replacing the combined four-source frame built
# in the loading cell; cells below see the word-only hny frame.
corpus = hny_pap_nl[hny_pap_nl["type"]=="word"]
# Array of the "pap-simple" values of those rows.
words = corpus["pap-simple"].values
# Disabled timing scaffold (the only use of the `time` import in this cell):
# start_time = time.time()
# print("--- %.2f seconds ---" % (time.time() - start_time))
# Decomposed spelling: "a" followed by U+0301 COMBINING ACUTE ACCENT.
# This value is overwritten on the next line and never used.
test_acc_word = "kalumnia" + u'\u0301'
# NOTE(review): presumably the precomposed "á" (U+00E1). The two spellings
# compare unequal with == unless both sides are Unicode-normalized first.
test_acc_word = "kalumniá"
# corpus[corpus["pap-simple"]=="kardiologo"]
# Rows whose "pap-simple" equals the test word; displayed as the cell output.
corpus[corpus["pap-simple"]==test_acc_word]
# print(test_acc_word)

In [None]:
# Per-word candidates for a sample sentence, using whatever frame `corpus`
# is currently bound to; the returned dict is displayed as the cell output.
getWordCorrections("mi man no ta bon", corpus)

In [None]:
# Build the trie from the `words` array.
t = Node()
t.populate(words)
# Map each word of the sample sentence to the result of an exact trie lookup.
result = {}
sentence = "mi man no ta bon"
for word in sentence.split():
    result[word] = t.find(word)
# Displayed as the cell output.
result

In [None]:
# t.find("kámara")
# Exact lookup of the unaccented spelling.
print(t.find("kamara"))
# Accent-tolerant lookups, printed one result per line.
print(*map(t.lenientFind, ("manan", "kamara", "inkreibel")), sep="\n")
# print(t.lenientFind("kalmeki"))
# Exact lookup; shown as the cell output.
t.find("kalmeki")