In [1]:
import os
import conllu
# import Levenshtein as lev
import numpy as np
import re
import tqdm
import json
from bs4 import BeautifulSoup

In [3]:
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    Insertions, deletions and substitutions each cost 1. The full dynamic
    programming table is filled, where ``table[r, c]`` is the distance between
    the first ``r`` items of ``s1`` and the first ``c`` items of ``s2``.

    Args:
        s1: First sequence (anything supporting ``len`` and iteration).
        s2: Second sequence.

    Returns:
        The distance as a numpy float scalar (the table is a float array).
    """
    rows, cols = len(s1) + 1, len(s2) + 1
    table = np.zeros((rows, cols))

    # Distance from each prefix to the empty string is the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, left_char in enumerate(s1, start=1):
        for c, right_char in enumerate(s2, start=1):
            diagonal = table[r - 1, c - 1]
            # If the last characters are the same, no extra edit is needed.
            if left_char == right_char:
                table[r, c] = diagonal
                continue
            above = table[r - 1, c]   # deletion
            left = table[r, c - 1]    # insertion
            table[r, c] = 1 + min(above, left, diagonal)

    return table[-1, -1]

In [4]:
# Build the Romanian vocabulary: every distinct \w+ token of the training
# corpus. Tokens are kept case-sensitive; \w also matches digits and "_".
regex = re.compile(r"(\w+)")  # compiled once instead of once per line

# The context manager guarantees the handle is closed, even on a read error.
with open("./dataset/Tales.train.ro", "r", encoding="utf-8") as file:
    all_words_ro = set()
    for line in file:
        all_words_ro.update(regex.findall(line.strip()))

FileNotFoundError: [Errno 2] No such file or directory: './dataset/Tales.train.ro'

In [5]:
# Build the Aromanian vocabulary: every distinct \w+ token of the training
# corpus. Tokens are kept case-sensitive; \w also matches digits and "_".
regex = re.compile(r"(\w+)")  # compiled once instead of once per line

# The context manager guarantees the handle is closed, even on a read error.
with open("./dataset/Tales.train.rup", "r", encoding="utf-8") as file:
    all_words_aro = set()
    for line in file:
        all_words_aro.update(regex.findall(line.strip()))

FileNotFoundError: [Errno 2] No such file or directory: './dataset/Tales.train.rup'

In [6]:
json_file_path = "candidates.json"

def get_candidates(top_k=10):
    """Find the closest Romanian words for every Aromanian word and save them.

    Reads the module-level sets ``all_words_aro`` and ``all_words_ro``. For
    each Aromanian word it computes ``levenshtein_distance`` to every Romanian
    word and keeps the ``top_k`` nearest ones, closest first. The mapping is
    written as JSON to ``json_file_path`` and also returned.

    Args:
        top_k: Number of candidates kept per word (default 10).

    Returns:
        dict mapping each Aromanian word to a list of Romanian candidates.
    """
    # Freeze the set into a list so argsort indices map back to words.
    all_words_ro_list = list(all_words_ro)
    all_words_aro_list = list(all_words_aro)

    candidates_dict = {}
    for word in tqdm.tqdm(all_words_aro_list, desc="Processing", unit="element"):
        distances = [levenshtein_distance(word, candidate) for candidate in all_words_ro_list]
        # NOTE(review): argsort's default sort is not stable, so the order of
        # candidates with equal distance is not guaranteed.
        nearest = np.argsort(distances)[:top_k]
        candidates_dict[word] = [all_words_ro_list[ind] for ind in nearest]

    with open(json_file_path, 'w') as json_file:
        json.dump(candidates_dict, json_file)
    print(f"Dictionary saved to {json_file_path}")

    return candidates_dict


In [7]:
# îndoauă = unele (large edit distance)
# îndoauă ~ îndată

In [9]:
# Reload the candidate lists previously saved to json_file_path.
with open(json_file_path, 'r') as json_file:
    candidates_dict_test = json.loads(json_file.read())

# Spot-check the candidates stored for one Aromanian word.
print(candidates_dict_test['îndoauă'])

['îndată', 'înceapă', 'două', 'învață', 'înecată', 'întoarcă', 'încoace', 'Îndată', 'doară', 'înhață']


In [7]:
# Parse the Cunia dictionary export. BeautifulSoup reads the whole file while
# constructing the tree, so the handle can be closed right afterwards.
with open("Dictionar_Cunia.html", "r", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <title>
   DictsiunarArmanescu, Dec 2008
  </title>
  <meta content="LibreOffice 7.3.7.2 (Linux)" name="generator"/>
  <meta content="Tiberius Cunia" name="author"/>
  <meta content="2008-12-26T13:42:00" name="created"/>
  <meta content="2023-01-28T19:32:49.767500185" name="changed"/>
  <style type="text/css">
   @page { size: 8.5in 11in; margin-right: 0.9in; margin-top: 0.7in; margin-bottom: 0.4in } 		p { color: #000000; letter-spacing: -0.3pt; line-height: 115%; orphans: 2; widows: 2; margin-bottom: 0.1in; direction: ltr; background: transparent } 		p.western { font-family: "Cartea Aromana3 TNR", serif; font-size: 10pt; so-language: en-US } 		p.cjk { font-family: "Times New Roman", serif; font-size: 10pt } 		p.ctl { font-family: "Cartea Aromana3 TNR", serif; font-size: 10pt; so-language: ar-SA } 		a:link { color: #000080; so-language: zxx; text-decoration: underline }
  </style>
 <

In [8]:
# Preview only the first 10000 characters of the re-indented document.
print(soup.prettify()[:10000])


<!DOCTYPE html>
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <title>
   DictsiunarArmanescu, Dec 2008
  </title>
  <meta content="LibreOffice 7.3.7.2 (Linux)" name="generator"/>
  <meta content="Tiberius Cunia" name="author"/>
  <meta content="2008-12-26T13:42:00" name="created"/>
  <meta content="2023-01-28T19:32:49.767500185" name="changed"/>
  <style type="text/css">
   @page { size: 8.5in 11in; margin-right: 0.9in; margin-top: 0.7in; margin-bottom: 0.4in } 		p { color: #000000; letter-spacing: -0.3pt; line-height: 115%; orphans: 2; widows: 2; margin-bottom: 0.1in; direction: ltr; background: transparent } 		p.western { font-family: "Cartea Aromana3 TNR", serif; font-size: 10pt; so-language: en-US } 		p.cjk { font-family: "Times New Roman", serif; font-size: 10pt } 		p.ctl { font-family: "Cartea Aromana3 TNR", serif; font-size: 10pt; so-language: ar-SA } 		a:link { color: #000080; so-language: zxx; text-decoration: underline }
  </style>
 <

In [23]:
def find_root_word(tag):
    """Tag predicate: True for ``<span>`` tags with style 'text-decoration: none'.

    The style attribute must match the string exactly.
    NOTE(review): presumably these spans hold the dictionary headwords - confirm.
    """
    if tag.name != 'span':
        return False
    return tag.get('style') == 'text-decoration: none'
def find_translation(tag):
    """Tag predicate: True for ``<font>`` tags with style 'font-size: 10pt' and size '2'.

    Both attributes must match exactly; style is checked before size.
    NOTE(review): presumably these tags hold the translations - confirm.
    """
    if tag.name != 'font':
        return False
    required = (('style', 'font-size: 10pt'), ('size', '2'))
    return all(tag.get(attr) == expected for attr, expected in required)
def find_def_and_ex(tag):
    """Tag predicate: True for ``<font>`` tags with face 'Times New Roman, serif'.

    The face attribute must match the string exactly.
    NOTE(review): presumably these tags hold definitions and examples - confirm.
    """
    if tag.name != 'font':
        return False
    return tag.get('face') == 'Times New Roman, serif'

In [None]:
# trans = soup.find_all(find_translation)
# for el in trans[:100]:
#     print(el.find('i').text)

In [24]:
# Collect every tag accepted by the find_def_and_ex predicate.
def_ex = soup.find_all(find_def_and_ex)

# Print the text content of the first 100 matches only.
for el in def_ex[0:100]:
    print(el.get_text())

A
a bre (a bré) inter – vedz tu ore
a bre 
ore
a1 prip – la, ca, ti, tri, trã, ca ti, ca trã, na, etc. {ro: la, a, ca pentru, etc.} {fr: au, à l’, à la, aux, etc.} {en: at, as, etc.} ex: sã ncljinarã a (la) cicioari; filigenj tri a beari yin; chinsirã si s-ducã a (tri) beari; hai, niveastã, a (la) primnari; s-duc a (la, s-facã) zbor; ãlj bati a (ca) vearã; anjurzeashti a (ca) primuvearã; bãtea a (ca) yiu; nu bãtea dip a (ca) yiu; anjurzeashti a (ca) om; anjurzeashti a (ca) ursã; corghilj nã cãntã a (ca ti) moarti; s-tsãni a (ca) mari; una cali apucã a (na) ndreapta, alantã acatsã a (na) stãnga; cupiili sãrmati di-a-doarã (trã andaua oarã); s-acãtsarã s-gioacã (a) cãrtsã; muljerli s-arca a mortului tu lucru (expr: pãnã s-cadã mpadi di-avursiri); u dipusi di-a-cu-totalui; bagã tsãruhili a molju (ta s-moalji; expr: sã ndreadzi s-fugã); a sclaea (di-a sclavlu); a shuirarea; shi nãintea-a dativlui, sing shi pl: lj-dau pãni a lui, a ljei, a lor; oili armasirã a noauã; bagã a calui cãpestrul;

In [None]:

# Disabled experiment: everything below sits inside a bare string literal, so
# none of it is executed.
# NOTE(review): as written, the `if` inside the nested loops is always true
# (word1 and word2 are drawn from sets built from those same token lists), so
# every (word1, word2) pair would be counted exactly once rather than measuring
# real co-occurrence.
"""
with open('dialect1.txt', 'r', encoding='utf-8') as file:
    text_dialect1 = file.read()

with open('dialect2.txt', 'r', encoding='utf-8') as file:
    text_dialect2 = file.read()


from nltk.tokenize import word_tokenize
words_dialect1 = word_tokenize(text_dialect1)
words_dialect2 = word_tokenize(text_dialect2)


from collections import Counter

co_occurrences = Counter()

for word1 in set(words_dialect1):
    for word2 in set(words_dialect2):
        if word1 in words_dialect1 and word2 in words_dialect2:
            co_occurrences[(word1, word2)] += 1


for (word1, word2), count in co_occurrences.most_common():
    print(f'Co-occurrence of "{word1}" and "{word2}": {count} times')
"""


In [None]:
import json
import pandas as pd


# Reload the candidate lists saved in candidates.json.
json_file_path = "candidates.json"
with open(json_file_path, 'r') as json_file:
    candidates_dict_test = json.load(json_file)

print(candidates_dict_test['îndoauă'])

# Load the Papahagi spreadsheet; it has no header row, so name the columns here.
excel_path = "Papahagi.xls"
df = pd.read_excel(excel_path, header = None)

df.columns = ["POS", "aro", "ro", "origine", "IDK", "autor"]

# Build the lookup once instead of rescanning the whole column per candidate.
ro_entries = set(df["ro"])

# Report, for each candidate of the test word, whether it appears verbatim in
# the "ro" column.
for word in candidates_dict_test["îndoauă"]:
    print(word)
    if word in ro_entries:
        print("yes")
    else:
        print("no")