In [24]:
from time import time
from collections import defaultdict

from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1. Enhanced Levenshtein: function distance
* insertion = 1
* deletion = 1
* substitution = 1
* except for cases:
* ž <-> zh = 0.1
* š <-> sh = 0.1
* ö <-> 8 = 0.1
* ü <-> y = 0.1
* õ <-> 6 = 0.1
* ä <-> 2 = 0.1
* x <-> ks = 0.1
* exchangeable pair of symbols = 0.5 (e.g., kt <-> tk in the word "punkt"; ?! <-> !?)
* lowercase <-> uppercase (same letter) = 0.1


In [25]:
def kaugus(xs, ys):
   
#Enhanced Levenshtein Distance
#Returns the distance between two given strings
    
    mem = defaultdict(lambda: defaultdict(lambda: -1))
    def _lev(s1,s2):
        if mem[s1][s2]!=-1:
            return mem[s1][s2]
        if len(s1)==0:
            mem[s1][s2] = len(s2)
            return mem[s1][s2]
        if len(s2)==0:
            mem[s1][s2] = len(s1)
            return mem[s1][s2]
        else:
            cost = 1
            if s1[-1]==s2[-1]:
                cost = 0

            # Loetelu massiivi
            levcosts = []
            # adding - 1
            levcosts.append(_lev(s1[:-1],s2)+1)
            # deleting - 1
            levcosts.append(_lev(s1,s2[:-1])+1)
            # substitution - 1 või 0
            levcosts.append(_lev(s1[:-1],s2[:-1])+cost)

            # ž <-> zh = 0.1
            if (s1[-1] == 'ž' and s2[-2:] == 'zh'):
                levcosts.append(_lev(s1[:-1],s2[:-2])+0.1)
             # zh <-> ž = 0.1
            if (s2[-1] == 'ž' and s1[-2:] == 'zh'):
                levcosts.append(_lev(s1[:-2],s2[:-1])+0.1)

            # š<->sh - 0.1
            if (s1[-1] == 'š' and s2[-2:] == 'sh'):
                levcosts.append(_lev(s1[:-1],s2[:-2])+0.1)
             # sh <-> š = 0.1
            if (s2[-1] == 'š' and s1[-2:] == 'sh'):
                levcosts.append(_lev(s1[:-2],s2[:-1])+0.1)

            # ö<->8 - 0.1
            if (s1[-1]=='ö' and s2[-1]=='8') or (s1[-1]=='8' and s2[-1]=='ö'):
                levcosts.append(_lev(s1[:-1],s2[:-1])+0.1)
            # ü<->y - 0.1
            if (s1[-1]=='ü' and s2[-1]=='y') or (s1[-1]=='ü' and s2[-1]=='y'):
                levcosts.append(_lev(s1[:-1],s2[:-1])+0.1)
            # õ<->6 - 0.1
            if (s1[-1]=='õ' and s2[-1]=='6') or (s1[-1]=='6' and s2[-1]=='õ'):
                levcosts.append(_lev(s1[:-1],s2[:-1])+0.1)
            # ä<->2 - 0.1
            if (s1[-1]=='ä' and s2[-1]=='2') or (s1[-1]=='2' and s2[-1]=='ä'):
                levcosts.append(_lev(s1[:-1],s2[:-1])+0.1)

            # x <-> ks = 0.1
            if (s1[-1] == 'x' and s2[-2:] == 'ks'):
                levcosts.append(_lev(s1[:-1],s2[:-2])+0.1)
             # ks <-> x = 0.1
            if (s2[-1] == 'x' and s1[-2:] == 'ks'):
                levcosts.append(_lev(s1[:-2],s2[:-1])+0.1)

            # pair of symbols exchanged (e.g., kt <-> tk)
            if len(s1) > 1 and len(s2) > 1 and s1[-1] == s2[-2] and s1[-2] == s2[-1]:
                levcosts.append(_lev(s1[:-2], s2[:-2]) + 0.5)

            # lowercase-uppercase either way - 0.1
            if s1[-1].lower() == s2[-1].lower():
                levcosts.append(_lev(s1[:-1], s2[:-1]) + 0.1)


            mem[s1][s2] = min(levcosts)
            return mem[s1][s2]

    return(_lev(xs,ys))

## 2. funktsioonid *sonad* ja *soovitus*

Function **soned** - Reads and preprocesses the EKI word forms file. The returned object can be of your choice (yes, it can be a list, a user-defined class, etc.).

Function **soovitus** - Returns three word forms from the word forms file via the function words, whose transformation distance to the given word according to the method is minimal, while they are sorted increasingly in terms of distance. In case of a tie, you can decide.


In [26]:
def soned(failinimi):
    # Saab ette EKI sõnavormide nimistu, kus on sõna real
    # Tagastab nende sisseloetud-töötletud kuju
    with open(failinimi, 'r', encoding='utf-8') as file:
        mudel = [line.strip().replace('|', '') for line in file]
    return mudel

In [27]:
def soovitus(mudel, sona):
    # gets the model and a word to calculate distance
    # Returns three word forms most similar to this word based on the 'distance' function.
    kaugused = [(x, kaugus(x, sona)) for x in mudel]
    kaugused.sort(key=lambda x: x[1])
    top_3 = [distance[0] for distance in kaugused[:3]]
    return top_3


## Testing

In [28]:
if __name__ == "__main__":
   
    algusaeg = time()
    dist = kaugus("yx","üks") #0,2
    aega = time()-algusaeg
    print("Kaugus saadi {}, õige vastus oli {}".format(dist,0.2))
    print("Aega kulus {} sekundit".format(round(aega,2)))
    print()

    algusaeg = time()
    m = soned("vormid.txt")
    aega = time()-algusaeg
    print("Sõnavormide sisselugemiseks kulus {} sekundit".format(round(aega,2)))

    algusaeg = time()
    vastus = soovitus(m,"keeletenholoogia")
    aega = time()-algusaeg
    print("Soovitused: {}, aega vastuse saamiseks kulus {} sekundit".format(vastus,round(aega,2)))

Kaugus saadi 1.1, õige vastus oli 0.2
Aega kulus 0.0 sekundit

Sõnavormide sisselugemiseks kulus 0.09 sekundit
Soovitused: ['keemiatehnoloogia', 'teabetehnoloogia', 'kõrgtehnoloogia'], aega vastuse saamiseks kulus 139.66 sekundit
