# Levenstien Edit Distance:

In [1]:
def levenstien(s1, s2):
    if(len(s2) > len(s1)):
        return levenstien(s2, s1)
    arr = [[0 for i in range(len(s2)+1)] for j in range(len(s1)+1)]
    for i in range(len(s1)+1):
        arr[i][0] = i
    for j in range(len(s2)+1):
        arr[0][j] = j
    for i, c1 in enumerate(s1):
        for j , c2 in enumerate(s2):
            deletions = arr[i][j+1] + 1
            insertions = arr[i+1][j] + 1
            substitutions = arr[i][j] + 2*(c1!=c2)
            minimum = min(deletions, insertions, substitutions)
            arr[i+1][j+1] = minimum
    return arr[len(s1)][len(s2)]

In [2]:
levenstien("writing", "reading")

6

# Jaro-Winkler Edit Distance:

In [3]:
def matching(s1, s2):
    count = 0
    if(s1 is None or s2 is None):
        return count
    else:
        size = min(len(s1), len(s2))
        for i in range(size):
            if(s1[i] == s2[i]):
                count += 1
        return count
def transpose(s1, s2):
    count = 0
    if(s1 is None or s2 is None):
        return count
    else:
        size = min(len(s1), len(s2))
        for i in range(size-1):
            if(s1[i]+s1[i+1] == s2[i+1]+s2[i]):
                count += 1
        return count
    
def jaroWinkler(s1, s2):
    m = matching(s1, s2)
    t = transpose(s1, s2)
    if m == 0:
        sim = 0
    else:
        sim = 1/3*(m/len(s1)+m/len(s2)+(m-t)/m)
    return sim

In [4]:
jaroWinkler('bimal', 'vimal')

0.8666666666666667

# Handling Files

In [5]:
unigrams = open('unigram.csv').read().splitlines()

In [6]:
print(unigrams[0])

foul ,  1


In [7]:
unigramWords = {}
for unigram in unigrams:
    word = unigram.split(',')
    unigramWords[word[0].strip()] = int(word[1].strip())

### Printing words similar to the given word in the unigram.csv file [ whose levenstien edit distance is less than 1]

In [8]:
for item in unigramWords.keys():
    if levenstien('sai', item) <= 1:
        print(item)

sail
sari
said


In [9]:
bigrams = open('bigrams.csv').read().splitlines()
bigramWords = {}
for bigram in bigrams:
    word = bigram.split(',')
    bigramWords[word[0].strip()] = int(word[1].strip())

### Printing the probabilty of words iron safe occuring one after another given safe has occured

In [10]:
print(bigramWords['iron safe']/unigramWords['iron'])

0.6666666666666666


In [11]:
unigramWords['<s>'] = 1
unigramWords['</s>'] = 1
s2 = ['<s> sandip babu sang bande mataram </s>','<s> chandranath babu asked for betel leaves </s>','<s> poor bimala went to the dressing room </s>']
for item in s2:
    multi = 1
    for i, s in enumerate(item.split()):
        try:
            p = (bigramWords[s+' '+item.split()[i+1]] + 1)/(unigramWords[s] + len(unigramWords.keys()))
            print(p, s+' '+item.split()[i+1])
            multi = multi * p
        except:
            try:
                p = (1)/(unigramWords[s] + len(unigramWords.keys()))
                print(p, s+' '+item.split()[i+1])
                multi = multi * p
            except:
                print(s)
    print(multi)




0.009802443070426783 <s> sandip
0.009051094890510949 sandip babu
0.0001490757304710793 babu sang
0.00015078407720144752 sang bande
0.005848830233953209 bande mataram
0.00014997000599880023 mataram </s>
</s>
1.749328211056289e-18
0.0009048408988086261 <s> chandranath
0.0018069567836169252 chandranath babu
0.0001490757304710793 babu asked
0.00014952153110047846 asked for
for
0.00030147723846849563 betel leaves
0.00015062509414068384 leaves </s>
</s>
1.654941054548257e-21
0.0012064545317448348 <s> poor
0.0004505180958101817 poor bimala
0.00029770765108663293 bimala went
0.00014905351021016544 went to
0.0001506931886678722 to the
0.00015071590052750564 the dressing
0.001053423626787058 dressing room
0.0001483459427384661 room </s>
</s>
8.560257448823194e-29


In [12]:
bigramProbability = bigramWords['come back']/unigramWords['come']
print(bigramProbability)

0.0684931506849315


### Add-one Smoothing

In [13]:
bigramProbability = (bigramWords['come back'] + 1.0)/((unigramWords['come']*1.0)+len(unigramWords.keys()))
print(bigramProbability)

0.0016233766233766235


In [14]:
import gensim

## Assignment questions

In [15]:
trainingData = [
    '<s> three friends amar akbar and antony are reading book </s>',
    '<s> amar is reading malgudi days </s>',
    '<s> akbar is reading a detective book </s>',
    '<s> antony is reading a book by rk narayan </s>'
]

In [16]:
from collections import defaultdict

def defValue():
    return 0

myUnigramWords = defaultdict(defValue)
for sentence in trainingData:
    for i, word in enumerate(sentence.split()):
        myUnigramWords[word] += 1
        

for word in myUnigramWords:
    print(word)

<s>
three
friends
amar
akbar
and
antony
are
reading
book
</s>
is
malgudi
days
a
detective
by
rk
narayan


In [17]:
myBigramWords = defaultdict(defValue)
for sentence in trainingData:
    for i, word in enumerate(sentence.split()):
        try:
            myBigramWords[word+' '+sentence.split()[i+1]] += 1
#             print(word+' '+sentence.split()[i+1])
        except:
            continue
            
for word in myBigramWords:
    print(word)

<s> three
three friends
friends amar
amar akbar
akbar and
and antony
antony are
are reading
reading book
book </s>
<s> amar
amar is
is reading
reading malgudi
malgudi days
days </s>
<s> akbar
akbar is
reading a
a detective
detective book
<s> antony
antony is
a book
book by
by rk
rk narayan
narayan </s>


In [18]:
def probabilityOfSentence(s1):
    multi = 1.0
    for i, word in enumerate(s1.split(' ')):
        try:
            p = (myBigramWords[word+' '+s1.split()[i+1]])/(myUnigramWords[word])
            multi *= p
        except:
            try:
                p = (1)/ (myUnigramWords[word])
            except:
                print(word)
    return multi

In [19]:
print(probabilityOfSentence('<s> amar is reading a book </s>'))

0.020833333333333332


In [20]:
def perplexity(s1):
    p = probabilityOfSentence(s1)
    pp = (1/p)**(1/len(s1.split()))
    return pp

In [21]:
print(perplexity('<s> amar is reading a book </s>'))

1.7385105064447572


In [22]:
def probabiltyWithAddOneSmoothing(s1):
    multi = 1.0
    for i, word in enumerate(s1.split(' ')):
        try:
#             print(word+' '+s1.split()[i+1])
            p = (myBigramWords[word+' '+s1.split()[i+1]] + 1)/(myUnigramWords[word] + len(myUnigramWords.keys()))
            multi *= p
        except:
            try:
#                 print(word)
                p = (1)/ (myUnigramWords[word] + len(myUnigramWords.keys()))
                multi *= p
            except:
#                 print(word, 'not in unigram')
                p = (1)/ (len(myUnigramWords.keys()))
                multi *= p
    return multi

In [23]:
print(probabiltyWithAddOneSmoothing('<s> akash is reading story book </s>'))

3.7507126354007264e-09


In [24]:
print(probabiltyWithAddOneSmoothing('<s> anthony loves reading book </s>'))
print(probabiltyWithAddOneSmoothing('<s> three friends are reading books </s>'))
print(probabiltyWithAddOneSmoothing('<s> akbar and anthony are reading malgudi days </s>'))
print(probabiltyWithAddOneSmoothing('<s> amar is reading a detective book by agatha christie </s>'))

2.5024179613551595e-08
1.1812979777085852e-09
1.19608282873589e-11
8.155044002387544e-14
