# Q3

In [1]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [2]:
# Download the spaCy "en_core_web_md" model that spacy.load() is given further down.
# The runtime may need to be restarted after this step before the model can be loaded.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Import libraries

In [3]:
import numpy as np
import spacy
from difflib import SequenceMatcher
from Levenshtein import distance as edit_distance
from nltk.util import ngrams
from nltk.corpus import stopwords
from collections import Counter
import jellyfish

In [4]:
# Load the spaCy pipeline once at module level; semantic_score() calls `nlp`
# for every query/candidate comparison.
nlp = spacy.load("en_core_web_md")

## K-gram Score Function

In [7]:
def kgram_score(query, candidate, k=2):
    """Jaccard similarity between the character k-gram sets of two strings.

    Spaces are removed from both strings before the k-grams are extracted,
    so a k-gram may span the boundary between two words.

    Args:
        query: The (possibly misspelled) input string.
        candidate: The string to compare the query against.
        k: Length of each character k-gram; must be at least 1.

    Returns:
        ``len(Q & C) / len(Q | C)`` as a float in [0, 1], where Q and C are
        the k-gram sets of ``query`` and ``candidate``. Returns 0.0 when
        neither string yields any k-gram (for example when both are shorter
        than ``k`` once spaces are removed).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    def get_kgrams(text):
        # Every contiguous substring of length k; empty set when len(text) < k.
        return {text[i:i + k] for i in range(len(text) - k + 1)}

    q_k = get_kgrams(query.replace(" ", ""))
    c_k = get_kgrams(candidate.replace(" ", ""))
    union = q_k | c_k
    if not union:
        # No k-grams on either side: avoid dividing 0 by 0.
        return 0.0
    return len(q_k & c_k) / len(union)

## Noisy Channel Score Function

In [8]:
def noisy_channel_score(query, candidate):
    """Edit-distance similarity between two strings, normalised by length.

    The score is ``1 - edit_distance(query, candidate) / max(len(query),
    len(candidate))``, so a candidate needing fewer edits relative to the
    longer string scores higher.

    Args:
        query: The (possibly misspelled) input string.
        candidate: The string to compare the query against.

    Returns:
        A float; 1.0 means the two strings are identical. Two empty strings
        are treated as identical and score 1.0.
    """
    longest = max(len(query), len(candidate))
    if longest == 0:
        # Both strings are empty: nothing to edit, and dividing by 0 must be avoided.
        return 1.0
    return 1 - (edit_distance(query, candidate) / longest)

## Sound Score Function

In [9]:
def sound_score(query, candidate):
    """Binary phonetic match based on ``jellyfish.soundex``.

    The Soundex code is computed on each full string as given (not per word).

    Args:
        query: The (possibly misspelled) input string.
        candidate: The string to compare the query against.

    Returns:
        1.0 when both strings produce the same Soundex code, otherwise 0.0.
    """
    codes_match = jellyfish.soundex(query) == jellyfish.soundex(candidate)
    return 1.0 if codes_match else 0.0

## Semantic Score Function

In [10]:
def semantic_score(query, candidate):
    """Cosine similarity between the vectors ``nlp`` produces for two strings.

    Each string is passed through the module-level ``nlp`` pipeline and the
    ``.vector`` attribute of the result is compared.

    Args:
        query: The (possibly misspelled) input string.
        candidate: The string to compare the query against.

    Returns:
        The cosine of the angle between the two vectors (it can be negative),
        or 0.0 when either vector has zero norm.
    """
    query_vec = nlp(query).vector
    cand_vec = nlp(candidate).vector

    query_norm = np.linalg.norm(query_vec)
    cand_norm = np.linalg.norm(cand_vec)

    # A zero-length vector has no direction, so cosine similarity is undefined.
    if query_norm == 0 or cand_norm == 0:
        return 0.0

    return np.dot(query_vec, cand_vec) / (query_norm * cand_norm)


## Correct Query Function

In [11]:
def correct_query(query, candidates=None):
    """Rank candidate corrections for ``query`` by the mean of four scores.

    Each candidate is scored with ``kgram_score``, ``noisy_channel_score``,
    ``sound_score`` and ``semantic_score``; the final score is their
    unweighted average.

    Args:
        query: The string to correct.
        candidates: Iterable of candidate strings. When omitted, the
            module-level ``candidates`` variable is used, which keeps
            existing ``correct_query(query)`` calls working.

    Returns:
        A list of ``(candidate, final, kgram, noise, semantic, sound)``
        tuples sorted by ``final`` in descending order. Candidates with equal
        final scores keep their input order.

    Raises:
        NameError: If ``candidates`` is omitted and no module-level
            ``candidates`` variable exists.
    """
    if candidates is None:
        # Backward-compatible fallback: earlier callers relied on a global list.
        try:
            candidates = globals()["candidates"]
        except KeyError:
            raise NameError("name 'candidates' is not defined") from None

    results = []
    for c in candidates:
        k = kgram_score(query, c)
        n = noisy_channel_score(query, c)
        s = sound_score(query, c)
        sem = semantic_score(query, c)
        final = (k + n + s + sem) / 4
        # Tuple order is relied on by callers that unpack rows for printing.
        results.append((c, final, k, n, sem, s))
    results.sort(key=lambda x: x[1], reverse=True)
    return results

## Example 1

In [12]:
# Misspelled query and the pool of corrections it is ranked against.
# `candidates` stays a module-level name because correct_query(query) reads it.
query = "machin lernng"
candidates = ["machine learning", "deep learning", "reinforcement learning"]

res = correct_query(query)

# One row per candidate, highest final score first.
print(f"{'query':<25} | final score  | k-gram  | noise  | context  | sound")
print("-" * 80)
for cand, final, kgram, noise, context, sound in res:
    print(f"{cand:<25} | {final:.4f}       | {kgram:.4f}  | {noise:.4f} | {context:.4f}   | {sound:.4f}")

query                     | final score  | k-gram  | noise  | context  | sound
--------------------------------------------------------------------------------
machine learning          | 0.5572       | 0.5000  | 0.8125 | -0.0838   | 1.0000
deep learning             | 0.1331       | 0.2222  | 0.3846 | -0.0745   | 0.0000
reinforcement learning    | 0.1283       | 0.1538  | 0.4091 | -0.0496   | 0.0000


## Example 2

In [14]:
# Second demo: a misspelled movie title against three near-identical candidates.
# `candidates` stays a module-level name because correct_query(query) reads it.
query = "forrest gomp"
candidates = ["forrest gump", "forest gum", "forrest dump"]

res = correct_query(query)

# One row per candidate, highest final score first.
print(f"{'query':<25} | final score  | k-gram  | noise  | context  | sound")
print("-" * 80)
for cand, final, kgram, noise, context, sound in res:
    print(f"{cand:<25} | {final:.4f}       | {kgram:.4f}  | {noise:.4f} | {context:.4f}   | {sound:.4f}")


query                     | final score  | k-gram  | noise  | context  | sound
--------------------------------------------------------------------------------
forrest gump              | 0.8533       | 0.6667  | 0.9167 | 0.8298   | 1.0000
forrest dump              | 0.7954       | 0.5385  | 0.8333 | 0.8098   | 1.0000
forest gum                | 0.5926       | 0.5000  | 0.7500 | 0.1203   | 1.0000
