# ZAD 1
Zaimplementuj przynajmniej 3 "metryki" spośród wymienionych: cosinusowa, LCS, DICE, euklidesowa, Levenshteina.

## Preprocessing and vectorization

In [56]:
import re

def preprocess(text: str) -> str:
    """Normalize raw text so that it can be tokenized with ``str.split()``.

    The text is lower-cased and every run of non-word characters
    (punctuation, whitespace, ...) is replaced by a single space.
    A leading or trailing space may remain in the result.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

def text_to_vec(docs: list[str]) -> list[list[int]]:
    """Turn documents into word-count vectors over one shared vocabulary.

    Every document is normalized with ``preprocess`` and split on
    whitespace. The vocabulary is the set of all words seen in any
    document, ordered by first occurrence.

    Args:
        docs: Raw documents to vectorize.

    Returns:
        One vector per document, in input order. All vectors have the same
        length (the vocabulary size) and position ``k`` of every vector is
        the count of the same ``k``-th vocabulary word, so the vectors can
        be compared component-wise.
    """
    # Maps each word to its column; dicts keep insertion order, so iterating
    # the mapping later walks the vocabulary in first-seen order.
    vocabulary: dict[str, int] = {}
    doc_counts: list[dict[str, int]] = []

    for doc in docs:
        counts: dict[str, int] = {}
        for word in preprocess(doc).split():
            counts[word] = counts.get(word, 0) + 1
            vocabulary.setdefault(word, len(vocabulary))
        doc_counts.append(counts)

    # Look every vocabulary word up in every document so that a given index
    # always refers to the same word, with 0 for words the document lacks.
    return [
        [counts.get(word, 0) for word in vocabulary]
        for counts in doc_counts
    ]


In [57]:
# Tests
text_a = "The quick brown fox jumped over the lazy dog."
text_b = "The lazy dog was jumped over by the quick brown fox."
vec_a, vec_b = text_to_vec([text_a, text_b])

# Both vectors must span the same 10-word vocabulary.
assert len(vec_a) == len(vec_b) == 10
# Compare sorted counts instead of sets: a set collapses repeated counts and
# would only verify which distinct values occur, not how many of each.
assert sorted(vec_a) == sorted([1, 1, 1, 2, 1, 1, 1, 1, 0, 0])
assert sorted(vec_b) == sorted([1, 1, 1, 2, 1, 1, 1, 1, 1, 1])

## Cosine similarity

$$
\begin{equation}
    \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}= \frac{\sum\limits_{i=1}^{n} A_i B_i}{\sqrt{\sum\limits_{i=1}^{n} A_i^2} \sqrt{\sum\limits_{i=1}^{n} B_i^2}}
    \qquad\begin{aligned}
    &\text{where:} \\
    &\mathbf{A}\text{ and }\mathbf{B} \text{ are the two vectors being compared}\\
    &n \text{ is the dimensionality of the vectors}\\
    &\theta \text{ represents the angle between two vectors } \mathbf{A} \text{ and } \mathbf{B} \text{ in a high-dimensional space}
    \end{aligned}
\end{equation}
$$

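The dot product of $\mathbf{A}$ and $\mathbf{B}$ is divided by the product of their Euclidean lengths, which normalizes the result to the range [-1, 1]. A value of 1 means that the two vectors point in the same direction (the texts have the same relative word frequencies), 0 means that they are orthogonal (the texts share no words), and -1 means that they point in opposite directions. Word-count vectors have no negative components, so for texts the result always lies in [0, 1].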


In [None]:
import math

def cosine_similarity(text_a: str, text_b: str) -> float:
    """Return the cosine similarity of the word-count vectors of two texts.

    Both texts are vectorized together with ``text_to_vec`` and the dot
    product of the two vectors is divided by the product of their
    Euclidean norms.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        The cosine of the angle between the two count vectors. If either
        text contains no words its vector has zero length and the cosine is
        undefined; ``0.0`` is returned in that case instead of raising
        ``ZeroDivisionError``.
    """
    vec_a, vec_b = text_to_vec([text_a, text_b])

    dot = sum(x * y for x, y in zip(vec_a, vec_b))
    squares_a = sum(x * x for x in vec_a)
    squares_b = sum(y * y for y in vec_b)

    # A zero norm means a text without any words: nothing to compare against.
    if squares_a == 0 or squares_b == 0:
        return 0.0

    return dot / math.sqrt(squares_a * squares_b)

In [59]:
# Tests: cosine similarity of the two sample sentences, within 1e-4.
dist = cosine_similarity(text_a, text_b)
assert abs(dist - 0.91986) < 0.0001

## Dice coefficient / Sørensen-Dice Index

$$
\begin{equation}
    \text{Dice}(A, B) = \frac{2 |A \cap B|}{|A| + |B|} 
    \qquad\begin{aligned}
    &\text{where:} \\
    &A \text{ and } B \text{ represent the two sets being compared} \\
    &|A| \text{ and } |B| \text{ represent the cardinality (number of elements) of the sets} \\
    &\text{and } |A \cap B| \text{ represents the size of the intersection of the two sets}
    \end{aligned}
\end{equation}
$$


In [60]:
def dice(text_a: str, text_b: str) -> float:
    """Return the Sørensen-Dice coefficient of the word sets of two texts.

    Each text is normalized with ``preprocess``, split on whitespace and
    reduced to its set of distinct words; the coefficient is
    ``2 * |A & B| / (|A| + |B|)``.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        A value in [0, 1]. When neither text contains any word the
        coefficient is undefined (0 / 0); ``0.0`` is returned in that case
        instead of raising ``ZeroDivisionError``.
    """
    words_a = set(preprocess(text_a).split())
    words_b = set(preprocess(text_b).split())

    total = len(words_a) + len(words_b)
    if total == 0:
        # Both word sets are empty, so there is no overlap to measure.
        return 0.0

    return 2 * len(words_a & words_b) / total

dice(text_a, text_b)

0.8888888888888888

In [61]:
# Tests: Dice coefficient of the two sample sentences, within 1e-4.
dist = dice(text_a, text_b)
assert abs(dist - 0.88888) < 0.0001

## Euclidean distance

$$
\begin{equation}
    d(x,y) = \sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}
    \qquad\begin{aligned}
    &\text{where:} \\
    &d(x,y) \text{ is the Euclidean distance} \\
    &x_i, y_i \text{ are the values of the i-th dimension of vectors } x \text{ and } y \\
    &n \text{ is the number of dimensions in the vectors}
    \end{aligned}
\end{equation}
$$

In [62]:
def euclidean_distance(text_a: str, text_b: str) -> float:
    """Return the Euclidean distance between the count vectors of two texts.

    Both texts are vectorized together with ``text_to_vec``; the result is
    the square root of the summed squared component differences.
    """
    vec_a, vec_b = text_to_vec([text_a, text_b])
    # The first vector drives the loop; the second is read at the same index.
    squared_diffs = (
        (count_a - vec_b[idx]) ** 2 for idx, count_a in enumerate(vec_a)
    )
    return math.sqrt(sum(squared_diffs))

In [63]:
# Tests: Euclidean distance of the two sample sentences, within 1e-4.
dist = euclidean_distance(text_a, text_b)
assert abs(dist - 1.4142135) < 0.0001

## LCS - Longest Common Subsequence

The longest common contiguous subsequence of two sequences, i.e. the "longest common substring".

In [92]:
from typing import Any, Sequence

def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """Return the length of the longest common contiguous run of two sequences.

    This is the "longest common substring" problem: the elements of the
    common run must be adjacent in both sequences. Elements are compared
    with ``==``, so the function works for strings as well as for lists of
    words.

    Args:
        seq_a: First sequence.
        seq_b: Second sequence.

    Returns:
        The length of the longest run of consecutive elements that occurs
        in both sequences; ``0`` if they share no element or either one is
        empty.
    """
    longest = 0
    # previous[j] is the length of the common run that ends at the previous
    # element of seq_a and at seq_b[j - 1]; index 0 is a zero sentinel.
    previous = [0] * (len(seq_b) + 1)

    for item_a in seq_a:
        current = [0] * (len(seq_b) + 1)
        for j, item_b in enumerate(seq_b, start=1):
            if item_a == item_b:
                # Extend the run that ended just before both elements.
                current[j] = previous[j - 1] + 1
                if current[j] > longest:
                    longest = current[j]
            # On a mismatch the run is broken, so the cell stays 0. Carrying
            # over a neighbouring value here would count non-contiguous
            # matches (a subsequence rather than a substring).
        previous = current

    return longest


def word_lcs(text_a: str, text_b: str) -> int:
    """Return the length, in words, of the longest word run shared by two texts.

    Both texts are normalized with ``preprocess`` first, so letter case and
    punctuation attached to a word do not prevent a match, and are then
    compared word by word with ``lcs``.
    """
    words_a = preprocess(text_a).split()
    words_b = preprocess(text_b).split()
    return lcs(words_a, words_b)


In [93]:
# Tests: lcs on two character strings and word_lcs on the two sample sentences.
assert lcs("banana", "ananas") == 5
assert word_lcs(text_a, text_b) == 4

## Levenshtein distance



In [94]:

def levenshtein(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the
    other. Elements are compared with ``==``.
    """
    # Distances between the empty prefix of seq_b and every prefix of seq_a.
    previous = list(range(len(seq_a) + 1))

    for row, item_b in enumerate(seq_b, start=1):
        # First column: the first ``row`` elements of seq_b against the
        # empty prefix of seq_a.
        current = [row]
        for col, item_a in enumerate(seq_a, start=1):
            if item_a == item_b:
                # Equal elements cost nothing; reuse the diagonal value.
                current.append(previous[col - 1])
            else:
                cheapest = min(previous[col - 1], previous[col], current[col - 1])
                current.append(cheapest + 1)
        previous = current

    return previous[-1]


def word_levenshtein(text_a: str, text_b: str) -> int:
    """Return the Levenshtein distance between two texts, counted in words.

    Each text is split on whitespace and the resulting word lists are
    passed to ``levenshtein``, so one edit inserts, deletes or replaces a
    whole word rather than a single character.
    """
    words_a = list(text_a.split())
    words_b = list(text_b.split())
    return levenshtein(words_a, words_b)


In [95]:
# Tests: levenshtein on two character strings and word_levenshtein on the
# two sample sentences.
assert levenshtein("banana", "ananas") == 2
assert word_levenshtein(text_a, text_b) == 7

# ZAD 2
Zaimplementuj przynajmniej 1 sposób oceny jakości klasteryzacji (np. indeks Daviesa-Bouldina).

In [None]:
def average_distance(cluster, metric, *args):
    """Return the mean pairwise distance between the members of a cluster.

    Args:
        cluster: Indexable collection of cluster members.
        metric: Callable invoked as ``metric(x, y, *args)`` that returns
            the distance between two members.
        *args: Extra positional arguments forwarded to ``metric``.

    Returns:
        The sum of ``metric`` over every unordered pair of members divided
        by the number of such pairs, or ``0`` when the cluster has fewer
        than two members (there is no pair to measure).
    """
    size = len(cluster)
    if size < 2:
        return 0

    # Each unordered pair is visited once, with the later member first.
    dist_sum = 0
    for i in range(size):
        for j in range(i):
            dist_sum += metric(cluster[i], cluster[j], *args)

    # The loops above visit size * (size - 1) / 2 pairs; that is the count
    # to average over.
    pair_count = size * (size - 1) // 2
    return dist_sum / pair_count
