# ZAD 1
Zaimplementuj przynajmniej 3 "metryki" spośród wymienionych: cosinusowa, LCS, DICE, euklidesowa, Levenshteina.

In [109]:
import math

def delta(x, y):
    return 1 if x == y else 0


def lcs(x, y):
    path = [[0 for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    longest = 0

    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            path[i][j] = path[i - 1][j - 1] + delta(x[i - 1], y[j - 1])
            if path[i][j] > longest:
                longest = path[i][j]

    return 1 - longest / max(len(x), len(y))


def get_n_grams(x, n):
    n_grams = dict()
    for i in range(len(x) + 1 - n):
        n_gram = x[i:i + n]
        if n_gram in n_grams.keys():
            n_grams[n_gram] += 1
        else:
            n_grams[n_gram] = 1

    return n_grams


def dice(x, y, n=2):
    n_grams_x = set(get_n_grams(x, n))
    n_grams_y = set(get_n_grams(y, n))
    prod = n_grams_x & n_grams_y

    return 1 - len(prod) * 2 / (len(n_grams_x) + len(n_grams_y))


def levenstein(x, y):
    edit_dist = [[0 for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    for i in range(1, len(x) + 1):
        edit_dist[i][0] = i
    for i in range(1, len(y) + 1):
        edit_dist[0][i] = i
    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            edit_dist[i][j] = min(edit_dist[i - 1][j - 1] + delta(x[i - 1], y[j - 1]),
                                  edit_dist[i - 1][j] + 1,
                                  edit_dist[i][j - 1] + 1)
    return edit_dist[-1][-1] / max(len(x), len(y))

# ZAD 2
Zaimplementuj przynajmniej 1 sposoby oceny jakości klasteryzacji (np. indeks Daviesa-Bouldina).


In [110]:
import itertools

def DaviesBouldin(clusters, metric):
    for cluster in clusters:
        cluster.sort(key=lambda l1: sum([metric(l1, l2) for l2 in cluster if l1 != l2]))
    centroids = [cluster[len(cluster) // 2] for cluster in clusters]

    mean_dist = []
    for cluster in clusters:
        s = 0
        n = len(cluster)
        for l1, l2 in itertools.combinations(cluster, 2):
            s += metric(l1, l2)
        if n == 1:
            mean_dist.append(0)
        else:
            mean_dist.append(s / (n * (n - 1) / 2))

    _max = [0 for _ in range(len(clusters))]
    for i, c1 in enumerate(clusters):
        for j, c2 in enumerate(clusters):
            if i != j:
                try:
                    a = (mean_dist[i] + mean_dist[j]) / metric(centroids[i], centroids[j])
                except:
                    a = (mean_dist[i] + mean_dist[j])
                _max[i] = max(a, _max[i])

    return sum(_max) / len(clusters)


def dunn_index(clusters, metric):
    for cluster in clusters:
        cluster.sort(key=lambda l1: sum([metric(l1, l2) for l2 in cluster if l1 != l2]))
    centroids = [cluster[len(cluster) // 2] for cluster in clusters]

    numerator = float('inf')
    for i, c1 in enumerate(clusters):
        for j, c2 in enumerate(clusters):
            if i < j:
                numerator = min(metric(centroids[i], centroids[j]), numerator)
    denominator = max([len(cluster) for cluster in clusters])

    return numerator / denominator


# ZAD 3
Stwórz stoplistę najczęściej występujących słów i zastosuj ją jako pre-processing dla nazw. Algorytmy klasteryzacji powinny działać na dwóch wariantach: z pre-processingiem i bez pre-processingu.

In [111]:
from collections import Counter

class StopList:
    def __init__(self, text, frequency):
        words = []
        for line in text:
            words += line.split()
        counter = Counter(words)
        self.common = {key for key, value in counter.items() if value >= frequency * len(words)}

    def remove_common(self, text):
        result = []
        for line in text:
            result.append(" ".join([w for w in line.split() if w not in self.common]))
        return result


# ZAD 4
Wykonaj klasteryzację zawartości załączonego pliku (lines.txt) przy użyciu  metryk zaimplementowanych w pkt. 1. Każda linia to adres pocztowy firmy, różne sposoby zapisu tego samego adresu powinny się znaleźć w jednym klastrze.
# ZAD 5
Porównaj jakość wyników sposobami zaimplementowanymi w pkt. 2.

In [112]:
def read_text(file, n):
    with open(file, "r", encoding="UTF-8") as f:
        text = f.read().splitlines()
    return text[:n]

def cluster_lines(lines, metric, threshold):
    clusters = []
    for line in lines:
        found_cluster = False
        for cluster in clusters:
            if any(metric(line, cluster_line) <= threshold for cluster_line in cluster):
                cluster.append(line)
                found_cluster = True
                break
        if not found_cluster:
            clusters.append([line])
    return clusters

In [113]:
lines = read_text("lines.txt", 200)
print("Clustering using LCS:")
lcs_clusters = cluster_lines(lines, lcs, 0.8)
with open("lcs_clusters.txt", "w", encoding="UTF-8") as f:
    for i, cluster in enumerate(lcs_clusters):
        f.write(f"Cluster {i+1}: {cluster}\n")


print("Clustering using Dice coefficient:")
dice_clusters = cluster_lines(lines, dice, 0.8)
with open("dice_clusters.txt", "w", encoding="UTF-8") as f:
    for i, cluster in enumerate(dice_clusters):
        f.write(f"Cluster {i+1}: {cluster}\n")


print("Clustering using Levenshtein distance:")
lev_clusters = cluster_lines(lines, levenstein, 0.6)
with open("lev_clusters.txt", "w", encoding="UTF-8") as f:
    for i, cluster in enumerate(lev_clusters):
        f.write(f"Cluster {i+1}: {cluster}\n")



Clustering using LCS:
Clustering using Dice coefficient:
Clustering using Levenshtein distance:


In [114]:
print("Davies-Bouldin index for LCS clusters:", DaviesBouldin(lcs_clusters, lcs))
print("Davies-Bouldin index for Dice coefficient clusters:", DaviesBouldin(dice_clusters, dice))
print("Davies-Bouldin index for Levenshtein distance clusters:", DaviesBouldin(lev_clusters, levenstein))

Davies-Bouldin index for LCS clusters: 1.1894017735187954
Davies-Bouldin index for Dice coefficient clusters: 0.9421675144477121
Davies-Bouldin index for Levenshtein distance clusters: 0.32822690520049685


In [116]:
lines = read_text("lines.txt", 200)
stoplist = StopList(lines, 0.05)  # create stoplist with words that appear in at least 5% of the lines
lines_without_common = stoplist.remove_common(lines)  # remove common words from the lines

lines = lines_without_common

print("Clustering using LCS:")
lcs_clusters = cluster_lines(lines, lcs, 0.8)
with open("stoplist_lcs_clusters.txt", "w", encoding="UTF-8") as f:
    for i, cluster in enumerate(lcs_clusters):
        f.write(f"Cluster {i+1}: {cluster}\n")

print("Clustering using Dice coefficient:")
dice_clusters = cluster_lines(lines, dice, 0.8)
with open("stoplist_dice_clusters.txt", "w", encoding="UTF-8") as f:
    for i, cluster in enumerate(dice_clusters):
        f.write(f"Cluster {i+1}: {cluster}\n")

print("Clustering using Levenshtein distance:")
lev_clusters = cluster_lines(lines, levenstein, 0.6)
with open("stoplist_lev_clusters.txt", "w", encoding="UTF-8") as f:
    for i, cluster in enumerate(lev_clusters):
        f.write(f"Cluster {i+1}: {cluster}\n")



Clustering using LCS:
Clustering using Dice coefficient:
Clustering using Levenshtein distance:


In [117]:
print("Davies-Bouldin index for LCS clusters with stoplist:", DaviesBouldin(lcs_clusters, lcs))
print("Davies-Bouldin index for Dice coefficient clusters with stoplist:", DaviesBouldin(dice_clusters, dice))
print("Davies-Bouldin index for Levenshtein distance clusters with stoplist:", DaviesBouldin(lev_clusters, levenstein))

Davies-Bouldin index for LCS clusters with stoplist: 1.1656477064993143
Davies-Bouldin index for Dice coefficient clusters with stoplist: 0.9423491875636503
Davies-Bouldin index for Levenshtein distance clusters with stoplist: 0.3267960284204245


# ZAD 6
Czy masz jakiś pomysł na poprawę jakości klasteryzacji w tym zadaniu?