## Zadanie nr 5 - klasteryzacja 
### Jakub Janicki

In [1]:
import codecs
import itertools
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
import tabulate as tabulate
import time

### 1. Metrics

In [2]:

def delta(x, y):
    return 1 if x == y else 0


def lcs(x, y):
    path = [[0 for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    longest = 0

    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            path[i][j] = path[i - 1][j - 1] + delta(x[i - 1], y[j - 1])
            if path[i][j] > longest:
                longest = path[i][j]

    return 1 - longest / max(len(x), len(y))


def get_n_grams(x, n):
    n_grams = dict()
    for i in range(len(x) + 1 - n):
        n_gram = x[i:i + n]
        if n_gram in n_grams.keys():
            n_grams[n_gram] += 1
        else:
            n_grams[n_gram] = 1

    return n_grams


def cosine_metrics(x, y, n=2):
    n_grams_x = get_n_grams(x, n)
    n_grams_y = get_n_grams(y, n)
    prod = 0
    for n_gram in n_grams_x.keys():
        if n_gram in n_grams_y.keys():
            prod += n_grams_x[n_gram] * n_grams_y[n_gram]

    return prod / (len(n_grams_x) + len(n_grams_y))


def dice(x, y, n=2):
    n_grams_x = set(get_n_grams(x, n))
    n_grams_y = set(get_n_grams(y, n))
    prod = n_grams_x & n_grams_y

    return 1 - len(prod) * 2 / (len(n_grams_x) + len(n_grams_y))


def levenstein(x, y):
    edit_dist = [[0 for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    for i in range(1, len(x) + 1):
        edit_dist[i][0] = i
    for i in range(1, len(y) + 1):
        edit_dist[0][i] = i
    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            edit_dist[i][j] = min(edit_dist[i - 1][j - 1] + delta(x[i - 1], y[j - 1]),
                                  edit_dist[i - 1][j] + 1,
                                  edit_dist[i][j - 1] + 1)
    return edit_dist[-1][-1] / max(len(x), len(y))


### 2. Clusters rate

In [3]:
def DaviesBouldin(clusters, metric):
    for cluster in clusters:
        cluster.sort(key=lambda l1: sum([metric(l1, l2) for l2 in cluster if l1 != l2]))
    centroids = [cluster[len(cluster) // 2] for cluster in clusters]

    mean_dist = []
    for cluster in clusters:
        s = 0
        n = len(cluster)
        for l1, l2 in itertools.combinations(cluster, 2):
            s += metric(l1, l2)
        if n == 1:
            mean_dist.append(0)
        else:
            mean_dist.append(s / (n * (n - 1) / 2))

    _max = [0 for _ in range(len(clusters))]
    for i, c1 in enumerate(clusters):
        for j, c2 in enumerate(clusters):
            if i != j:
                try:
                    a = (mean_dist[i] + mean_dist[j]) / metric(centroids[i], centroids[j])
                except:
                    a = (mean_dist[i] + mean_dist[j])
                _max[i] = max(a, _max[i])

    return sum(_max) / len(clusters)


def dunn_index(clusters, metric):
    for cluster in clusters:
        cluster.sort(key=lambda l1: sum([metric(l1, l2) for l2 in cluster if l1 != l2]))
    centroids = [cluster[len(cluster) // 2] for cluster in clusters]

    numerator = float('inf')
    for i, c1 in enumerate(clusters):
        for j, c2 in enumerate(clusters):
            if i < j:
                numerator = min(metric(centroids[i], centroids[j]), numerator)
    denominator = max([len(cluster) for cluster in clusters])

    return numerator / denominator

### 3. Stoplist

In [4]:
def get_from_file(path):
    with open("assets/lines.txt", "r") as file:
        return file.read()

In [5]:
def stoplist(text):
    tokens = word_tokenize(text)
    freq = FreqDist(tokens)
    freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    freq = freq[:int(0.03 * len(freq))]
    to_delete = [x for x, y in freq if len(x) <5]
    return to_delete

In [6]:
def prepare(text):
    vocab = Language(Vocab()).vocab
    tokenizer = Tokenizer(vocab)
    paragraphs = [p for p in text.split('\n') if p]
    line_tokens = []
    for paragraph in paragraphs:
        line_tokens.append(word_tokenize(paragraph))

    to_delete = stoplist(text)
    res = []
    for token_line in line_tokens:
        for token in token_line:
            if token not in to_delete:
                res.append(token)
        res.append("\n")

    text = TreebankWordDetokenizer().detokenize(res)
    return text

### 4. Clustering

In [7]:
def clusters(new_text, metric, sigma):
    result = dict()
    lines = new_text.split("\n")

    for i, el in enumerate(lines):
        result[el] = i
    combinations = list(itertools.combinations(lines, 2))
    for line1, line2 in combinations:
        if metric(line1, line2) < sigma:
            result[line2] = result[line1]

    re = dict()
    for el, val in result.items():
        if val not in re.keys():
            re[val] = [el]
        else:
            re[val].append(el)
    return re

### 5. Testing

In [8]:
text = get_from_file("assets/lines.txt")

In [9]:
print(stoplist(text))

[',', "''", '.', ':', 'LTD', '(', ')', 'TEL', '+7', '812', '``', 'LLC', 'FAX']


In [10]:
c1 = clusters(prepare(text), lcs, 0.65).values()
for key in c1:
    for l in key:
        print(l)
    print()

/11692589 RD TUNA CANNERS PORTION 1004 SIAR NORTH COAST ROAD P.O.BOX 2113 MADANG PAPUA NEW GUINEA 

 ''PA INTERIOR BOLSHAYA LUBYANKA STREET 16/4 MOSCOW 101000 RUSSIA INN/KPP 7704550148//770801001 495-984-8611 

 ''SSONTEX Sp.ZO.O.IMPORT-EXPORTUL PRZECLAWSKA 5 03-879 WARSZAWA POLAND NIP 113-01-17-669 
 ''SSONTEX SP.ZO.O.IMPORT-EXPORT UL PRZECLAWSKA 5 03-879 WARSZAWA POLAND NIP 113-01-17-669 TEL./FAX :0048 022 217 6532--
 SSONTEX SP.ZO.O IMPORT-EXPORT 03-879 WARSZAWA UL PRZECLAWSKA 5 NIP:113-01-17-669 

 ''TOPEX SP Z O.O SPOLKA KOMANDYTOWA UL POGRANICZNA 2/4 02-285 WARSZAWA POLAND 

 'MASTER PLUS CO.' 143000 RUSSIA MO ODINSOVO MOJAISKOE SHOSSE,153G +7495 7273939 

 2TIGERS GROUP LIMITED ROOM 504 JINSHAZHOU SHANGSHUI ROAD GUANGZHOU 510160 

 ALDETRANS 105066 MOSCOW RUSSIA TOKMAKOV LANE 11 495 641-03-89 

 A-LIFT JSC 1 PROSPEKT MARSHALA ZHUKOVA MOSCOW 123308 RUSSIA T 495 784-7961 

 ALISA 1/5 Derbenevskaya str. Moscow Russia Tel./Fax 495 987-13-07 postal code 115114 

 ALLIANCE-TRADE INN 7

In [11]:
res = []
c = clusters(prepare(text), lcs, 0.65).values()
d1 = DaviesBouldin(c, lcs)
d2 = dunn_index(c, lcs)
res.append(["lcs", d1,d2])


c = clusters(prepare(text), cosine_metrics, 0.05).values()
d1 = DaviesBouldin(c, cosine_metrics)
d2 = dunn_index(c, cosine_metrics)
res.append(["cosine", d1,d2])

c = clusters(prepare(text), dice, 0.3).values()
d1 = DaviesBouldin(c, dice)
d2 = dunn_index(c, dice)
res.append(["dice", d1, d2])

c = clusters(prepare(text), levenstein, 0.03).values()
d1 = DaviesBouldin(c, levenstein)
d2 = dunn_index(c, levenstein)
res.append(["levenstein", d1, d2])


print(tabulate.tabulate(res, headers=["metric", "davies_bouldin", "dunn"],
                    tablefmt="fancy_grid"))


╒════════════╤══════════════════╤═════════════╕
│ metric     │   davies_bouldin │        dunn │
╞════════════╪══════════════════╪═════════════╡
│ lcs        │         0.854475 │ 0.0942249   │
├────────────┼──────────────────┼─────────────┤
│ cosine     │         2.31604  │ 0.000129199 │
├────────────┼──────────────────┼─────────────┤
│ dice       │         0.45015  │ 0.0437318   │
├────────────┼──────────────────┼─────────────┤
│ levenstein │         0.810033 │ 0.0047619   │
╘════════════╧══════════════════╧═════════════╛


In [12]:
res = []

start = time.time()
c = clusters(prepare(text), lcs, 0.65).values()
end = time.time()
t1 = end - start 
start = time.time()
d1 = DaviesBouldin(c, lcs)
end = time.time()
t2 = end - start 
start = time.time()
d2 = dunn_index(c, lcs)
end = time.time()
t3 = end - start 
res.append(["lcs", t1,t2,t3])

start = time.time()
c = clusters(prepare(text), cosine_metrics, 0.05).values()
end = time.time()
t1 = end - start 
start = time.time()
d1 = DaviesBouldin(c, cosine_metrics)
end = time.time()
t2 = end - start 
start = time.time()
d2 = dunn_index(c, cosine_metrics)
end = time.time()
t3 = end - start 
res.append(["cosine", t1,t2,t3])

start = time.time()
c = clusters(prepare(text), dice, 0.3).values()
end = time.time()
t1 = end - start 
start = time.time()
d1 = DaviesBouldin(c, dice)
end = time.time()
t2 = end - start 
start = time.time()
d2 = dunn_index(c, dice)
end = time.time()
t3 = end - start 
res.append(["dice", t1, t2,t3])

start = time.time()
c = clusters(prepare(text), levenstein, 0.05).values()
end = time.time()
t1 = end - start 
start = time.time()
d1 = DaviesBouldin(c, levenstein)
end = time.time()
t2 = end - start 
start = time.time()
d2 = dunn_index(c, levenstein)
end = time.time()
t3 = end - start 
res.append(["levenstein", t1, t2,t3])


print(tabulate.tabulate(res, headers=["metric","clustering", "davies_bouldin", "dunn"],
                    tablefmt="fancy_grid"))

╒════════════╤══════════════╤══════════════════╤════════════╕
│ metric     │   clustering │   davies_bouldin │       dunn │
╞════════════╪══════════════╪══════════════════╪════════════╡
│ lcs        │    36.9807   │        32.1882   │ 15.5649    │
├────────────┼──────────────┼──────────────────┼────────────┤
│ cosine     │     0.699987 │         0.334777 │  0.0601475 │
├────────────┼──────────────┼──────────────────┼────────────┤
│ dice       │     0.65501  │         0.515242 │  0.264692  │
├────────────┼──────────────┼──────────────────┼────────────┤
│ levenstein │    72.6199   │        11.245    │  2.05416   │
╘════════════╧══════════════╧══════════════════╧════════════╛
