In [1]:
import numpy as np
from collections import Counter
from nltk import ngrams
import string
from scipy.cluster.hierarchy import fclusterdata as fancy_cluster

## 1. Implementacja trzech wybranych metryk

In [2]:
def get_ngrams(x, n):
    ngram_counts = Counter()

    for i in range(len(x)-n+1):
        ngram = tuple(x[i:i+n])
        ngram_counts[ngram] += 1

    return ngram_counts

In [3]:
def euclidean(x, y, num=2):
    x_counter = Counter(get_ngrams(x, num))
    y_counter = Counter(get_ngrams(y, num))
    all_keys = set(x_counter.keys()).union(y_counter.keys())
    val = 0

    for key in all_keys:
        x_count = x_counter.get(key, 0)
        y_count = y_counter.get(key, 0)
        val += (x_count - y_count)**2

    return np.sqrt(val)

In [4]:
def cosine(x, y, num=2):
    x_counter = Counter(get_ngrams(x, num))
    y_counter = Counter(get_ngrams(y, num))
    ngrams_inter = set(x_counter.keys())&set(y_counter.keys())

    prod = sum(x_counter[ngram] * y_counter[ngram] for ngram in ngrams_inter)
    norm_v1 = np.linalg.norm(list(x_counter.values()))
    norm_v2 = np.linalg.norm(list(y_counter.values()))

    return 1 - prod/(norm_v1 * norm_v2)

In [5]:
def dice(x, y, num=2):
    ngrams_x = set(get_ngrams(x, min(num, len(x), len(y))))
    ngrams_y = set(get_ngrams(y, min(num, len(x), len(y))))

    return 1 - 2*len(ngrams_x&ngrams_y)/(len(ngrams_x) + len(ngrams_y))

## 2. sposób oceny jakości klasteryzacji - indeks Daviesa-Bouldina

In [6]:
def calculate_cluster_distance(cluster_a, cluster_b, metric):
    total_distance = 0
    metric_val = 2
    total_clusters_size = len(cluster_a) * len(cluster_b)

    for text_a in cluster_a:
        for text_b in cluster_b:
            total_distance += metric(text_a, text_b, metric_val)

    return total_distance / total_clusters_size


def davies_bouldin_index(clusters, distance_metric):
    davies_bouldin_index = 0
    num_clusters = len(clusters)
    sigmas = [0]*num_clusters
    metric_val = 2

    for i in range(num_clusters):
        cluster_i = clusters[i]
        num_texts_i = len(cluster_i)
        if num_texts_i == 1:
            continue

        for j in range(i + 1, num_clusters):
            cluster_j = clusters[j]
            num_texts_j = len(cluster_j)
            if num_texts_j == 1:
                continue

            total_distance = 0
            for text_i in cluster_i:
                for text_j in cluster_j:
                    total_distance += distance_metric(text_i, text_j, metric_val)

            average_distance = total_distance / (num_texts_i * num_texts_j)
            sigmas[i] += average_distance
            sigmas[j] += average_distance

    for i in range(num_clusters):
        cluster_i = clusters[i]
        sigma_i = sigmas[i]
        max_value = 0
        
        for j in range(num_clusters):
            if i == j:
                continue
                
            cluster_j = clusters[j]
            sigma_j = sigmas[j]
            distance = calculate_cluster_distance(cluster_i, cluster_j, distance_metric)
            max_value = max(max_value, (sigma_i + sigma_j) / distance)
            
        davies_bouldin_index += max_value

    return davies_bouldin_index / num_clusters

In [7]:
davies_bouldin_index([["GHI", "JKL"], ["ghi", "jkl"]], cosine)

2.0

## 3. Stoplista najczęściej występujących słów (jako preprocessing dla nazw)

In [8]:
def stoplist_modify_text(text):
    processed_text = ""
    frequency_vectors = []
    
    for character in text:
        if character not in string.punctuation:
            processed_text += character.lower()

    character_counts = Counter()
    character_counts.update(processed_text)
    vocabulary = list(character_counts)
    processed_text = processed_text.splitlines()
    
    for line in processed_text:
        line_counts = Counter()
        line_counts.update(line)
        vector = []
        for character in vocabulary:
            vector.append(line_counts[character])
        frequency_vectors.append(vector)
    
    return frequency_vectors, processed_text

## 4. Klasteryzacja zawartości pliku `lines.txt` przy użyciu zaimplementowanych wcześniej metryk

In [9]:
with open("lines.txt", "r", encoding="UTF-8") as f:
        text = f.read()

In [10]:
text[:1000]

'/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA\n\'\'PA INTERIOR\'\' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611\n\'\'SSONTEX\'\'  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669\n\'\'SSONTEX\'\'SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--\n\'\'TOPEX SP. Z O.O.\'\' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND\n\'MASTER PLUS CO.,LTD.\' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939\n"2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160\n"ALDETRANS" LLC, 105066, MOSCOW, RUSSIA, TOKMAKOV LANE, 11. TEL:+7(495)641-03-89\n"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961\n"ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 postal code: 115114\n"ALLIANCE-TRADE" L

In [11]:
def get_clusters(modified_text, text, metric):
    clust_list = []
    clusters_from_text = []
    
    data_cluster = fancy_cluster(modified_text, t=1, metric=metric)
    max_cluster = max(data_cluster)
    
    for i in range(1, max_cluster + 1):
        inds = []
        cluster_texts = []
        text_parts = []
        
        for index, cluster_id in enumerate(data_cluster):
            if cluster_id == i:
                inds.append(index)
    
        for index in inds:
            text_parts.append(modified_text[index])
        
        clust_list.append(text_parts)
        for index in inds:
            cluster_texts.append(text[index])

        clusters_from_text.append(cluster_texts)
    
    return clust_list, clusters_from_text

In [12]:
def cluster_text_slice(text, start, end, metric):
    modified_text, processed_text = stoplist_modify_text(text)
    sliced_text = processed_text[start:end]

    clusters = get_clusters(modified_text[start:end], sliced_text, metric)
    for cluster_index, cluster_items in enumerate(clusters[1][1:], start=1):
        print(f"klaster nr {cluster_index}:")
        for item in cluster_items:
            print(item)
        print()

In [13]:
cluster_text_slice(text, 0, 100, euclidean)

klaster nr 1:
sevrollsystem spz oo plac czerwca 1976 roku nr 1b  02495 warszawa ursus  tel 022 3123139
sevrollsystem spz oo plac czerwca 1976 roku nr 1b  02495 warszawa ursus   tel 022 3123139

klaster nr 2:
nfc ltd gapsalskaya str 5 198035 st petersburg russia tel 7 812 327 77 41  fax 7 812 327 77 29
nfcltd gapsalskaya str 5 198035stpetersburgrussia tel78123277741 fax 78123277729
nfc ltd gapsalskaya str 5 198035 stpetersburg russia tel 7 812327 77 41 fax 7 812 327 77 29

klaster nr 3:
kobilight spolka komandytowa ultboya zelenskiego 25 35105 rzeszowpoland nip8133499669
kobilight spolka komandytowa ultboya zelenskiego 25 35105 rzeszowpoland nip8133499699

klaster nr 4:
meat trade company stpetersburgltd 191015 saintpetersburg  shpalernaya street 51 russian federation tel 7 812 3294262 pylaevatatiana
meat trade company stpetersburgltd 191015 saintpetersburg shpalernaya street 51 russian federation tel 7 812 3294262 pashkevichanna
meat trade company stpetersburgltd 191015 saintpetersburg