In [57]:
from collections import Counter
from nltk.corpus import wordnet
from cluster_separation import cluster_separator_from_txt_file, read_cluster_file, create_clusters
import os
import re
import importlib
# importlib.reload(cluster_separator_from_txt_file)

In [None]:
def create_clusters(file_path):
    """Parse a cluster text file into a list of word lists.

    Every non-blank line is stripped of double quotes, single quotes and
    square brackets, then split on commas. Lines whose first comma-separated
    field contains the text "Cluster" are treated as headers and skipped;
    every other line becomes one cluster.

    Note: this definition shadows the ``create_clusters`` imported from
    ``cluster_separation`` in the notebook's import cell.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of clusters, each a list of whitespace-trimmed, non-empty
        words. A line such as ``[]`` yields an empty list so that cluster
        positions stay aligned with the header lines of the file.
    """
    # Characters that only belong to the list/str repr, not to the words.
    strip_repr_chars = str.maketrans('', '', '"[]\'')

    clusters = []
    with open(file_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Blank separator lines are not clusters; turning them into
            # [''] entries would shift every following label by one.
            if not stripped:
                continue

            tokens = [token.strip() for token in stripped.translate(strip_repr_chars).split(',')]
            if "Cluster" in tokens[0]:
                continue  # header line, e.g. "Cluster 3"

            # Drop empty tokens left by "[]" or a trailing comma.
            clusters.append([token for token in tokens if token])

    return clusters

In [167]:
# Lemma names considered too generic to be a useful cluster label.
_GENERIC_HYPERNYMS = frozenset(['entity', 'physical_entity', 'person', 'whole', 'organism'])


def _most_common_hypernym_of_word(word, excluded):
    """Return the dominant hypernym lemma name for one word.

    Returns None when WordNet has no synset for the word, and
    "unknown_hypernym" when every hypernym found is in ``excluded``.
    """
    synsets = wordnet.synsets(word)
    if not synsets:
        return None

    # De-duplicate the synsets found on all hypernym paths while keeping the
    # order in which they were first met (senses in WordNet order, each path
    # as returned by hypernym_paths()). A plain set would make tie-breaking
    # depend on hash order, i.e. differ from one kernel to the next.
    unique_hypernyms = dict.fromkeys(
        hypernym
        for synset in synsets
        for path in synset.hypernym_paths()
        for hypernym in path
    )

    # Score = number of distinct synsets sharing the same lemma name.
    # Excluded names are never counted, so they can never be returned.
    name_counts = Counter(
        name
        for name in (hypernym.name().split('.')[0] for hypernym in unique_hypernyms)
        if name not in excluded
    )
    if not name_counts:
        return "unknown_hypernym"
    # most_common() orders equal counts by first occurrence.
    return name_counts.most_common(1)[0][0]


def get_most_common_hypernym(clusters, excluded=_GENERIC_HYPERNYMS):
    """Label each cluster with the hypernym most of its words agree on.

    For every word, all synsets on the hypernym paths of all its WordNet
    senses are collected and grouped by lemma name (the part of the synset
    name before the first dot); the name backed by the most distinct synsets
    is the word's vote. The cluster label is the most frequent vote. Ties at
    both levels go to the candidate encountered first, so results are
    reproducible across runs.

    The original code also called ``set.discard`` with a generator object
    built from ``synset.lexname()``; that call never removed anything and has
    been dropped.

    Args:
        clusters: Iterable of clusters, each an iterable of words.
        excluded: Lemma names that must never be chosen as a word's hypernym.
            Defaults to a small set of very generic WordNet names.

    Returns:
        A list with one label per cluster. A cluster in which no word has a
        WordNet synset gets "unknown_cluster"; a word whose hypernyms are all
        excluded votes "unknown_hypernym".
    """
    labels = []
    for cluster in clusters:
        votes = []
        for word in cluster:
            vote = _most_common_hypernym_of_word(word, excluded)
            if vote is not None:  # words unknown to WordNet do not vote
                votes.append(vote)

        if votes:
            labels.append(Counter(votes).most_common(1)[0][0])
        else:
            labels.append("unknown_cluster")
    return labels


In [None]:
def write_labels_to_file(labels, path_to_file):
    """Append a label to every "Cluster " header line of a file, in place.

    The file is read fully, each line starting with "Cluster " is replaced by
    its stripped text followed by " (<label>)" and a newline, and the result
    is written back to the same path. Labels are consumed in order: the n-th
    header receives ``labels[n-1]``. All other lines are kept unchanged.

    Args:
        labels: Sequence of labels, indexed by header position.
        path_to_file: Path of the file to read and overwrite.

    Raises:
        IndexError: If the file has more header lines than there are labels
            (raised before anything is written).
    """
    with open(path_to_file, 'r') as source:
        original_lines = source.readlines()

    annotated_lines = []
    next_label = 0
    for text in original_lines:
        if text.startswith("Cluster "):
            text = f"{text.strip()} ({labels[next_label]})\n"
            next_label += 1
        annotated_lines.append(text)

    with open(path_to_file, 'w') as target:
        target.writelines(annotated_lines)


In [157]:
# Root folder of the project data.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
HOME_PATH = r"C:\Users\Nauel\Desktop\Lavoro UNI"

# Cluster counts; each value is substituted into the file names by the
# labelling loop further down.
n_clusters = [
    5, 8, 10, 15, 20,
    80, 100, 125, 150, 175,
    200, 250, 300,
]

# Initial input/output paths (the 80-cluster file); the labelling loop
# reassigns both names on every iteration.
PATH_INPUT = os.path.join(
    HOME_PATH,
    "cluster_loop_clean",
    "definitions",
    "Clusters con 80 gruppi & definizioni.txt",
)
PATH_OUTPUT = os.path.join(
    HOME_PATH,
    "cluster_loop_clean",
    "labels",
    "Clusters con 80 gruppi & definizioni.txt",
)


In [168]:
# For every configured cluster count: parse the clusters from the
# "definitions" file, derive one hypernym label per cluster, and annotate the
# same-named file in the "labels" folder.
# NOTE(review): write_labels_to_file reads PATH_OUTPUT before rewriting it, so
# the file in "labels" must already exist; re-running this cell appends a
# second label to each header.
for n_cluster in n_clusters:
    _file_name = f"Clusters con {n_cluster} gruppi & definizioni.txt"
    PATH_INPUT = os.path.join(HOME_PATH, "cluster_loop_clean", "definitions", _file_name)
    PATH_OUTPUT = os.path.join(HOME_PATH, "cluster_loop_clean", "labels", _file_name)

    clusters = create_clusters(PATH_INPUT)
    labels = get_most_common_hypernym(clusters)
    write_labels_to_file(labels, PATH_OUTPUT)
