In [1]:
import random

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

random.seed(42)

In [2]:
from habr_article_analyzer.data import download_dataset, load_dataset_from_zst

download_dataset()
dataset = load_dataset_from_zst(rows_num=10000)

Reading records: 9999it [00:06, 1539.36it/s]


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               10000 non-null  int64 
 1   language         10000 non-null  object
 2   url              10000 non-null  object
 3   text_markdown    10000 non-null  object
 4   text_html        10000 non-null  object
 5   lead_markdown    9993 non-null   object
 6   lead_html        10000 non-null  object
 7   type             10000 non-null  object
 8   labels           10000 non-null  object
 9   original_author  973 non-null    object
 10  original_url     973 non-null    object
 11  time_published   10000 non-null  int64 
 12  author           10000 non-null  object
 13  title            10000 non-null  object
 14  statistics       10000 non-null  object
 15  hubs             10000 non-null  object
 16  flows            10000 non-null  object
 17  tags             10000 non-null 

In [4]:
from habr_article_analyzer.targets import Target

hubs = Target(dataset, "hubs")
flows = Target(dataset, "flows")
tags = Target(dataset, "tags", sparse=True)
labels = Target(dataset, "labels")

In [5]:
def fix_len(word: str, n=10):
    if len(word) >= n + 3:
        return word[:n] + "..."
    else:
        return word + " " * (n - (len(word)))


def info(self: Target, top=20):
    print(f"Target: {self.column_name}")
    print(f"There are {len(self.labels)} unique labels")
    sorted_by_size = [
        (label, size)
        for label, size in sorted(
            zip(self.labels, self.get_sizes()), key=lambda x: x[1], reverse=True
        )
    ]

    def print_list(sorted_group):
        for i, (label, size) in enumerate(sorted_group):
            coverage = self.get_coverage(label)
            print(f"{i}:\t{fix_len(label)}\t{size}\t{coverage*100:.2f}%")

    if len(self.labels) > top * 2:
        print(f"Top {top} labels by their size:")
        print_list(sorted_by_size[:top])
        print(f"Tail {top} labels by their size:")
        print_list(sorted_by_size[-top:])
    else:
        print(f"Labels by their size:")
        print_list(sorted_by_size)

    print(f"Sum coverage by top-N labels:")
    percent_to_cover = [50, 75, 80, 90, 95, 99]

    for percent in percent_to_cover:
        l, r = 0, len(self)
        while l < r:
            m = (l + r) // 2
            coverage = self.get_coverage([label for label, _ in sorted_by_size[:m]])
            if coverage * 100 <= percent:
                l = m + 1
            else:
                r = m
        print(f"Top-{m} labels cover ~{percent}%")

In [6]:
info(hubs)

Target: hubs
There are 1127 unique labels
Top 20 labels by their size:
0:	closet    	900	9.00%
1:	itcompanies	690	6.90%
2:	infosecurity	617	6.17%
3:	programming	548	5.48%
4:	webdev    	494	4.94%
5:	popular_sc...	463	4.63%
6:	gadgets   	302	3.02%
7:	javascript	302	3.02%
8:	finance   	249	2.49%
9:	it-infrast...	239	2.39%
10:	career    	238	2.38%
11:	hardware  	234	2.34%
12:	python    	227	2.27%
13:	business-l...	224	2.24%
14:	open_source	215	2.15%
15:	gamedev   	213	2.13%
16:	social_net...	211	2.11%
17:	DIY       	210	2.10%
18:	android_dev	210	2.10%
19:	machine_le...	210	2.10%
Tail 20 labels by their size:
0:	vezetvsem 	1	0.01%
1:	videointel...	1	0.01%
2:	virt2real 	1	0.01%
3:	visiology 	1	0.01%
4:	vivid_money	1	0.01%
5:	vps_house 	1	0.01%
6:	vrdevice  	1	0.01%
7:	vsk_insura...	1	0.01%
8:	wayforpay 	1	0.01%
9:	wayray    	1	0.01%
10:	webo      	1	0.01%
11:	westcomp  	1	0.01%
12:	what3words	1	0.01%
13:	ximad     	1	0.01%
14:	xslt      	1	0.01%
15:	yadro     	1	0.01%
16:	youvend   	1	0.01%


In [7]:
info(flows)

Target: flows
There are 6 unique labels
Labels by their size:
0:	develop   	4417	44.17%
1:	popsci    	4103	41.03%
2:	management	1626	16.26%
3:	admin     	1034	10.34%
4:	marketing 	541	5.41%
5:	design    	388	3.88%
Sum coverage by top-N labels:
Top-2 labels cover ~50%
Top-2 labels cover ~75%
Top-2 labels cover ~80%
Top-4 labels cover ~90%
Top-5 labels cover ~95%
Top-5 labels cover ~99%


In [8]:
info(tags)

Target: tags
There are 23259 unique labels
Top 20 labels by their size:
0:	javascript	177	1.77%
1:	google    	166	1.66%
2:	android   	165	1.65%
3:	python    	161	1.61%
4:	linux     	145	1.45%
5:	разработка	141	1.41%
6:	java      	136	1.36%
7:	apple     	122	1.22%
8:	php       	122	1.22%
9:	microsoft 	121	1.21%
10:	программир...	121	1.21%
11:	информацио...	109	1.09%
12:	Google    	100	1.00%
13:	игры      	96	0.96%
14:	машинное о...	95	0.95%
15:	open source	85	0.85%
16:	стартапы  	84	0.84%
17:	безопасность	79	0.79%
18:	дизайн    	78	0.78%
19:	ios       	75	0.75%
Tail 20 labels by their size:
0:	яндекс.бра...	1	0.01%
1:	яндекс.диск	1	0.01%
2:	яндекс.драйв	1	0.01%
3:	яндекс.кал...	1	0.01%
4:	яндекс.лок...	1	0.01%
5:	яндекс.мет...	1	0.01%
6:	яндекс.метро	1	0.01%
7:	яндекс.нав...	1	0.01%
8:	яндекс.обл...	1	0.01%
9:	яндекс.под...	1	0.01%
10:	яндекс.сайт	1	0.01%
11:	яндекс.спр...	1	0.01%
12:	янки      	1	0.01%
13:	японские у...	1	0.01%
14:	японские у...	1	0.01%
15:	японский д...	1	0.01%
16:	яр

KeyboardInterrupt: 

In [9]:
info(labels)

Target: labels
There are 8 unique labels
Labels by their size:
0:	translation	973	9.73%
1:	sandbox   	808	8.08%
2:	recovery  	212	2.12%
3:	technotext...	20	0.20%
4:	technotext...	20	0.20%
5:	technotext...	19	0.19%
6:	seasonJava...	2	0.02%
7:	seasonDm2022	1	0.01%
Sum coverage by top-N labels:
Top-7 labels cover ~50%
Top-7 labels cover ~75%
Top-7 labels cover ~80%
Top-7 labels cover ~90%
Top-7 labels cover ~95%
Top-7 labels cover ~99%


### Выбор типа метки

Выше приведены сводки информации по каждому из четырех видов текстовых меток: `['hubs', 'tags', 'labels', 'flows']`. 
Видно, что `['hubs', 'tags']` - метки с большим количеством классов относящихся к содержанию текста, а `['labels', 'flows']` - скорее технические метки с маленьким количеством классов. Хотя `flows` отражает некоторую категорию текста, эта категория слишком абстрактна (sci-fi, develop) для нашей задачи.

Поэтому, в контексте нашей задачи, интересны именно метки `hubs` и `tags`. 
Видно, что для покрытия более 95% примеров в датасете, нужно будет использовать примерно 200 и 2000 классов из каждого типа меток. 
Это звучит как слишком большое число меток, при этом не достигается полного покрытия. 

Далее посмотрим на примеры, которые не покрыты этим множествами меток, а так же рассмотрим рандомные примеры из каждого из этих множеств.

In [10]:
def labels_sample(self: Target, n_samples: int = 20):
    print(f"Random samples for '{self.column_name}':")
    print("\n".join(random.sample(self.labels, n_samples)))

In [11]:
labels_sample(hubs)

Random samples for 'hubs':
cyberleninka
akbarsdigital
kokoc_group
intersystems
htmlacademy
drupal
cpp
yota
cloudsnn
saas
analogbytes
allcorrect
comet-server
host-tracker
igromagaz
uprock
alawar
google_chrome
rusonyx
hpe


In [12]:
labels_sample(tags)

Random samples for 'tags':
гост р 34.10-2001
предсказательная аналитика
prince2
3d печать
curl
широкополосный интернет
большие данные
vm/sp cms rexx
premium account
convolutional neural network
inkscape; vector graphics
vector magic
TechCrunch
SP Manager Lite 5 бесплатно
ПСПО
Smart TV
БД
web design
работа со списками
opera mini 4


Видно, что среди тегов встречаются как на русском языке, так и на английском. Проверим есть ли дубли на нескольких примерах.

In [13]:
pairs = [
    ["deep learning", "нейронные сети"],
    ["нейросети", "нейронные сети"],
    ["deep learning", "глубокое обучение"],
    ["develop", "разработка"],
    ["network", "сеть"],
    ["AI", "ИИ"],
    ["machine learning", "машинное обучение"],
]

for pair in pairs:
    en, ru = pair
    try:
        en_cov = tags.get_coverage(en) * 100
        ru_cov = tags.get_coverage(ru) * 100
        both_cov = tags.get_coverage(pair) * 100
        print(f"For pair: {pair}")
        print(f"{fix_len(en)}\t{en_cov}")
        print(f"{fix_len(ru)}\t{ru_cov}")
        print(f"{fix_len("Both")}\t{both_cov}")
    except Exception as e:
        print("Got an exception: {}".format(e))

For pair: ['deep learning', 'нейронные сети']
deep learn...	0.18
нейронные ...	0.27
Both      	0.42
For pair: ['нейросети', 'нейронные сети']
нейросети 	0.33999999999999997
нейронные ...	0.27
Both      	0.5700000000000001
For pair: ['deep learning', 'глубокое обучение']
deep learn...	0.18
глубокое о...	0.1
Both      	0.24
Got an exception: 'develop not in labels'
For pair: ['network', 'сеть']
network   	0.05
сеть      	0.16999999999999998
Both      	0.22
For pair: ['AI', 'ИИ']
AI        	0.06999999999999999
ИИ        	0.13999999999999999
Both      	0.2
For pair: ['machine learning', 'машинное обучение']
machine le...	0.44
машинное о...	0.95
Both      	1.2


Видно, что теги не структурированы - встречаются дубли как на одном (`нейросети` - `нейронные сети`), так и на разных языках (`machine learning` - `машинное обучение`). В таком виде эти данные не очень хорошо подходят для обучения модели, так как эти классы не будут отличаться по смыслу, при этом будут разными классами, что усложняет обучение.

In [14]:
def get_best_labels_for_uncovered_samples(
    self: Target = hubs, top_n_selected=200, n_sample=20
):
    sorted_by_size = [
        label
        for label, _ in sorted(
            zip(self.labels, self.get_sizes()), key=lambda x: x[1], reverse=True
        )
    ]

    uncovered = np.logical_not(self[sorted_by_size[:top_n_selected]].any(axis=1))
    coverages_count = self[self.labels][uncovered].sum(axis=0)
    n_best = [self.labels[i] for i in np.argsort(coverages_count)][:20]

    print(f"Target: {self.column_name}")

    for i, label in enumerate(n_best):
        size = self.get_sizes(label)
        coverage = self.get_coverage(label)
        print(f"{i}:\t{fix_len(label, 16)}\t{size}\t{coverage*100:.2f}%")

In [15]:
get_best_labels_for_uncovered_samples(self=hubs, top_n_selected=200, n_sample=20)

Target: hubs
0:	mr_gefest       	1	0.01%
1:	mobile_dimension	1	0.01%
2:	mobile_one      	2	0.02%
3:	mobile_testing  	26	0.26%
4:	mobileanalytics 	23	0.23%
5:	mobileup        	3	0.03%
6:	mobio           	5	0.05%
7:	modesco         	1	0.01%
8:	moex            	2	0.02%
9:	momondo         	1	0.01%
10:	monandco        	1	0.01%
11:	mono            	1	0.01%
12:	moonmodule      	1	0.01%
13:	moysklad        	1	0.01%
14:	mobile_dev      	210	2.10%
15:	mygames         	2	0.02%
16:	nag             	2	0.02%
17:	natural_language...	26	0.26%
18:	navicon         	1	0.01%
19:	ncloudtech      	8	0.08%


In [16]:
get_best_labels_for_uncovered_samples(self=tags, top_n_selected=2000, n_sample=20)

KeyboardInterrupt: 

В `tags` встречаются довольно разнообразные лейблы. Я попробовал написать статью (тестовую) на habr.com для того, чтобы разобраться, как проставляются `hubs` (хабы) и `tags` (ключевые слова или теги). 

Теги можно выбрать из выпадающего списка, а можно - добавить свой новый тег. 
Хабы же можно выбирать из большого, но фиксированного списка.

В таком случае для обучения классификатора лучше подходят хабы. Но при этом их все ещё слишком много. 

Одним из решений в таком случае было бы оставить топ-100/топ-200 хабов, обеспечивающие наибольшее покрытие статей. 
Но я хочу рассмаотреть альтернативный способ, позволяющий не жертвовать покрытием совсем - кластеризовать все хабы на N кластеров и выбрать эти группы в качестве таргетов для модели. 

В данном случае у нас нет готовых (или зафиксированных) эмбеддингов для кластеризации, что не позволяет нам использовать большинство метрик кластеризации для сравнения качества полученных групп лейблов. 

Поэтому я предлагаю сделать довольно неприятную, зато честную работу: 
- разметить 40 пар лейблов бинарно: должны они попадать в один класс или нет
- домайнить позитивов в эту разметку, посредством тщательного анализа лейблов

In [17]:
import json

sample1, sample2 = random.sample(hubs.labels, 40), random.sample(hubs.labels, 40)

print(json.dumps(list(zip(sample1, sample2)), indent=4))

[
    [
        "asp",
        "jquery"
    ],
    [
        "speechpro",
        "brainfuck"
    ],
    [
        "wolfram",
        "haulmont"
    ],
    [
        "design",
        "mneniya_pro"
    ],
    [
        "pochtoy",
        "health"
    ],
    [
        "cgi",
        "ulmart"
    ],
    [
        "localization",
        "quarkly"
    ],
    [
        "otkritie_broker",
        "speakasap"
    ],
    [
        "getmatch",
        "e-legion"
    ],
    [
        "buruki",
        "javascript"
    ],
    [
        "astronomy",
        "drupal"
    ],
    [
        "icanchoose",
        "ios_dev"
    ],
    [
        "lifehacks",
        "x-com"
    ],
    [
        "cian",
        "ivideon"
    ],
    [
        "igromagaz",
        "sciberia"
    ],
    [
        "courson",
        "rambler"
    ],
    [
        "popular_science",
        "otus"
    ],
    [
        "kryptonite",
        "hosting-cafe"
    ],
    [
        "softmart",
        "droider"
    ],
    [
        

```python
[
    [
        "microformats",
        "edge",
        0

    ],
    [
        "masterkit",
        "medgadgets",
        1
    ],
    [
        "asus",
        "social_networks",
        0
    ],
    [
        "skyeng",
        "cpu",
        0
    ],
    [
        "alconost",
        "metrotek",
        0.5
    ],
    [
        "vertdider",
        "wunderfund",
        0.5
    ],
    [
        "biology",
        "beeline",
        0
    ],
    [
        "cloudsnn",
        "virtualization",
        0.5
    ],
    [
        "plarium",
        "pixonic",
        1
    ],
    [
        "compilers",
        "gadgets",
        0
    ],
    [
        "study",
        "unisender",
        0
    ],
    [
        "refactoring",
        "postgresql",
        0
    ],
    [
        "itcompanies",
        "pay_system",
        0.5
    ],
    [
        "owasp",
        "cybersport",
        0
    ],
    [
        "antikvariat",
        "cloudsnn",
        0
    ],
    [
        "image_processing",
        "html5",
        0
    ],
    [
        "ie",
        "complete_code",
        0
    ],
    [
        "learning_languages",
        "naumen",
        0
    ],
    [
        "mongodb",
        "sport_programming",
        0
    ],
    [
        "e-legion",
        "yandex_api",
        0
    ],
    [
        "network_standarts",
        "mssql",
        0
    ],
    [
        "3d_graphics",
        "xml",
        0
    ],
    [
        "sberdevices",
        "1C",
        0.5
    ],
    [
        "bitrix",
        "ozontech",
        0.5
    ],
    [
        "regex",
        "getwear",
        0
    ],
    [
        "vtb",
        "macloud",
        0
    ],
    [
        "nano",
        "AJAX",
        0
    ],
    [
        "skillbox",
        "biotech"
    ],
    [
        "energy",
        "youvend"
    ],
    [
        "sound",
        "hi",
        0
    ],
    [
        "powershell",
        "vivaldi",
        0
    ],
    [
        "growthhacking",
        "typography",
        0
    ],
    [
        "biotech",
        "tablum"
    ],
    [
        "funcprog",
        "game_testing",
        0
    ],
    [
        "ivideon",
        "doctrine",
        0
    ],
    [
        "community_management",
        "analysis_design",
        0.5
    ],
    [
        "kebrum",
        "domclick",
        0
    ],
    [
        "1cloud",
        "notebooks",
        0
    ],
    [
        "visual_programming",
        "arttel",
        0
    ],
    [
        "sberbank",
        "ashmanov_net",
        0
    ]
]
```

После такой небольшой разметки, хочется выделить кластера компаний (и вероятно, отдельно, компаний занимающихся облачными решениями), геймдев и приложения для разработки. 


In [18]:
sample1 = random.sample(hubs.labels, 40)

print(json.dumps(list(sample1), indent=4))

[
    "sendpulse",
    "etmc_exponenta",
    "solarsecurity",
    "3cx",
    "izine",
    "uml",
    "fvdmedia",
    "varonis",
    "crowdsourcing",
    "macloud",
    "google_chrome",
    "energy",
    "phpshop",
    "facebook",
    "xakep",
    "webassembly",
    "1c",
    "mootools",
    "toster",
    "acelab",
    "cyberpunk",
    "owasp",
    "megafon",
    "infowatch",
    "bb-mobile",
    "innoros",
    "cgi",
    "cloud_mts",
    "tomhunter",
    "build_automation",
    "webo",
    "desktops",
    "devmail",
    "t1_cloud",
    "zwave",
    "finolab",
    "javascript",
    "wearable_electronics",
    "sales",
    "haxe"
]


Пробую намайнить позитивы из семпла выше (еще некоторые примеры группирую из списков выше):

```[
    [
        "branding",
        "business_models",
        1
    ],
    [
        "business-laws",
        "business_models",
        1
    ],
    [
        "research",
        "patents",
        1
    ],
    [
        "skyeng",
        "learning_languages",
        1
    ],
    [
        "sberbank",
        "vtb",
        1
    ],
    [
        "study",
        "netologyru",
        1
    ],
    [
        "DIY",
        "raspberrypi",
        1
    ],
    [
       "html5",
       "javascript",
       1 
    ],
    [
        "cpp",
        "c",
        1
    ],
    [
        "machine_learning",
        "natural_language_processing",
        1
    ],
    [
        "mongodb",
        "postgresql",
        1
    ]
]
```

In [19]:
markup = [
    ["branding", "business_models", 1],
    ["business-laws", "business_models", 1],
    ["research", "patents", 1],
    ["skyeng", "learning_languages", 1],
    ["sberbank", "vtb", 1],
    ["study", "netologyru", 1],
    ["DIY", "raspberrypi", 1],
    ["html5", "javascript", 1],
    ["cpp", "c", 1],
    ["machine_learning", "natural_language_processing", 1],
    ["mongodb", "postgresql", 1],
] + [
    ["microformats", "edge", 0],
    ["masterkit", "medgadgets", 1],
    ["asus", "social_networks", 0],
    ["skyeng", "cpu", 0],
    ["alconost", "metrotek", 0.5],
    ["vertdider", "wunderfund", 0.5],
    ["biology", "beeline", 0],
    ["cloudsnn", "virtualization", 0.5],
    ["plarium", "pixonic", 1],
    ["compilers", "gadgets", 0],
    ["study", "unisender", 0],
    ["refactoring", "postgresql", 0],
    ["itcompanies", "pay_system", 0.5],
    ["owasp", "cybersport", 0],
    ["antikvariat", "cloudsnn", 0],
    ["image_processing", "html5", 0],
    ["ie", "complete_code", 0],
    ["learning_languages", "naumen", 0],
    ["mongodb", "sport_programming", 0],
    ["e-legion", "yandex_api", 0],
    ["network_standarts", "mssql", 0],
    ["3d_graphics", "xml", 0],
    ["sberdevices", "1C", 0.5],
    ["bitrix", "ozontech", 0.5],
    ["regex", "getwear", 0],
    ["vtb", "macloud", 0],
    ["nano", "AJAX", 0],
    ["skillbox", "biotech", 0],
    ["energy", "youvend", 0],
    ["sound", "hi", 0],
    ["powershell", "vivaldi", 0],
    ["growthhacking", "typography", 0],
    ["biotech", "tablum"],
    ["funcprog", "game_testing", 0],
    ["ivideon", "doctrine", 0],
    ["community_management", "analysis_design", 0.5],
    ["kebrum", "domclick", 0],
    ["1cloud", "notebooks", 0],
    ["visual_programming", "arttel", 0],
    ["sberbank", "ashmanov_net", 0],
]

In [20]:
positive = 0
negative = 0
halfs = 0

for pair in markup:
    if len(pair) != 3:
        continue
    l, r, score = pair
    positive += int(score == 1)
    negative += int(score == 0)
    halfs += int(score == 0.5)

print(f"Positives: {positive}")
print(f"Negatives: {negative}")
print(f"Halfs: {halfs}")

Positives: 13
Negatives: 30
Halfs: 7


Посмотрим на самые частотные слова текстов статей для каждого хаба:

In [21]:
dataset.columns

Index(['id', 'language', 'url', 'text_markdown', 'text_html', 'lead_markdown',
       'lead_html', 'type', 'labels', 'original_author', 'original_url',
       'time_published', 'author', 'title', 'statistics', 'hubs', 'flows',
       'tags', 'reading_time', 'format', 'complexity', 'comments'],
      dtype='object')

Это не много, но это честная работа! (асессором)

In [22]:
def measure_clusterization(
    markup: list[list], clusters: dict[int : list[str]]
) -> dict[str:float]:
    markup = list(filter(lambda x: len(x) == 3 and x[2] != 0.5, markup))
    inverse_clusters = dict(
        [(value, key) for key, values in clusters.items() for value in values]
    )
    TP, FP, TN, FN = 0, 0, 0, 0
    FPs = []
    FNs = []
    for pair in markup:
        l, r, mark = pair
        l_id, r_id = inverse_clusters[l], inverse_clusters[r]

        if mark == 1 and l_id == r_id:
            TP += 1
        elif mark == 1 and l_id != r_id:
            FN += 1
            FNs.append((l, r))
        elif mark == 0 and l_id == r_id:
            FP += 1
            FPs.append((l, r))
        elif mark == 0 and l_id != r_id:
            TN += 1

    return {
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "TN": TN,
        "accuracy": (TP + TN) / len(markup),
        "false_positives": FPs,
        "false_negatives": FNs,
    }

Далее я буду пробовать различные способы кластеризации лейблов.
Кластеризация состоит из двух частей:
Embedder + Clusterization algorithm

В качестве алгоритма кластеризации возьмем KMeans. 
А для эмбеддера рассмотрим несколько разных подходов:
1. Вектор семплов
2. Джаккардова мера похожести лейблов
3. Эмбеддинги MiniLM 
4. Instructor (модель, у которой эмбеддинги конфигугрируются с помощью промпта)

Так же для каждой модели подберем оптимальное множество кластеров (от 20 до 100).


In [23]:
from sklearn.cluster import KMeans

from habr_article_analyzer.clusters import TextClusterization

In [24]:
def get_best_clusters_with_kmeans(
    embeds, texts, n_clusters=[10, 25, 50, 75, 100], random_state=42, **kwargs
):
    best_res = {"accuracy": 0}
    best_n = 0
    for n in n_clusters:
        clusters = KMeans(n_clusters=n, random_state=random_state).fit_predict(
            embeds, **kwargs
        )
        clusters_mapping = TextClusterization._get_clusters(clusters, texts)
        result = measure_clusterization(markup, clusters_mapping)
        print(f"n_clusters: {n}")
        print(f"measurment: {result}")
        if result["accuracy"] > best_res["accuracy"]:
            best_res = result
            best_n = n

    return best_res, best_n

In [25]:
from habr_article_analyzer.clusters import DatasetEmbedder

dataset_kmeans = TextClusterization(
    DatasetEmbedder(hubs), KMeans(n_clusters=50, random_state=42)
)
dataset_kmeans_clusters = dataset_kmeans.get_clusters(hubs.labels)
measure_clusterization(markup, dataset_kmeans_clusters)

{'TP': 7,
 'FP': 27,
 'FN': 6,
 'TN': 3,
 'accuracy': 0.23255813953488372,
 'false_positives': [('microformats', 'edge'),
  ('skyeng', 'cpu'),
  ('biology', 'beeline'),
  ('study', 'unisender'),
  ('refactoring', 'postgresql'),
  ('owasp', 'cybersport'),
  ('antikvariat', 'cloudsnn'),
  ('ie', 'complete_code'),
  ('learning_languages', 'naumen'),
  ('mongodb', 'sport_programming'),
  ('e-legion', 'yandex_api'),
  ('network_standarts', 'mssql'),
  ('3d_graphics', 'xml'),
  ('regex', 'getwear'),
  ('vtb', 'macloud'),
  ('nano', 'AJAX'),
  ('skillbox', 'biotech'),
  ('energy', 'youvend'),
  ('sound', 'hi'),
  ('powershell', 'vivaldi'),
  ('growthhacking', 'typography'),
  ('funcprog', 'game_testing'),
  ('ivideon', 'doctrine'),
  ('kebrum', 'domclick'),
  ('1cloud', 'notebooks'),
  ('visual_programming', 'arttel'),
  ('sberbank', 'ashmanov_net')],
 'false_negatives': [('business-laws', 'business_models'),
  ('research', 'patents'),
  ('DIY', 'raspberrypi'),
  ('html5', 'javascript'),
  ('

In [26]:
best_res, best_n = get_best_clusters_with_kmeans(
    DatasetEmbedder(hubs).encode(hubs.labels), hubs.labels
)
print(f"best n: {best_n}")
print(f"best_res: {best_res}")

n_clusters: 10
measurment: {'TP': 12, 'FP': 30, 'FN': 1, 'TN': 0, 'accuracy': 0.27906976744186046, 'false_positives': [('microformats', 'edge'), ('asus', 'social_networks'), ('skyeng', 'cpu'), ('biology', 'beeline'), ('compilers', 'gadgets'), ('study', 'unisender'), ('refactoring', 'postgresql'), ('owasp', 'cybersport'), ('antikvariat', 'cloudsnn'), ('image_processing', 'html5'), ('ie', 'complete_code'), ('learning_languages', 'naumen'), ('mongodb', 'sport_programming'), ('e-legion', 'yandex_api'), ('network_standarts', 'mssql'), ('3d_graphics', 'xml'), ('regex', 'getwear'), ('vtb', 'macloud'), ('nano', 'AJAX'), ('skillbox', 'biotech'), ('energy', 'youvend'), ('sound', 'hi'), ('powershell', 'vivaldi'), ('growthhacking', 'typography'), ('funcprog', 'game_testing'), ('ivideon', 'doctrine'), ('kebrum', 'domclick'), ('1cloud', 'notebooks'), ('visual_programming', 'arttel'), ('sberbank', 'ashmanov_net')], 'false_negatives': [('research', 'patents')]}
n_clusters: 25
measurment: {'TP': 11, 'F

In [27]:
from habr_article_analyzer.clusters import JaccardLabelsEmbedder

jaccard_kmeans = TextClusterization(
    JaccardLabelsEmbedder(hubs), KMeans(n_clusters=50, random_state=42)
)
jaccard_kmeans_clusters = jaccard_kmeans.get_clusters(hubs.labels)
measure_clusterization(markup, jaccard_kmeans_clusters)



{'TP': 4,
 'FP': 3,
 'FN': 9,
 'TN': 27,
 'accuracy': 0.7209302325581395,
 'false_positives': [('refactoring', 'postgresql'),
  ('owasp', 'cybersport'),
  ('mongodb', 'sport_programming')],
 'false_negatives': [('branding', 'business_models'),
  ('business-laws', 'business_models'),
  ('research', 'patents'),
  ('skyeng', 'learning_languages'),
  ('study', 'netologyru'),
  ('DIY', 'raspberrypi'),
  ('html5', 'javascript'),
  ('mongodb', 'postgresql'),
  ('masterkit', 'medgadgets')]}

In [28]:
best_res, best_n = get_best_clusters_with_kmeans(
    JaccardLabelsEmbedder(hubs).encode(hubs.labels), hubs.labels
)
print(f"best n: {best_n}")
print(f"best_res: {best_res}")



n_clusters: 10
measurment: {'TP': 6, 'FP': 11, 'FN': 7, 'TN': 19, 'accuracy': 0.5813953488372093, 'false_positives': [('microformats', 'edge'), ('biology', 'beeline'), ('owasp', 'cybersport'), ('ie', 'complete_code'), ('mongodb', 'sport_programming'), ('regex', 'getwear'), ('nano', 'AJAX'), ('powershell', 'vivaldi'), ('funcprog', 'game_testing'), ('kebrum', 'domclick'), ('1cloud', 'notebooks')], 'false_negatives': [('branding', 'business_models'), ('research', 'patents'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets')]}
n_clusters: 25
measurment: {'TP': 5, 'FP': 14, 'FN': 8, 'TN': 16, 'accuracy': 0.4883720930232558, 'false_positives': [('microformats', 'edge'), ('skyeng', 'cpu'), ('biology', 'beeline'), ('refactoring', 'postgresql'), ('owasp', 'cybersport'), ('mongodb', 'sport_programming'), ('nano', 'AJAX'), ('skillbox', 'biotech'), ('energy', 'youvend'), ('powershell', 'vival

In [29]:
from huggingface_hub.utils import disable_progress_bars
from sentence_transformers import SentenceTransformer

disable_progress_bars()

minillm_kmeans = TextClusterization(
    SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
    KMeans(n_clusters=50, random_state=42),
)
minillm_kmeans_clusters = minillm_kmeans.get_clusters(hubs.labels)
measure_clusterization(markup, minillm_kmeans_clusters)

{'TP': 7,
 'FP': 0,
 'FN': 6,
 'TN': 30,
 'accuracy': 0.8604651162790697,
 'false_positives': [],
 'false_negatives': [('research', 'patents'),
  ('skyeng', 'learning_languages'),
  ('sberbank', 'vtb'),
  ('study', 'netologyru'),
  ('DIY', 'raspberrypi'),
  ('mongodb', 'postgresql')]}

In [30]:
best_res, best_n = get_best_clusters_with_kmeans(
    SentenceTransformer(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    ).encode(hubs.labels),
    hubs.labels,
)
print(f"best n: {best_n}")
print(f"best_res: {best_res}")

n_clusters: 10
measurment: {'TP': 6, 'FP': 3, 'FN': 7, 'TN': 27, 'accuracy': 0.7674418604651163, 'false_positives': [('skillbox', 'biotech'), ('sound', 'hi'), ('ivideon', 'doctrine')], 'false_negatives': [('branding', 'business_models'), ('business-laws', 'business_models'), ('skyeng', 'learning_languages'), ('study', 'netologyru'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets')]}
n_clusters: 25
measurment: {'TP': 6, 'FP': 2, 'FN': 7, 'TN': 28, 'accuracy': 0.7906976744186046, 'false_positives': [('e-legion', 'yandex_api'), ('energy', 'youvend')], 'false_negatives': [('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets')]}
n_clusters: 50
measurment: {'TP': 7, 'FP': 0, 'FN': 6, 'TN': 30, 'accuracy': 0.8604651162790697, 'false_positives': [], 'false_negatives': [('research', 'patents'), ('skyeng', 'lea

In [31]:
from InstructorEmbedding import INSTRUCTOR

instructor_kmeans = TextClusterization(
    INSTRUCTOR("hkunlp/instructor-large"), KMeans(n_clusters=50, random_state=42)
)
text_instruction_pairs = [
    [
        "Represent the Programming Hub category for company or technology clustering:",
        label,
    ]
    for label in hubs.labels
]
instructor_kmeans_clusters = instructor_kmeans.get_clusters(text_instruction_pairs)
instructor_kmeans_clusters = dict(
    (key, [value[1] for value in values])
    for key, values in instructor_kmeans_clusters.items()
)
measure_clusterization(markup, instructor_kmeans_clusters)

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{'TP': 2,
 'FP': 1,
 'FN': 11,
 'TN': 29,
 'accuracy': 0.7209302325581395,
 'false_positives': [('growthhacking', 'typography')],
 'false_negatives': [('branding', 'business_models'),
  ('business-laws', 'business_models'),
  ('research', 'patents'),
  ('skyeng', 'learning_languages'),
  ('sberbank', 'vtb'),
  ('study', 'netologyru'),
  ('DIY', 'raspberrypi'),
  ('html5', 'javascript'),
  ('mongodb', 'postgresql'),
  ('masterkit', 'medgadgets'),
  ('plarium', 'pixonic')]}

In [32]:
best_res, best_n = get_best_clusters_with_kmeans(
    INSTRUCTOR("hkunlp/instructor-large").encode(text_instruction_pairs), hubs.labels
)
print(f"best n: {best_n}")
print(f"best_res: {best_res}")

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


KeyboardInterrupt: 

Попробуем еще поподбирать промпт для модели Instructor.

In [None]:
def choose_best_prompt(
    prompts: list[str], texts: list[str], model=INSTRUCTOR("hkunlp/instructor-large")
):
    best_res = {"accuracy": 0}
    best_n = 0
    best_prompt = ""
    for prompt in prompts:
        text_instruction_pairs = [[prompt, label] for label in texts]
        b_res, b_n = get_best_clusters_with_kmeans(
            model.encode(text_instruction_pairs), texts
        )

        if b_res["accuracy"] > best_res["accuracy"]:
            best_res = b_res
            best_n = b_n
            best_prompt = prompt

    return best_res, best_n, best_prompt

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.


In [None]:
best_res, best_n, best_prompt = choose_best_prompt(
    prompts=[
        "Represent the Programming Hub category for clustering:",
        "Represent the Programming Hub category for company or technology clustering:",
        "Represent the Programming Hub category for company or technology or activity clustering:",
        "Represent the Programming Hub category for company or technology or profession clustering:",
    ],
    texts=hubs.labels,
)

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


n_clusters: 10
measurment: {'TP': 5, 'FP': 7, 'FN': 8, 'TN': 23, 'accuracy': 0.6511627906976745, 'false_positives': [('microformats', 'edge'), ('biology', 'beeline'), ('study', 'unisender'), ('regex', 'getwear'), ('energy', 'youvend'), ('sound', 'hi'), ('1cloud', 'notebooks')], 'false_negatives': [('branding', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets'), ('plarium', 'pixonic')]}
n_clusters: 25
measurment: {'TP': 3, 'FP': 2, 'FN': 10, 'TN': 28, 'accuracy': 0.7209302325581395, 'false_positives': [('microformats', 'edge'), ('sound', 'hi')], 'false_negatives': [('branding', 'business_models'), ('business-laws', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('machine_learning', 'natural_language_processing'), ('masterkit', 'medgadgets'), ('plarium', 'pi

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


n_clusters: 100
measurment: {'TP': 1, 'FP': 0, 'FN': 12, 'TN': 30, 'accuracy': 0.7209302325581395, 'false_positives': [], 'false_negatives': [('branding', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('html5', 'javascript'), ('cpp', 'c'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets'), ('plarium', 'pixonic')]}
n_clusters: 10
measurment: {'TP': 5, 'FP': 6, 'FN': 8, 'TN': 24, 'accuracy': 0.6744186046511628, 'false_positives': [('skyeng', 'cpu'), ('compilers', 'gadgets'), ('owasp', 'cybersport'), ('network_standarts', 'mssql'), ('energy', 'youvend'), ('sound', 'hi')], 'false_negatives': [('branding', 'business_models'), ('business-laws', 'business_models'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('machine_learning', 'natural_language_processing'), ('masterkit', 'medgadgets'), ('plarium'

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


n_clusters: 100
measurment: {'TP': 1, 'FP': 0, 'FN': 12, 'TN': 30, 'accuracy': 0.7209302325581395, 'false_positives': [], 'false_negatives': [('branding', 'business_models'), ('business-laws', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('html5', 'javascript'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets'), ('plarium', 'pixonic')]}
n_clusters: 10
measurment: {'TP': 2, 'FP': 3, 'FN': 11, 'TN': 27, 'accuracy': 0.6744186046511628, 'false_positives': [('skyeng', 'cpu'), ('energy', 'youvend'), ('growthhacking', 'typography')], 'false_negatives': [('branding', 'business_models'), ('business-laws', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('DIY', 'raspberrypi'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('mas

`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


n_clusters: 100
measurment: {'TP': 4, 'FP': 1, 'FN': 9, 'TN': 29, 'accuracy': 0.7674418604651163, 'false_positives': [('growthhacking', 'typography')], 'false_negatives': [('branding', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets'), ('plarium', 'pixonic')]}
n_clusters: 10
measurment: {'TP': 4, 'FP': 7, 'FN': 9, 'TN': 23, 'accuracy': 0.627906976744186, 'false_positives': [('microformats', 'edge'), ('skyeng', 'cpu'), ('biology', 'beeline'), ('skillbox', 'biotech'), ('energy', 'youvend'), ('sound', 'hi'), ('growthhacking', 'typography')], 'false_negatives': [('branding', 'business_models'), ('business-laws', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets'), ('plarium', 'pixonic

In [None]:
print(f"best_res: {best_res}")
print(f"best_n: {best_n}")
print(f"best_prompt: {best_prompt}")

best_res: {'TP': 4, 'FP': 1, 'FN': 9, 'TN': 29, 'accuracy': 0.7674418604651163, 'false_positives': [('regex', 'getwear')], 'false_negatives': [('business-laws', 'business_models'), ('research', 'patents'), ('skyeng', 'learning_languages'), ('sberbank', 'vtb'), ('study', 'netologyru'), ('machine_learning', 'natural_language_processing'), ('mongodb', 'postgresql'), ('masterkit', 'medgadgets'), ('plarium', 'pixonic')]}
best_n: 75
best_prompt: Represent the Programming Hub category for company or technology or activity clustering:


In [None]:
instructor_kmeans = TextClusterization(
    INSTRUCTOR("hkunlp/instructor-large"), KMeans(n_clusters=75, random_state=42)
)
text_instruction_pairs = [
    [
        "Represent the Programming Hub category for company or technology or activity clustering:",
        label,
    ]
    for label in hubs.labels
]
instructor_kmeans_clusters = instructor_kmeans.get_clusters(text_instruction_pairs)
instructor_kmeans_clusters = dict(
    (key, [value[1] for value in values])
    for key, values in instructor_kmeans_clusters.items()
)
measure_clusterization(markup, instructor_kmeans_clusters)

print(instructor_kmeans_clusters)

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


{36: ['1C', '1c', '1cair', '2can', '2gis', '7days', '8812', 'Atlassian', 'Centrobit', 'Competentum', 'DIY', 'Hadoop', 'Licel', 'accessibility', 'acelab', 'acer', 'action360', 'acumatica', 'adapty', 'agile', 'aktiv-company', 'alawar', 'alee', 'alfa', 'algorithms', 'aligntechnology', 'allcorrect', 'altergeo', 'altnet', 'ambar', 'amd', 'animation', 'anychart', 'apacer', 'apache', 'api', 'aquaphor', 'arenadata', 'asp', 'asterisk', 'asus', 'atlas', 'auriga', 'avi', 'azure', 'badoo', 'bar', 'barsgroup', 'bastion', 'beeline', 'beget', 'big', 'bigdata', 'bigdataplatform', 'biggo', 'bonjoin', 'brave', 'c', 'cackle', 'canon', 'caravan', 'cardberry', 'career', 'cartaxi', 'celecom', 'changeagain', 'changellenge', 'chateam', 'cian', 'cit', 'cleantalk', 'cleverence', 'click', 'cloverr', 'clovertel', 'clrium', 'codefest', 'colobridge', 'comagic', 'compilers', 'constanta', 'contell', 'contentai', 'context', 'convead', 'converse', 'conversion', 'copiny', 'copyright', 'corel', 'cpp', 'cpu', 'crazydev', 

Полученный результат мне не нравится. Думаю проблема - в нехватке контекста, для хороших эмбеддингов нужно больше 1-2 слов.

Как нам получить текстовое описание каждого лейбла? Очень просто - давайте выберем из статей с этим лейблом самые частотные (по TF-IDF для него).

In [None]:
hubs_top_worlds = hubs.get_top_words_per_label("text_markdown")

Посмотрим примеры, для каких-нибудь не особо понятных слов и для понятных:

In [None]:
hubs_top_worlds["mono"]

[('intptr', 4065.578369140625),
 ('param', 623.1882934570312),
 ('ref', 497.8687438964844),
 ('uint', 447.219970703125),
 ('zero', 367.0495300292969),
 ('oid', 365.9857482910156),
 ('returns', 345.847412109375),
 ('summary', 341.7725524902344),
 ('сертификата', 253.23025512695312),
 ('подписи', 184.31829833984375)]

In [None]:
hubs_top_worlds["cpp"]

[('endl', 69.2018051147461),
 ('nullptr', 67.54036712646484),
 ('cout', 67.17047882080078),
 ('typename', 67.06929016113281),
 ('std', 58.166343688964844),
 ('size_t', 54.285831451416016),
 ('gdb', 42.914669036865234),
 ('clang', 39.875152587890625),
 ('typedef', 38.802547454833984),
 ('boost', 37.63223648071289)]

In [None]:
hubs_top_worlds["machine_learning"]

[('accuracy', 40.30488967895508),
 ('sklearn', 40.233360290527344),
 ('predict', 37.253684997558594),
 ('loss', 34.91127395629883),
 ('keras', 34.31116485595703),
 ('ml', 32.905975341796875),
 ('train', 31.36823272705078),
 ('plt', 30.495180130004883),
 ('датасет', 29.038772583007812),
 ('регрессии', 28.73417854309082)]

Выглядит неплохо, может быть это поможет Instructor или mini-LLM. 

In [None]:
def get_text(label, prompt=None, top_words=hubs_top_worlds):
    if prompt is None:
        return label + " " + " ".join([x for x, _ in top_words[label]])
    return [prompt, label + " " + " ".join([x for x, _ in top_words[label]])]


print(get_text("machine_learning"))

machine_learning accuracy sklearn predict loss keras ml train plt датасет регрессии


In [None]:
from collections import defaultdict

instructor_kmeans = TextClusterization(
    INSTRUCTOR("hkunlp/instructor-large"), KMeans(n_clusters=75, random_state=42)
)
text_instruction_pairs = list(
    map(
        lambda x: get_text(
            x,
            "Represent the Programming Hub category for company or technology or activity clustering: ",
        ),
        hubs.labels,
    )
)
instructor_kmeans_clusters = instructor_kmeans.get_clusters(text_instruction_pairs)
instructor_kmeans_clusters = dict(
    (key, [value[1] for value in values])
    for key, values in instructor_kmeans_clusters.items()
)

result = defaultdict(list)
for key, values in instructor_kmeans_clusters.items():
    for value in values:
        label = value.split()[0]
        result[key].append(label)

measure_clusterization(markup, result)

print(instructor_kmeans_clusters)

No sentence-transformers model found with name hkunlp/instructor-large. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


KeyError: 'branding'

In [None]:
from huggingface_hub.utils import disable_progress_bars
from sentence_transformers import SentenceTransformer

disable_progress_bars()

minillm_kmeans = TextClusterization(
    SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
    KMeans(n_clusters=50, random_state=42),
)

minillm_kmeans_clusters = minillm_kmeans.get_clusters(list(map(get_text, hubs.labels)))

result_mini_llm = defaultdict(list)
for key, values in minillm_kmeans_clusters.items():
    for value in values:
        label = value.split()[0]
        result_mini_llm[key].append(label)

measure_clusterization(markup, result_mini_llm)

{'TP': 1,
 'FP': 0,
 'FN': 12,
 'TN': 30,
 'accuracy': 0.7209302325581395,
 'false_positives': [],
 'false_negatives': [('branding', 'business_models'),
  ('business-laws', 'business_models'),
  ('research', 'patents'),
  ('skyeng', 'learning_languages'),
  ('sberbank', 'vtb'),
  ('study', 'netologyru'),
  ('DIY', 'raspberrypi'),
  ('html5', 'javascript'),
  ('machine_learning', 'natural_language_processing'),
  ('mongodb', 'postgresql'),
  ('masterkit', 'medgadgets'),
  ('plarium', 'pixonic')]}

In [None]:
with open("data/targets/hubs.json", "+x") as file:
    json.dump(result_mini_llm, file, indent=4)

FileExistsError: [Errno 17] File exists: 'data/targets/hubs.json'