In [184]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import ast
import string
import re
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance

In [185]:
df_keys = pd.read_csv('../data/processed/keyword_list.csv')

In [186]:
keywords = list(df_keys['0'])

In [187]:
len(keywords)

2267

In [188]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(keywords)

In [189]:
kmeans = KMeans(n_clusters=3, random_state=0)
labels = kmeans.fit_predict(embeddings)

In [190]:
centroids = kmeans.cluster_centers_

top_keywords_per_cluster = {}
for cluster_id in range(kmeans.n_clusters):
    cluster_indices = [i for i, label in enumerate(labels) if label == cluster_id]
    distances = [np.linalg.norm(embeddings[i] - centroids[cluster_id]) for i in cluster_indices]
    sorted_indices = np.argsort(distances)
    top_keywords = [keywords[cluster_indices[i]] for i in sorted_indices[:1000]]  # Top 10 keywords
    top_keywords_per_cluster[cluster_id] = top_keywords


In [191]:
for cluster_id, top_keywords in top_keywords_per_cluster.items():
    print(f"Top keywords for Cluster {cluster_id}:")
    print(", ".join(top_keywords))
    print()

Top keywords for Cluster 0:
drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, drug, weed, weed, weed, smoke, smoke, smoke, smoke, smoke, smoke, smoke, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, drugs, smoked, police, police, police, corpse, crime, crime, crime, crime, crime, death, murder, murder, fire, prison, prison, drunken, smoking, cigarette, cigarette, cigarette, cigarette, cigarette, cigarette, cigarette, cigarette, drunk, drunk, drunk, drunk, drunk, drunk, drunk, drunk, drunk, stole, stole, violence, violence, violence, violence, wasted, water, criminals, criminals, criminals, war, dark, dark, dark, dark, dark, dark, dark, dark, dark, dark, dark, crimes, poison, poison, poison, addict, addict, alcohol, toilet, beer, beer, crack, crack, rain, rain, rain, rain, rats, rats, rats, rats, rats, rats, rats, rats, rats, rats, rats, rats, rats, rats, rats, drunks, drunks, drunks, drunks, 

In [193]:
gbm = GradientBoostingClassifier(random_state=0)
gbm.fit(embeddings, labels)

In [195]:
importance = permutation_importance(gbm, embeddings, labels, n_repeats=20, random_state=0)
important_features = np.argsort(importance['importances_mean'])[::-1]

top_keywords_per_cluster = {i: [] for i in range(n_clusters)}

for cluster_id in range(n_clusters):
    cluster_indices = [i for i, label in enumerate(labels) if label == cluster_id]
    cluster_keywords = [keywords[i] for i in cluster_indices]
    # Select top features that are present in the current cluster
    cluster_top_features = [keywords[i] for i in important_features if i in cluster_indices]
    top_keywords_per_cluster[cluster_id] = cluster_top_features[:10]  

for cluster_id, top_keywords in top_keywords_per_cluster.items():
    print(f"Cluster {cluster_id}: {', '.join(top_keywords)}")

keyword_counts = Counter(labels)
print("\nKeywords per cluster:", keyword_counts)

Cluster 0: windy, cigarette, smoke, drug, dealers, unmaintained, foreigners, bitterly, prices, hustle
Cluster 1: cry, pleasant, dead, disappointment, aggressive, yelled, forbidden, bother, rude, absurd
Cluster 2: overcrowded, overcrowded, garbage, dirty, unclean, smells, noise, smells, rubbish, rubbish

Keywords per cluster: Counter({0: 1015, 1: 796, 2: 456})
