In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from wordcloud import WordCloud
from sklearn.metrics import silhouette_score

In [3]:
# Read the collocations CSV (decoded as windows-1251) into `df` and display it.
df = pd.read_csv(
    filepath_or_buffer="..\\..\\data\\collocations\\processed_collocations_set_from_the_supervisor.csv",
    encoding='windows-1251',
)
df

EmptyDataError: No columns to parse from file

In [None]:
# Part-of-speech codes whose rows are dropped from the table.
meaningless_pos = {
    '0', '&', 'F', 'e', 's', 'm', 'M', 'Y', 'n',
}
# Keep every row whose PartOfSpeech is NOT one of the codes above.
df = df.loc[~df['PartOfSpeech'].isin(meaningless_pos)]
df

In [None]:
# Normalized word forms that are removed from the table (stop-word list).
meaningless_words = {
    # forms of "to be" / "can" / "to be (copula)"
    'быть', 'был', 'есть', 'будет', 'может', 'мочь', 'являться',
    # participles of "to have"
    'имеющий', 'имеющая', 'имеющее', 'имеющие',
    # participles of "to go"
    'идущий', 'идущая', 'идущие', 'идущего',
    # pronouns and demonstratives
    'он', 'я', 'сам', 'сама', 'само', 'сами', 'тот', 'та', 'то', 'те',
    'такой', 'такая', 'такое', 'такие', 'данный', 'данная', 'данное', 'данные',
    # question words and conjunctions
    'чего', 'почему', 'зачем', 'где', 'когда', 'пока',
    # temporal adverbs
    'уже', 'еще', 'сейчас', 'теперь', 'тогда',
    # forms of "must"
    'должен', 'должна', 'должно', 'должны',
    # very generic nouns
    'человек', 'люди', 'раз', 'мир', 'время', 'год',
    # very generic adjectives
    'прошлый', 'следующий', 'новый', 'старый', 'большой', 'маленький',
    # forms of the relative pronoun "which"
    'который', 'которая', 'которое', 'которые',
    # "therefore"
    'поэтому'
}
# Keep every row whose NormalizedWord is NOT in the list above.
df = df.loc[~df['NormalizedWord'].isin(meaningless_words)]
df

In [None]:
# Cast both frequency columns to int; `astype` returns a new frame, so the
# result is detached from the previously sliced `df`.
df = df.astype({'FrequencyInText': int, 'FrequencyInCorpus': int})

# Keep only rows whose count is greater than 1 both in the corpus and in the text.
filtered_df = df.loc[df['FrequencyInCorpus'].gt(1) & df['FrequencyInText'].gt(1)]

# Document-by-word count matrix: one row per DocumentName, one column per
# NormalizedWord; repeated (document, word) rows are summed, absent pairs are 0.
doc_word_matrix = pd.pivot_table(
    filtered_df,
    values='FrequencyInText',
    index='DocumentName',
    columns='NormalizedWord',
    aggfunc='sum',
    fill_value=0,
)

filtered_df

In [None]:
# Preview the first 25 rows of the document-by-word count matrix.
doc_word_matrix.head(25)

In [None]:
# TF-IDF weighting of the raw document-by-word counts.
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(doc_word_matrix)

# Pairwise cosine similarity between documents, labelled on both axes with
# the document names taken from the count matrix index.
similarity_matrix = cosine_similarity(tfidf_matrix)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=doc_word_matrix.index,
    columns=doc_word_matrix.index,
)

# Persist the labelled similarity matrix (row labels included) and display it.
similarity_df.to_csv(
    "data\\collocations\\cosine_similarity_matrix.csv",
    index=True,
)
similarity_df

In [None]:
def save_clusters(labels, method_name):
    """Write the document-to-cluster assignment of one clustering run to CSV.

    Args:
        labels: One cluster label per row of the global ``doc_word_matrix``,
            in the same order as its index.
        method_name: Prefix of the output file name
            (``data\\collocations\\<method_name>_clusters.csv``).
    """
    assignments = pd.DataFrame(
        {
            "Document": doc_word_matrix.index,
            "Cluster": labels,
        }
    )
    target_csv = f"data\\collocations\\{method_name}_clusters.csv"
    assignments.to_csv(target_csv, index=False)

def save_cluster_centers(tfidf_matrix, labels, method_name):
    """Pick one representative document per cluster and write the list to CSV.

    For every cluster label except ``-1``, the representative is the member
    whose TF-IDF vector is closest (``cdist`` default metric) to the mean
    vector of the cluster's members.

    Args:
        tfidf_matrix: Sparse matrix supporting row indexing and ``.toarray()``;
            rows are aligned with the global ``doc_word_matrix`` index.
        labels: Array of cluster labels, one per row of ``tfidf_matrix``.
        method_name: Prefix of the output file name
            (``data\\collocations\\<method_name>_cluster_centers.csv``).
    """
    rows = []
    for cluster_id in np.unique(labels):
        # Label -1 is skipped (DBSCAN uses it for points assigned to no cluster).
        if cluster_id != -1:
            member_rows = np.where(labels == cluster_id)[0]
            member_vectors = tfidf_matrix[member_rows].toarray()
            centroid = member_vectors.mean(axis=0, keepdims=True)
            # Distance from the centroid to each member; the nearest member wins.
            nearest = cdist(centroid, member_vectors)[0].argmin()
            rows.append((cluster_id, doc_word_matrix.index[member_rows[nearest]]))

    target_csv = f"data\\collocations\\{method_name}_cluster_centers.csv"
    pd.DataFrame(rows, columns=["Cluster", "Central_File"]).to_csv(target_csv, index=False)

In [None]:
# K-Means with a fixed number of clusters and a fixed seed.
k = 30
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(tfidf_matrix)
kmeans_labels = kmeans.labels_

# Persist both the assignments and one representative document per cluster.
save_clusters(labels=kmeans_labels, method_name="kmeans")
save_cluster_centers(tfidf_matrix=tfidf_matrix, labels=kmeans_labels, method_name="kmeans")

In [None]:
# DBSCAN over cosine distance; points it cannot assign receive the label -1.
eps = 0.6
min_samples = 5
dbscan = DBSCAN(metric="cosine", eps=eps, min_samples=min_samples)
dbscan.fit(tfidf_matrix)
dbscan_labels = dbscan.labels_

# Persist both the assignments and one representative document per cluster.
save_clusters(labels=dbscan_labels, method_name="dbscan")
save_cluster_centers(tfidf_matrix=tfidf_matrix, labels=dbscan_labels, method_name="dbscan")

In [None]:
# Average-linkage agglomerative clustering over cosine distance.
n_clusters_agglo = 35
agglo = AgglomerativeClustering(
    n_clusters=n_clusters_agglo,
    metric="cosine",
    linkage="average",
)
# The estimator is fed the dense form of the TF-IDF matrix.
agglo.fit(tfidf_matrix.toarray())
agglo_labels = agglo.labels_

# Persist both the assignments and one representative document per cluster.
save_clusters(labels=agglo_labels, method_name="agglomerative")
save_cluster_centers(tfidf_matrix=tfidf_matrix, labels=agglo_labels, method_name="agglomerative")

In [None]:
def load_clusters(file_name):
    """Read a CSV file and return its contents as a DataFrame.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame`` (default ``read_csv`` settings).
    """
    loaded = pd.read_csv(filepath_or_buffer=file_name)
    return loaded

In [None]:
# Reload the per-document cluster assignments of the three methods from CSV.
kmeans_clusters, dbscan_clusters, agglo_clusters = (
    load_clusters(csv_path)
    for csv_path in (
        "data\\collocations\\kmeans_clusters.csv",
        "data\\collocations\\dbscan_clusters.csv",
        "data\\collocations\\agglomerative_clusters.csv",
    )
)

# Reload the representative ("central") document of every cluster, per method.
kmeans_cluster_centers, dbscan_cluster_centers, agglo_cluster_centers = (
    load_clusters(csv_path)
    for csv_path in (
        "data\\collocations\\kmeans_cluster_centers.csv",
        "data\\collocations\\dbscan_cluster_centers.csv",
        "data\\collocations\\agglomerative_cluster_centers.csv",
    )
)

In [None]:
def visualize_clusters(labels, tfidf_matrix, method_name):
    """Project documents to 2-D with t-SNE and plot them coloured by cluster.

    After showing the figure, prints a ``{label: count}`` summary of the
    cluster sizes.

    Args:
        labels: One cluster label per row of ``tfidf_matrix``. May contain
            ``-1`` (as produced by DBSCAN for unassigned points).
        tfidf_matrix: Sparse matrix exposing ``.toarray()``.
        method_name: Name of the clustering method, used in the plot title,
            the colorbar label and the printed summary.
    """
    labels = np.asarray(labels)
    dense_vectors = tfidf_matrix.toarray()

    # t-SNE requires perplexity < n_samples; cap it so small inputs do not raise.
    perplexity = min(30, max(1, dense_vectors.shape[0] - 1))
    # The iteration count is left at the library default of 1000 (the value that
    # was previously passed explicitly). The keyword was renamed from `n_iter`
    # to `max_iter` in scikit-learn 1.5, so omitting it works on both sides.
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, metric='cosine')
    tsne_res = tsne.fit_transform(dense_vectors)

    unique_labels, counts = np.unique(labels, return_counts=True)
    # `plt.get_cmap` replaces `plt.cm.get_cmap`, which was removed in matplotlib 3.9.
    cmap = ListedColormap(plt.get_cmap('tab20', len(unique_labels)).colors)

    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(tsne_res[:, 0], tsne_res[:, 1], c=labels, cmap=cmap, s=10, alpha=0.9)
    # Ticks sit on the actual label values, so a -1 label is shown correctly
    # instead of being shifted onto 0..n-1.
    plt.colorbar(scatter, label=f'{method_name} Cluster', ticks=unique_labels)
    plt.title(f't-SNE визуализация кластеров {method_name}')
    plt.xlabel('t-SNE измерение 1')
    plt.ylabel('t-SNE измерение 2')
    plt.grid(True)
    plt.show()

    print(f"{method_name} кластеры:", dict(zip(unique_labels, counts)))

In [None]:
def generate_word_cloud(cluster_label, cluster_df, method_name):
    """List the documents of one cluster and draw a word cloud for each of them.

    Word frequencies come from the global ``filtered_df``. Rows that repeat
    the same ``NormalizedWord`` within a document are summed, matching the
    ``aggfunc='sum'`` used to build the document-by-word matrix.

    Args:
        cluster_label: Value of the ``Cluster`` column to select.
        cluster_df: DataFrame with ``Document`` and ``Cluster`` columns.
        method_name: Name of the clustering method, used in the printed header.
    """
    cluster_files = cluster_df[cluster_df['Cluster'] == cluster_label]['Document']
    print(f"Файлы в кластере {cluster_label} ({method_name}):")
    for f in cluster_files:
        print(f)

    for doc_name in cluster_files:
        doc_data = filtered_df[filtered_df['DocumentName'] == doc_name]
        # Sum per word: a plain dict(zip(...)) would keep only the last row of
        # a repeated word and silently drop the earlier counts.
        word_freq = doc_data.groupby('NormalizedWord')['FrequencyInText'].sum().to_dict()
        if not word_freq:
            # WordCloud raises on an empty frequency table; skip such documents
            # (e.g. a cluster file that no longer matches filtered_df).
            print(f"Нет слов для облака слов: {doc_name}")
            continue
        wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Облако слов для {doc_name}')
        plt.show()

In [None]:
# Rebuild the TF-IDF matrix from the document-by-word counts (same computation
# as in the similarity cell) so the plotting cells have `tfidf_matrix` available.
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit(doc_word_matrix).transform(doc_word_matrix)

In [None]:
# t-SNE scatter of the reloaded K-Means assignment, word clouds for the
# documents of cluster 11, then the assignment table itself.
visualize_clusters(
    labels=kmeans_clusters['Cluster'].to_numpy(),
    tfidf_matrix=tfidf_matrix,
    method_name="K-Means",
)
generate_word_cloud(cluster_label=11, cluster_df=kmeans_clusters, method_name="K-Means")
kmeans_clusters

In [None]:
# Display the K-Means table of central documents reloaded from CSV.
kmeans_cluster_centers

In [None]:
# t-SNE scatter of the reloaded DBSCAN assignment, word clouds for the
# documents of cluster 8, then the assignment table itself.
visualize_clusters(
    labels=dbscan_clusters['Cluster'].to_numpy(),
    tfidf_matrix=tfidf_matrix,
    method_name="DBSCAN",
)
generate_word_cloud(cluster_label=8, cluster_df=dbscan_clusters, method_name="DBSCAN")
dbscan_clusters

In [None]:
# Display the DBSCAN table of central documents reloaded from CSV.
dbscan_cluster_centers

In [None]:
# t-SNE scatter of the reloaded agglomerative assignment, word clouds for the
# documents of cluster 11, then the assignment table itself.
visualize_clusters(
    labels=agglo_clusters['Cluster'].to_numpy(),
    tfidf_matrix=tfidf_matrix,
    method_name="Agglomerative",
)
generate_word_cloud(cluster_label=11, cluster_df=agglo_clusters, method_name="Agglomerative")
agglo_clusters

In [None]:
# Display the agglomerative table of central documents reloaded from CSV.
agglo_cluster_centers

In [None]:
# Write `df` to CSV without the index, using the same windows-1251 encoding
# the input file was read with, and report the destination.
# NOTE(review): the file name "dataprocessed_dataset.csv" may be a typo — confirm.
output_path = "data\\collocations\\dataprocessed_dataset.csv"
df.to_csv(path_or_buf=output_path, encoding='windows-1251', index=False)
print(f"Данные успешно сохранены в {output_path}")