In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below.
# 'punkt' serves older NLTK releases; newer releases load the Punkt tokenizer
# from the 'punkt_tab' resource instead, and word_tokenize() raises
# LookupError without it, so both are fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the data
df = pd.read_csv("cleaned_museums.csv", encoding='utf-8')

def clean_text(text):
    """Normalize a raw text cell for tokenization.

    Missing values (``None``/NaN) become an empty string. Non-string values
    are converted with ``str()`` so that a numeric cell does not make
    ``re.sub`` raise ``TypeError``.

    Every character that is neither a word character nor whitespace is
    replaced by a space (not deleted), so that words joined only by
    punctuation ("a,b", "музей-усадьба") stay separate tokens. The result is
    lower-cased, with runs of whitespace collapsed to single spaces and
    leading/trailing whitespace removed.

    Args:
        text: The raw cell value.

    Returns:
        The cleaned, lower-cased string ("" for missing input).
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    # split()/join collapses the extra spaces introduced by the substitution.
    return " ".join(without_punctuation.lower().split())

# TF-IDF vectorization over unigrams and bigrams, with Russian stop words
# removed. The descriptions are cleaned first, then fitted and transformed
# in one pass.
stop_words = stopwords.words('russian')
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)
cleaned_descriptions = df['Description'].apply(clean_text)
tfidf_matrix = vectorizer.fit_transform(cleaned_descriptions)

# Train a Word2Vec model: each description is cleaned and tokenized into a
# list of words, and the resulting token lists form the training corpus.
cleaned_texts = (clean_text(raw_text) for raw_text in df['Description'])
sentences = list(map(word_tokenize, cleaned_texts))
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Document vectors: for every description, the TF-IDF-weighted average of the
# Word2Vec vectors of its tokens.
# The dimensionality is read from the trained model rather than repeated as a
# literal, so it cannot drift from the vector_size the model was built with.
keyed_vectors = word2vec_model.wv
embedding_dim = keyed_vectors.vector_size
vocabulary = vectorizer.vocabulary_

tfidf_word2vec = np.zeros((len(sentences), embedding_dim))
for i, sentence in enumerate(sentences):
    sentence_vec = np.zeros(embedding_dim)
    weighted_sum = 0.0
    for word in sentence:
        # Only tokens known to BOTH models contribute: the TF-IDF vocabulary
        # supplies the weight, Word2Vec supplies the vector.
        column = vocabulary.get(word)
        if column is None or word not in keyed_vectors:
            continue
        tfidf_score = tfidf_matrix[i, column]
        sentence_vec += keyed_vectors[word] * tfidf_score
        weighted_sum += tfidf_score
    # Normalize by the total weight; a description with no usable token keeps
    # an all-zero vector instead of dividing by zero.
    if weighted_sum > 0:
        sentence_vec /= weighted_sum
    tfidf_word2vec[i, :] = sentence_vec

# K-Means clustering: fit one model per candidate k and record its inertia
# (within-cluster sum of squared distances) for the elbow plot.
# n_init is set explicitly because scikit-learn changed its default from 10
# to 'auto' (with a FutureWarning in the releases before the switch); pinning
# it keeps results the same across versions.
k_values = range(2, 11)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(tfidf_word2vec)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve (inertia versus number of clusters)
plt.plot(k_values, inertia, marker='o')
plt.xlabel('Количество кластеров (k)')
plt.ylabel('Inertia')
plt.title('Метод локтя')
plt.show()

# Fix the number of clusters (hard-coded here) and fit the final K-Means
# model, storing each description's cluster label in the DataFrame.
# n_init is pinned because scikit-learn's default changed from 10 to 'auto';
# setting it explicitly keeps the clustering the same across versions and
# avoids the FutureWarning emitted by the transitional releases.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(tfidf_word2vec)

# Visualize the clusters: project the document vectors to two dimensions with
# t-SNE and draw one scatter series per cluster label.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(tfidf_word2vec)

cluster_labels = df['cluster'].to_numpy()

plt.figure(figsize=(10, 8))
for i in range(optimal_k):
    # Select the projected points that belong to cluster i.
    cluster_points = tsne_results[cluster_labels == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {i}')
plt.xlabel('t-SNE компонент 1')
plt.ylabel('t-SNE компонент 2')
plt.title('Визуализация кластеров с помощью t-SNE')
plt.legend()
plt.show()
