In [None]:
import pandas as pd
import nltk
import spacy
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import re
from numpy import argmax

# Load the museum descriptions into a DataFrame.
# NOTE(review): the absolute "/content/" path looks environment-specific
# (e.g. a hosted notebook) - confirm before running elsewhere.
df = pd.read_csv("/content/cleaned_museums.csv")

# Fetch the NLTK resources used below: the tokenizer models behind
# nltk.word_tokenize and the stop-word lists. Recent NLTK releases look the
# tokenizer up under 'punkt_tab' instead of 'punkt', so both are requested to
# keep the script working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(nltk.corpus.stopwords.words('russian'))

def preprocess_text(text):
    """Turn a raw description into a list of filtered, lowercase tokens.

    Every character that is neither a word character nor whitespace is
    removed, the text is lowercased and tokenized with nltk.word_tokenize,
    and tokens that are in the module-level ``stop_words`` set or are two
    characters or shorter are dropped.

    Args:
        text: The raw description. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are treated as empty.

    Returns:
        A list of token strings; empty when ``text`` is not a string.
    """
    # Guard against missing values: re.sub would raise TypeError on NaN/None.
    if not isinstance(text, str):
        return []

    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    return [
        token
        for token in nltk.word_tokenize(cleaned)
        if len(token) > 2 and token not in stop_words
    ]

# Tokenize and filter every description with preprocess_text.
df['processed_text'] = df['Description'].apply(preprocess_text)

# The vectorizers take one string per document, so the token lists are joined
# back with spaces. Fitting on these cleaned strings (instead of the raw
# 'Description' column) is what makes the preprocessing above actually affect
# the topic models.
processed_docs = df['processed_text'].apply(' '.join)

# Number of highest-weighted terms shown per topic.
N_TOP_WORDS = 10


def _print_topics(model, names, label):
    """Print the N_TOP_WORDS highest-weighted terms of each model component.

    Args:
        model: A fitted estimator exposing ``components_`` (one row per topic).
        names: Vocabulary terms, indexed by the column position in
            ``components_``.
        label: Text printed in front of the 1-based topic number.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; reverse it and keep the leading entries.
        top_words = [names[i] for i in topic.argsort()[::-1][:N_TOP_WORDS]]
        print(f"{label} {topic_idx + 1}: {top_words}\n")


# --- LDA on raw term counts ---
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
doc_term_matrix = vectorizer.fit_transform(processed_docs)

lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(doc_term_matrix)

feature_names = vectorizer.get_feature_names_out()
_print_topics(lda, feature_names, "Тема")

# --- LSA (truncated SVD) on TF-IDF weights ---
vectorizer_lsa = TfidfVectorizer(max_df=0.95, min_df=2)
doc_term_matrix_lsa = vectorizer_lsa.fit_transform(processed_docs)

lsa = TruncatedSVD(n_components=5, random_state=42)
lsa.fit(doc_term_matrix_lsa)

feature_names_lsa = vectorizer_lsa.get_feature_names_out()
_print_topics(lsa, feature_names_lsa, "Тема LSA")

# --- NMF on the same count matrix as LDA ---
nmf = NMF(n_components=5, random_state=42)
nmf.fit(doc_term_matrix)
# Show the NMF topics as well, so the fitted model is not silently discarded.
_print_topics(nmf, feature_names, "Тема NMF")

# Load the spaCy pipeline "ru_core_news_sm" that extract_entities relies on.
try:
    nlp = spacy.load("ru_core_news_sm")
except OSError:
    # The handler treats OSError from spacy.load as "model not installed".
    print("Модель spaCy 'ru_core_news_sm' не найдена. Пожалуйста, загрузите её.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the site module for interactive sessions, and a non-zero
    # status tells the caller that the run failed.
    raise SystemExit(1)

def extract_entities(text):
    """Group the named entities spaCy finds in ``text`` by entity label.

    Args:
        text: The text to analyse with the module-level ``nlp`` pipeline.
            Non-string values (for example the NaN pandas produces for an
            empty CSV cell) are treated as containing no entities.

    Returns:
        A dict mapping each entity label (``ent.label_``) to the list of
        entity texts carrying that label, in order of appearance. Empty when
        no entities are found or ``text`` is not a string.
    """
    # Guard against missing values: the pipeline only accepts strings.
    if not isinstance(text, str):
        return {}

    entities = {}
    for ent in nlp(text).ents:
        entities.setdefault(ent.label_, []).append(ent.text)
    return entities

# Named entities of every description, as a {label: [texts]} dict per row.
df['entities'] = df['Description'].apply(extract_entities)

# Per-document topic distribution from the fitted LDA model; each document is
# assigned to its most probable topic. The argmax does not depend on the loop
# variable, so it is computed once up front.
topic_probabilities = lda.transform(doc_term_matrix)
dominant_topics = argmax(topic_probabilities, axis=1)

for i in range(lda.n_components):
    top_words = [feature_names[j] for j in lda.components_[i].argsort()[:-10 - 1:-1]]
    print(f"\nТема LDA {i+1}: {top_words}")

    # Boolean mask (positional, one entry per row) of documents in topic i.
    topic_indices = dominant_topics == i
    entities_in_topic = []
    for doc_entities in df['entities'][topic_indices]:
        # Each cell is a dict; extending with the dict itself would add only
        # its keys (the entity labels), so collect the entity texts instead.
        for entity_texts in doc_entities.values():
            entities_in_topic.extend(entity_texts)
    print("Сущности в теме:", entities_in_topic)