In [None]:
import pandas as pd
import numpy as np
import random
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load the museum dataset. The classifier below predicts the 'City' column
# from the free-text 'Description' column.
df = pd.read_csv("cleaned_museums.csv", encoding='utf-8')

# Fetch the NLTK resources used in this script. Recent NLTK releases load the
# tokenizer data from 'punkt_tab' rather than 'punkt', so request both to keep
# word_tokenize working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = stopwords.words('russian')

# Rows with a missing description or label cannot be vectorised or used as a
# target, so leave them out of the supervised data (df itself is untouched).
labelled = df.dropna(subset=['Description', 'City'])
X = labelled['Description']
y = labelled['City']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features: drop terms found in more than 95% of the documents or in
# fewer than two documents, and ignore Russian stop words.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stop_words)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print(classification_report(y_test, y_pred))

# Pin the label order explicitly. Left to itself, confusion_matrix only uses
# the labels that occur in y_test / y_pred, which need not match
# model.classes_ (the labels seen during training); the heatmap ticks would
# then be misaligned with the matrix rows and columns.
labels = sorted(set(model.classes_) | set(y_test))
cm = confusion_matrix(y_test, y_pred, labels=labels)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


# Tokenise every description. The corpus is Russian (the classifier above uses
# Russian stop words), so select the Russian Punkt model instead of the
# English default; missing descriptions are skipped because the tokenizer
# only accepts strings.
sentences = [
    word_tokenize(text, language='russian')
    for text in df['Description'].dropna()
]

# Single source of truth for the n-gram order shared by the preprocessing
# pipeline and the model.
ngram_order = 2

# padded_everygram_pipeline pads each sentence with boundary symbols and
# returns the training n-grams together with the flat token stream used to
# build the vocabulary.
train_data, padded_sents = padded_everygram_pipeline(ngram_order, sentences)

ngram_model = MLE(ngram_order)
ngram_model.fit(train_data, padded_sents)

def generate_text_statistical(model, num_words=50):
    """Generate text by sampling one token at a time from an n-gram model.

    Each draw is conditioned on the previously generated token (a bigram-style
    walk) and starts from the sentence-start symbol. Sentence-boundary padding
    symbols returned by the model are not part of the text: they are left out
    of the result, and the walk restarts from the sentence-start symbol so the
    next word is drawn as the beginning of a new sentence.

    Args:
        model: Object exposing ``generate(num_words, text_seed)`` that returns
            a single token when asked for one word (e.g. a fitted
            ``nltk.lm.MLE``).
        num_words: Number of draws to make from the model. Draws that yield a
            padding symbol are dropped, so the result holds at most this many
            tokens.

    Returns:
        The generated tokens joined by single spaces; ``""`` when nothing was
        generated.
    """
    # Default padding symbols used by NLTK's padded_everygram_pipeline.
    sentence_start, sentence_end = "<s>", "</s>"

    words = []
    context = (sentence_start,)
    for _ in range(num_words):
        token = model.generate(1, context)
        if token in (sentence_start, sentence_end):
            # Boundary marker: keep it out of the text and begin a new
            # sentence instead of conditioning on the marker itself.
            context = (sentence_start,)
            continue
        words.append(token)
        context = (token,)
    return ' '.join(words)

# Show ten samples from the fitted n-gram model, numbered from 1. The
# generator is lazy, so each text is produced right before it is printed.
statistical_samples = (
    generate_text_statistical(ngram_model, num_words=50) for _ in range(10)
)
for i, generated_text in enumerate(statistical_samples):
    print(f"Generated text {i + 1}:\n{generated_text}\n")

# Visual gap between the two groups of generated texts.
print("\n")

def generate_text_neural_simple(sentences, num_words=50):
    """Return a random selection of tokens from one randomly chosen sentence.

    Despite the name, no neural model is involved: a sentence is picked at
    random and up to ``num_words`` of its tokens are sampled without
    replacement, in random order.

    Args:
        sentences: Non-empty sequence of token lists.
        num_words: Upper bound on the number of tokens in the result.

    Returns:
        The sampled tokens joined by single spaces.
    """
    chosen = random.choice(sentences)
    sample_size = min(num_words, len(chosen))
    picked_tokens = random.sample(chosen, sample_size)
    return ' '.join(picked_tokens)

# Show ten token-shuffle samples, numbered from 1. The generator is lazy, so
# each text is produced right before it is printed.
shuffled_samples = (
    generate_text_neural_simple(sentences, num_words=50) for _ in range(10)
)
for i, generated_text in enumerate(shuffled_samples):
    print(f"Generated text {i + 1} (Simple Neural):\n{generated_text}\n")


AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)