In [None]:
import spacy
# Загрузка научной модели SciSpacy
# nlp_sci = spacy.load("en_core_sci_sm")
nlp = spacy.load("en_core_web_sm")

from pymongo import MongoClient
import tqdm
from transformers import pipeline


In [None]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "true"

# Создание размеченного датасета для дообучения модели

In [68]:
categories = {
    "TECHNOLOGY": [
        'artificial intelligence', 'big data', 'internet of things (IoT)', 
        'cloud computing', 'quantum computing', 'edge computing', 
        'blockchain', '5g networks', 'augmented reality', 'virtual reality', 
        'serverless computing', 'digital twins', 'microservices', 
        'event-driven architecture', 'in-memory computing', 'fog computing', 
        'cyber-physical systems', 'software-defined networks', 
        'data lakehouse', 'data mesh', 'robotics process automation (RPA)',
        'human-computer interaction (HCI)', 'cybersecurity', 'distributed computing', 
        'neuro-symbolic AI', 'federated learning', 'privacy-preserving AI',
        'explainable AI (XAI)', 'multi-cloud environments', 'data observability',
        'green computing', 'energy-efficient AI', 'smart contracts',
        'autonomous systems', 'immersive technologies', 'zero-trust architecture',
        'context-aware computing', 'haptic technologies', 'nanotechnologies'
    ],
    "METHOD": [
        'principal component analysis (PCA)', 'bayesian methods', 
        'markov chains', 'gradient boosting', 'stochastic processes', 
        'time series decomposition', 'dynamic programming', 
        'feature engineering', 'dimensionality reduction', 
        'data sampling', 'data imputation', 'data augmentation', 
        'semi-supervised learning', 'self-supervised learning', 
        'meta-learning', 'few-shot learning', 'multi-task learning', 
        'transfer learning', 'ensemble learning', 
        'autoregressive integrated moving average (ARIMA)', 
        'exponential smoothing', 'kalman filtering', 'survival analysis', 
        'probabilistic graphical models', 'causal inference', 
        'topic modeling', 'outlier detection', 'active learning', 
        'neural architecture search', 'attention mechanisms', 
        'hierarchical clustering', 'contrastive learning',
        'adaptive boosting', 'bagging', 'maximum likelihood estimation',
        'expectation-maximization', 'manifold learning', 'spectral embedding',
        'regularization techniques', 'robust optimization', 
        'decision analysis', 'swarm intelligence'
    ],
    "ALGORITHM": [
        'k-means', 'decision trees', 'random forest', 'svm', 
        'naive bayes', 'knn', 'logistic regression', 'linear regression', 
        'genetic algorithms', 'gradient descent', 'simulated annealing', 
        'particle swarm optimization', 'hill climbing', 'pagerank', 
        'dijkstra algorithm', 'kruskal algorithm', 'prim algorithm', 
        'bipartite matching', 'shortest path algorithms', 
        'hierarchical clustering', 'dbscan', 'mean-shift clustering', 
        'spectral clustering', 'xgboost', 'lightgbm', 'catboost', 
        'deep q-learning', 'ppo (proximal policy optimization)', 
        'transformer models', 'variational inference', 
        'reinforcement learning algorithms', 'apriori', 'fp-growth', 
        'monte carlo tree search', 'long short-term memory networks (LSTM)',
        'convolutional neural networks (CNN)', 'self-organizing maps', 
        'word2vec', 'doc2vec', 'collaborative filtering algorithms', 
        'temporal difference learning', 'value iteration', 'policy gradient methods',
        'trust region policy optimization (TRPO)', 'asynchronous advantage actor-critic (A3C)'
    ],
    "TASK": [
        'classification', 'clustering', 'regression', 
        'dimensionality reduction', 'time series forecasting', 
        'anomaly detection', 'sentiment analysis', 'recommendation systems', 
        'topic modeling', 'translation', 'object detection', 
        'speech recognition', 'image segmentation', 'text summarization', 
        'document classification', 'entity recognition', 
        'causal modeling', 'explainable AI', 'automated feature extraction',
        'intelligent tutoring', 'robot path planning', 'game AI design', 
        'event detection', 'sequence labeling', 'action recognition',
        'domain adaptation', 'knowledge graph construction'
    ],
    "MODEL": [
        'linear regression', 'logistic regression', 'decision trees', 
        'random forest', 'k-nearest neighbors (kNN)', 'support vector machine (SVM)', 
        'naive bayes', 'multilayer perceptron (MLP)', 'convolutional neural networks (CNN)', 
        'recurrent neural networks (RNN)', 'transformers', 'bert', 
        'gpt', 'longformer', 'albert', 'graph neural networks (GNN)', 
        'autoencoders', 'generative adversarial networks (GAN)', 
        'tabular neural networks', 'variational autoencoders (VAE)', 
        'language models', 'vision transformers (ViT)', 'ensemble models', 
        'mixture of experts', 'meta-modeling', 'neuro-symbolic models',
        'zero-shot models', 'contrastive learning models', 'sequence-to-sequence models',
        'multi-modal transformers', 'attention-based models', 'biLSTMs', 'capsule networks'
    ],
    "TOOL": [
        'apache hadoop', 'apache spark', 'tensorflow', 'pytorch', 
        'scikit-learn', 'pandas', 'numpy', 'matplotlib', 
        'seaborn', 'plotly', 'd3.js', 'tableau', 'power bi', 
        'kafka', 'apache beam', 'apache flink', 'kubeflow', 
        'mlflow', 'grafana', 'superset', 'aws glue', 'databricks', 
        'nltk', 'spacy', 'gensim', 'flask', 'streamlit', 
        'fastapi', 'transformers library', 'huggingface datasets', 
        'langchain', 'ray tune', 'azure ml', 'google colab', 
        'anaconda', 'mlpack', 'jupyter lab', 'knime',
        'snowflake', 'airflow', 'datadog', 'matplotlib', 'dash', 
        'highcharts', 'supervised library', 'mlxtend'
    ],
    "FRAMEWORK": [
        'tensorflow', 'pytorch', 'keras', 'apache flink', 
        'apache storm', 'ray', 'dask', 'spark mllib', 
        'scikit-learn pipelines', 'fastai', 'hugging face transformers', 
        'langchain', 'ray tune', 'azure ml', 'google ai platform', 
        'kubeflow pipelines', 'mlflow tracking', 'microsoft synapse', 
        'amazon sagemaker', 'openvino', 'mlxtend', 'bigdl', 
        'deeplearning4j', 'onnx runtime', 'paddlepaddle'
    ],
    "DATA": [
        'structured data', 'unstructured data', 'semi-structured data', 
        'time series data', 'historical data', 'geospatial data', 
        'streaming data', 'synthetic data', 'metadata', 
        'bigquery datasets', 'delta lake datasets', 'data cubes', 
        'training data', 'test data', 'validation data', 
        'annotated datasets', 'graph data', 'relational data', 
        'vector embeddings', 'text corpora', 'speech data', 
        'event streams', 'tabular data', 'spatial-temporal datasets',
        'encrypted datasets', 'multi-view data'
    ],
    "PARAMETER": [
        'learning rate', 'batch size', 'number of layers', 
        'dropout rate', 'number of epochs', 'activation functions', 
        'optimizer settings', 'momentum', 'weight decay', 
        'regularization parameters', 'embedding size', 
        'sequence length', 'window size', 'attention heads', 
        'hidden layer size', 'decay rate', 'max iterations', 
        'gradient clipping', 'loss function', 'hyperparameter ranges', 
        'convergence thresholds'
    ],
    "METRIC": [
        'accuracy', 'precision', 'recall', 'f1-score', 
        'mean squared error (MSE)', 'root mean squared error (RMSE)', 
        'mean absolute error (MAE)', 'roc-auc score', 'log loss', 
        'silhouette score', 'calinski-harabasz index', 'perplexity', 
        'bleu score', 'rouge score', 'edit distance', 
        'cosine similarity', 'mean average precision (MAP)', 
        'dcg', 'ndcg', 'kendall rank correlation', 'pairwise precision', 
        'normalized mutual information (NMI)', 'jaccard index'
    ],
    "APPLICATION": [
        'predictive analytics', 'healthcare analytics', 'genomics', 
        'fraud detection', 'marketing analytics', 'recommender systems', 
        'financial analytics', 'cybersecurity', 'traffic prediction', 
        'robotics', 'autonomous vehicles', 'smart cities', 
        'natural language processing', 'time series forecasting', 
        'climate modeling', 'bioinformatics', 'customer lifetime value', 
        'gaming analytics', 'personalization systems', 'geospatial analysis', 
        'speech synthesis', 'virtual assistants', 'real-time analytics', 
        'demand forecasting', 'inventory optimization', 
        'e-commerce analytics', 'social media analytics', 
        'precision agriculture', 'renewable energy optimization', 
        'disaster management', 'financial fraud detection', 'edge AI applications'
    ]
}


In [None]:
#  шаблоны предложений
templates = [
    "The study demonstrates how {} and {} can improve the efficiency of {} tasks.",
    "Using {}, the authors were able to achieve significant improvements in {} with metrics like {}.",
    "{} and {} are critical for {} and have been explored in depth in this research.",
    "This paper compares {} and {} for solving problems related to {}.",
    "Advanced techniques such as {} and {} have been utilized in applications like {}.",
    "The experimental setup involved using {} for tasks like {}, achieving notable {}.",
    "Recent advancements in {} have opened new possibilities for improving {}.",
    "{} has been combined with {} to create innovative solutions for {}.",
    "In the context of {}, methods such as {} and {} were extensively evaluated.",
    "Key contributions include the integration of {} with {} for enhanced {}.",
    "The authors present a novel approach combining {} and {} to tackle challenges in {}.",
    "Performance improvements were observed when {} was applied alongside {} for {}.",
    "The methodology employs {} and {} for robust {} pipelines.",
    "This approach highlights the synergy between {} and {} in enhancing {} outcomes.",
    "Case studies demonstrate the practical applications of {} and {} in {} scenarios."
]

In [None]:
lowercase_terms = [term.lower() for term in domain_terms] from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

class DomainWeightedTfidfVectorizer(TfidfVectorizer):
    def __init__(self, domain_terms=None, domain_weight=2, **kwargs):
        """
        Инициализация кастомного TfidfVectorizer.
        :param domain_terms: Список или множество доменных терминов.
        :param domain_weight: Вес, на который умножаются доменные термины.
        :param kwargs: Остальные параметры TfidfVectorizer.
        """
        super().__init__(**kwargs)
        self.domain_terms = set(domain_terms) if domain_terms else set()
        self.domain_weight = domain_weight

    def transform(self, X):
        """
        Преобразование данных в матрицу признаков с увеличением веса доменных терминов.
        """
        X_tfidf = super().transform(X)
        if not self.domain_terms:
            return X_tfidf
        
        # Получение индексов доменных терминов
        feature_names = np.array(self.get_feature_names_out())
        domain_indices = [i for i, term in enumerate(feature_names) if term in self.domain_terms]

        # Увеличение весов для доменных терминов
        weights = np.ones(X_tfidf.shape[1])
        weights[domain_indices] *= self.domain_weight

        # Применение весов
        return X_tfidf.multiply(weights)

# Использование кастомного векторизатора
vectorizer = DomainWeightedTfidfVectorizer(domain_terms=domain_terms, domain_weight=2, stop_words='english')
standard_stopwords = list(stopwords.words('english'))
additional_stopwords = ['study','research','phd','use','university','abstract','published','thesis','paper','data','using','used']
full_stopwords = standard_stopwords + additional_stopwords

vectorizer_2 = DomainWeightedTfidfVectorizer(
    domain_terms=lowercase_terms,
    domain_weight=10,
    stop_words=full_stopwords,
    ngram_range=(1, 3),
    max_features=5000,
    max_df=0.9,  # Расширяем диапазон
    min_df=5  # Учитываем больше редких слов
) # 1. Функция для настройки UMAP
def configure_umap(n_neighbors=50, n_components=5, metric='cosine'):
    """
    Настройка модели UMAP для уменьшения размерности.
    :param n_neighbors: Количество соседей для локального графа.
    :param n_components: Размерность выходного пространства.
    :param metric: Метрика расстояния.
    :return: Настроенный объект UMAP.
    """
    return UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric=metric,
        random_state=42  # Фиксация для воспроизводимости
    )

# 2. Функция для настройки HDBSCAN
def configure_hdbscan(min_cluster_size=10, min_samples=1, metric='euclidean', 
                      cluster_selection_method='leaf', cluster_selection_epsilon=0.1):
    """
    Настройка модели HDBSCAN для кластеризации.
    :param min_cluster_size: Минимальный размер кластера.
    :param min_samples: Минимальное число точек для плотности.
    :param metric: Метрика расстояния.
    :param cluster_selection_method: Метод выбора кластеров ('eom' или 'leaf').
    :param cluster_selection_epsilon: Допустимая разреженность кластеров. Чем больше значение, тем больше объединённых кластеров.
    :return: Настроенный объект HDBSCAN.
    """
    return hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=metric,
        cluster_selection_method=cluster_selection_method,
        cluster_selection_epsilon=cluster_selection_epsilon,  # Установка epsilon
        prediction_data=True  # Для работы с новыми данными
    )

# 3. Функция для уменьшения размерности эмбеддингов
def reduce_embeddings(embeddings, n_components=50):
    """
    Уменьшает размерность эмбеддингов с помощью PCA.
    :param embeddings: Исходные эмбеддинги.
    :param n_components: Количество компонентов PCA.
    :return: Эмбеддинги с уменьшенной размерностью.
    """
    pca_model = PCA(n_components=n_components, random_state=42)
    return pca_model.fit_transform(embeddings)  # Настройка компонентов
umap_model = configure_umap(n_neighbors=15, n_components=5, metric='cosine')
hdbscan_model = configure_hdbscan(min_cluster_size=30, min_samples=20, metric='euclidean',cluster_selection_epsilon=0.25)
embeddings_reduced = reduce_embeddings(embeddings_SCIBERT, n_components=12)

# Создание модели BERTopic
topic_model = BERTopic(vectorizer_model=vectorizer_2, umap_model=umap_model,hdbscan_model=hdbscan_model,)

# Обучение модели на предобработанных текстах с вашими эмбеддингами
topics, probs = topic_model.fit_transform(documents, embeddings_reduced)

# Процесс дообучения модели

In [1]:
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import evaluate
from tqdm import tqdm

# Шаг 1. Загрузка синтетического размеченного датасета
with open("synthetic_ner_dataset.json", "r", encoding="utf-8") as f:
    synthetic_data = json.load(f)

# Преобразование в формат Hugging Face
def convert_to_hf_format(data):
    tokens = [entry["tokens"] for entry in data]
    ner_tags = [entry["ner_tags"] for entry in data]
    return {"tokens": tokens, "ner_tags": ner_tags}

hf_data = convert_to_hf_format(synthetic_data)

# Создание Dataset
dataset = Dataset.from_dict(hf_data)

# Разделение на обучающую и тестовую выборки
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

# Шаг 2. Токенизация
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Уникальные тэги
unique_labels = list(set(tag for tags in hf_data["ner_tags"] for tag in tags))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Токенизация и выравнивание меток
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",  # Добавляем автоматическое выравнивание
        max_length=512
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Связь токенов и слов
        aligned_labels = []
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Игнорируем специальные токены ([CLS], [SEP])
            else:
                aligned_labels.append(label2id[label[word_id]])
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Шаг 3. Загрузка модели
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

# Шаг 4. Метрики оценки
metric = evaluate.load("seqeval")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = predictions.argmax(-1)
    true_labels = [
        [id2label[label] for label in label_seq if label != -100]
        for label_seq in labels
    ]
    true_predictions = [
        [id2label[p] for (p, l) in zip(pred_seq, label_seq) if l != -100]
        for pred_seq, label_seq in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Шаг 5. DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)



training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8,
    num_train_epochs=1,  
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    fp16=True,  # Используем смешанную точность
)

# Шаг 7. Обучение с использованием Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



Train size: 8000, Test size: 2000


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.007,0.001241,0.999588,0.999588,0.999588,0.9999


TrainOutput(global_step=1000, training_loss=0.1377083191871643, metrics={'train_runtime': 20081.8453, 'train_samples_per_second': 0.398, 'train_steps_per_second': 0.05, 'total_flos': 2090770931712000.0, 'train_loss': 0.1377083191871643, 'epoch': 1.0})

In [5]:
# Шаг 8. Сохранение модели
trainer.save_model("./trained_scibert_ner_model")
tokenizer.save_pretrained("./trained_scibert_ner_model")

print("Model training completed and saved.")


Model training completed and saved.


# Тестирование

In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Путь к сохранённой модели
model_path = "./trained_scibert_ner_model"

# Загрузка токенизатора и модели
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)


In [10]:
# Создание NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Пример текста для тестирования


# Разметка текста
ner_results = ner_pipeline(example_text)

# Печать результатов
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entity: had, Label: DATA, Score: 0.5473
Entity: ##oop, Label: DATA, Score: 0.5225
Entity: apache spark, Label: TOOL, Score: 0.9924
Entity: distributed computing, Label: TECHNOLOGY, Score: 0.9979
Entity: principal component analysis, Label: METHOD, Score: 0.9988
Entity: gradient boosting, Label: METHOD, Score: 0.9987
Entity: data, Label: APPLICATION, Score: 0.6238
Entity: processing, Label: METHOD, Score: 0.3217


# NER 

In [None]:
client = MongoClient('localhost', 27017)
collection = client['VKR1']['proccessed_for_topic_modelling']

In [72]:
# Список доменных терминов (можно расширить)
domain_terms = [
    # Big Data Technologies and Platforms
    'hadoop', 'spark', 'flink', 'storm', 'kafka', 'hive', 'pig', 'cassandra',
    'hbase', 'mongodb', 'redis', 'elasticsearch', 'solr', 'apache beam',
    'apache samza', 'apache apex', 'presto', 'apache drill', 'apache tez',
    'kudu', 'druid', 'nifi', 'zookeeper', 'oozie', 'mahout', 'sqoop', 'ambari',
    'zeppelin', 'mesos', 'alluxio', 'yarn', 'mapreduce', 'azure hdinsight',
    'amazon emr', 'google cloud dataproc', 'cloudera', 'hortonworks', 'databricks',
    'clickhouse', 'apache ignite', 'azure synapse', 'dataproc', 'data fusion',
    'bigquery', 'snowflake', 'redshift', 'minio', 'delta lake', 'lakehouse',
    'trino', 'apache kylin', 'apache carbondata', 'hudi', 'iceberg', 'dremio',
    'bigtable', 'open tsdb', 'prometheus', 'grafana loki', 'vectorized io',
    'openwhisk', 'kubeflow', 'mlflow', 'apache pulsar', 'apache superset',
    'apache ranger', 'datadog', 'new relic', 'dynatrace', 'splunk', 'logstash',
    'fluentd', 'apache arrow', 'delta sharing', 'microsoft fabric', 'aws glue',
    'athena', 'databricks unity catalog', 'citus', 'greenplum', 'hypertable',
    'apache pinot', 'apache heron', 'hazelcast', 'couchbase','ai',

    # Programming Languages and Tools
    'python', 'scala', 'java', 'r', 'sql', 'nosql', 'julia', 'matlab', 'sas',
    'perl', 'go', 'c++', 'rust', 'haskell', 'pig latin', 'hiveql', 'bash',
    'shell scripting', 'php', 'typescript', 'javascript', 'groovy', 'terraform',
    'ansible', 'ruby', 'fortran', 'erlang', 'f#', 'kotlin', 'elixir', 'nim',
    'clojure', 'prolog', 'scheme', 'smalltalk', 'powershell', 'visual basic',
    'tcl', 'awk', 'sed', 'dart', 'cobol', 'vbscript', 'actionscript', 'apl',
    'postgresql', 'mariadb', 'google bigquery sql', 'pl/pgsql', 'solidity',

    # Machine Learning and AI
    'machine learning', 'deep learning', 'clustering', 'classification',
    'regression', 'association rules', 'k-means', 'random forest',
    'gradient boosting', 'svm', 'neural networks', 'decision trees',
    'bayesian methods', 'markov chains', 'genetic algorithms', 'reinforcement learning',
    'exponential smoothing', 'pca', 't-sne', 'lda', 'linear regression',
    'logistic regression', 'naive bayes', 'knn', 'deep neural networks', 'rnn',
    'cnn', 'autoencoders', 'transformers', 'attention mechanism', 'seq2seq models',
    'ensemble learning', 'bagging', 'boosting', 'xgboost', 'lightgbm', 'catboost',
    'hyperparameter tuning', 'grid search', 'bayesian optimization',
    'self-supervised learning', 'contrastive learning', 'meta-learning',
    'few-shot learning', 'multi-task learning', 'generative adversarial networks',
    'variational autoencoders', 'graph neural networks', 'language models',
    'bpe tokenization', 'word2vec', 'fasttext', 'doc2vec', 'sentence transformers',
    'roberta', 'gpt', 'scibert', 'longformer', 'xlm', 'electra', 'albert',
    'vision transformers (vit)', 'bertology', 'zero-shot learning',
    'one-shot learning', 'distillation', 'federated gans', 'tabnet', 'deepar',
    'fastai', 'hugging face transformers', 'pinecone', 'haystack', 'openai apis',
    'langchain', 'llamaindex', 'embeddings',

    # Algorithms and Data Structures
    'bloom filter', 'count-min sketch', 'hyperloglog', 'hashing', 'sharding',
    'partitioning', 'consistent hashing', 'merkle trees', 'trie', 'b-tree',
    'skip lists', 'graph algorithms', 'pagerank', 'giraph', 'pregel', 'gelly',
    'priority queues', 'heap', 'hash tables', 'dynamic programming',
    'shortest path algorithms', 'a-star', 'dijkstra', 'kruskal', 'prim',
    'suffix trees', 'trie structures', 'radix sort', 'red-black trees', 'quadtree',
    'k-d tree', 'segment tree', 'fenwick tree', 'bit masking', 'splay tree',
    'bloomier filters', 'min-heap', 'max-heap', 'hashmaps', 'sparse matrices',
    'dense matrices', 'adjacency lists', 'minimax algorithm',
    'monte carlo tree search', 'simulated annealing', 'tabu search',
    'bellman-ford', 'convex hull', 'suffix arrays', 'lzw compression',
    'sparse neural networks', 'bipartite graphs', 'euler paths',
    'hamiltonian paths', 'approximation algorithms',

    # Application Areas
    'predictive analytics', 'natural language processing', 'computer vision',
    'time series analysis', 'social network analysis', 'bioinformatics',
    'recommender systems', 'internet of things', 'cybersecurity', 'financial analytics',
    'stock market analysis', 'marketing analytics', 'big graph processing',
    'geospatial analysis', 'sentiment analysis', 'digital health', 'personalization',
    'audio analytics', 'robotics', 'autonomous vehicles', 'genomics', 'climate modeling',
    'smart cities', 'energy optimization', 'fraud detection', 'e-commerce analytics',
    'supply chain analytics', 'edge analytics', 'telecommunications analytics',
    'healthcare analytics', 'environmental monitoring', 'disaster prediction',
    'text summarization', 'translation systems', 'video analytics', 'speech recognition',
    'image segmentation', 'retail analytics', 'transportation analytics',
    'insurance analytics', 'gaming analytics', 'agrotech analytics',
    'educational data mining', 'precision medicine', 'quantum computing',
    'remote sensing', 'smart retail', '5g networks analytics', 'streaming media analytics',
    'virtual reality data', 'blockchain data analysis',

    # Concepts and Approaches
    'etl', 'elt', 'stream processing', 'parallel computing', 'distributed systems',
    'in-memory computing', 'lambda architecture', 'kappa architecture', 'cap theorem',
    'base approach', 'acid transactions', 'horizontal scaling', 'vertical scaling',
    'data governance', 'data quality', 'data modeling', 'semantic web', 'microservices',
    'containerization', 'orchestration', 'event-driven architecture',
    'real-time analytics', 'batch processing', 'stream analytics', 'stateful processing',
    'stateless processing', 'low-latency processing', 'adaptive streaming',
    'data wrangling', 'federated learning', 'data pipelines', 'data aggregation',
    'event sourcing', 'data lakes', 'metadata management', 'data versioning',
    'data lineage', 'data observability', 'composable data', 'decentralized data',
    'dataops', 'edge computing', 'digital twins', 'neuro-symbolic ai',
    'knowledge distillation',

    # Data Visualization
    'tableau', 'power bi', 'qlikview', 'd3.js', 'matplotlib', 'seaborn', 'plotly',
    'ggplot2', 'kibana', 'grafana', 'superset', 'dash', 'bokeh', 'gephi',
    'data storytelling', 'heatmaps', 'scatter plots', 'box plots', 'histograms',
    'time series visualization', 'network visualization', 'interactive dashboards',
    'infographics', 'sankey diagrams', 'word clouds', 'sunburst charts', 'radar charts',
    'treemaps', 'correlation plots', 'observable plot', 'rawgraphs', 'flourish',
    'highcharts', 'chart.js', 'piktochart', 'anychart', 'datawrapper', 'infogram',
    'veusz', 'charticulator'

    # Добавленные термины из domain_terms
    'Big Data','large-scale data', 'Data Analysis', 'Data Mining', 'Data Science', 'Artificial Intelligence',
    'MapReduce', 'NoSQL', 'SQL', 'Data Warehousing', 'Business Intelligence',
    'Prescriptive Analytics', 'Descriptive Analytics', 'Apache Hive', 'Apache Pig',
    'Apache Flink', 'Apache Storm', 'Apache Cassandra', 'HBase', 'PyTorch', 'Keras',
    'Scikit-learn', 'Pandas', 'NumPy', 'Data Cleaning', 'Data Integration',
    'AWS', 'Google Cloud Platform', 'Microsoft Azure', 'MLOps', 'Data Pipeline',
    'OLAP', 'OLTP', 'Fog Computing', 'Feature Engineering', 'Dimensionality Reduction',
    'Principal Component Analysis', 'Singular Value Decomposition', 'Data Anonymization',
    'Privacy Preserving Data Mining', 'GDPR', 'Data Ethics', 'Database Management Systems',
    'SQL Server', 'Teradata', 'Data Catalog', 'Data Lineage', 'Data Lakehouse',
    'Structured Data', 'Unstructured Data', 'Semi-Structured Data', 'Distributed Computing',
    'HDFS', 'Columnar Databases', 'Graph Databases', 'Neo4j', 'Data Preprocessing',
    'Data Sampling', 'Data Imputation', 'Anomaly Detection', 'Recurrent Neural Networks',
    'Long Short-Term Memory', 'Attention Mechanisms', 'Data Monetization', 'Data Strategy',
    'Data Literacy', 'Data Democratization', 'Self-Service Analytics', 'Augmented Analytics',
    'Explainable AI', 'AutoML', 'Synthetic Data', 'Data-Driven Decision Making',
    'DataOps', 'Data Stewardship', 'Data Scientist', 'Data Engineer', 'Data Analyst',
    'Data Architect', 'Chief Data Officer', 'Scalability', 'Throughput', 'Fault Tolerance',
    'Load Balancing', 'API', 'RESTful Services', 'Parquet', 'Avro', 'ORC', 'Apache Arrow',
    'Data Storage', 'Data Retrieval', 'ETL Tools', 'Informatica', 'Talend', 'Pentaho',
    'SSIS', 'Data Enrichment', 'Data Warehouse Automation', 'Data Ingestion', 'Data Retention',
    'Data Archiving', 'Data Lifecycle Management', 'Data Replication', 'Master Data Management',
    'Reference Data', 'Data Provenance', 'Data Virtualization', 'Data Federation',
    'Data Consolidation', 'Data Blending', 'Data Cubes', 'Data Mesh', 'Data Fabric',
    'Data Tokenization', 'Data Security', 'Data Privacy', 'Data Loss Prevention',
    'Access Control', 'Authentication', 'Authorization', 'Role-Based Access Control',
    'Identity and Access Management', 'Encryption at Rest', 'Encryption in Transit',
    'SSL', 'TLS', 'Single Sign-On', 'Data Compliance', 'HIPAA', 'PCI DSS', 'Data Breach',
    'Data Incident Response', 'High Availability', 'Serverless Computing',
    'Function as a Service', 'Platform as a Service', 'Infrastructure as a Service',
    'Software as a Service', 'Apache Iceberg', 'Apache Hudi', 'CAP Theorem',
    'Consistency', 'Availability', 'Partition Tolerance', 'ACID', 'BASE', 'CQRS',
    'Message Queues', 'RabbitMQ', 'ActiveMQ', 'ZeroMQ', 'MQTT', 'Streaming Data',
    'Kafka Streams', 'Kinesis', 'Pub/Sub', 'Event Hubs', 'Continuous Integration',
    'Continuous Deployment', 'Agile Methodology', 'Scrum', 'Data Transformation',
    'Data Normalization', 'Standardization', 'One-Hot Encoding', 'Label Encoding',
    'Cross-Validation', 'Random Search', 'K-Fold Cross Validation', 'Ensemble Methods',
    'Stacking', 'Transfer Learning', 'Model Deployment', 'Model Serving', 'Model Monitoring',
    'A/B Testing', 'Concept Drift', 'Model Retraining', 'Model Explainability',
    'SHAP Values', 'LIME', 'Partial Dependence Plots', 'Data Annotation', 'Labeling',
    'Overfitting', 'Underfitting', 'Bias-Variance Tradeoff', 'Regularization',
    'Elastic Net', 'Dropout', 'Early Stopping', 'Mini-Batch Gradient Descent',
    'Adam Optimizer', 'Activation Functions', 'Sigmoid Function', 'Softmax Function',
    'Cost Function', 'Loss Function', 'Backpropagation', 'Epoch', 'Batch Size',
    'Data Silo', 'Edge AI', 'Data Compression', 'UMAP', 'Data Partitioning',
    'Data Sharding', 'Bloom Filters', 'Data Streaming Algorithms', 'Real-Time Analytics',
    'Apache Druid', 'ClickHouse', 'OLAP Cubes', 'Data Marketplace', 'Data Exchange',
    'Data Brokerage', 'Data Licensing', 'Data Contracts', 'Data Quality Metrics',
    'Data Accuracy', 'Data Completeness', 'Data Timeliness', 'Data Conformity',
    'Data Uniqueness', 'Data Validation', 'Data Profiling', 'Data Observability',
    'Data SLOs', 'Anonymization Techniques', 'K-Anonymity', 'Differential Privacy',
    'Homomorphic Encryption', 'Secure Multi-Party Computation', 'Zero-Knowledge Proofs',
    'Blockchain', 'Smart Contracts', 'Consensus Algorithms', 'Proof of Work',
    'Proof of Stake', 'Machine Learning Lifecycle', 'Data Acquisition', 'Data Processing',
    'Model Building', 'Model Evaluation', 'Model Maintenance', 'Experiment Tracking',
    'Weights & Biases', 'Data Privacy Laws', 'CCPA', 'Data Residency', 'Hybrid Cloud',
    'Multicloud', 'Data Compression Techniques', 'Video Analytics', 'Audio Analytics',
    'Topic Modeling', 'Hierarchical Clustering', 'DBSCAN', 'Affinity Propagation',
    'Mean Shift', 'K-Nearest Neighbors', 'Multilayer Perceptron', 'Polynomial Regression',
    'Ridge Regression', 'Lasso Regression', 'Elastic Net Regression', 'ARIMA Models',
    'Autocorrelation', 'Cross-Correlation', 'Spectral Analysis', 'Fourier Transform',
    'Wavelets', 'Mutual Information', 'Chi-Square Test', 'Recursive Feature Elimination',
    'Univariate Selection', 'Canonical Correlation Analysis', 'Discriminant Analysis',
    'Cluster Analysis', 'Cohort Analysis', 'Customer Lifetime Value', 'Churn Prediction',
    'Market Basket Analysis', 'Apriori Algorithm', 'FP-Growth Algorithm', 'Hybrid Recommenders',
    'Matrix Factorization', 'Isolation Forest', 'One-Class SVM', 'Local Outlier Factor',
    'CRISP-DM', 'SEMMA', 'KDD', 'Information Retrieval', 'Named Entity Recognition',
    'Latent Dirichlet Allocation', 'Latent Semantic Analysis', 'Word Embeddings', 'GloVe',
    'BERT', 'Autoencoders', 'Data Masking', 'Data Swapping', 'Re-identification Risk',
    'DAMA-DMBOK', 'DCAM', 'Data Stewardship Council', 'Data Quality Tools',
    'Data Integration Tools', 'Data Governance Tools', 'Data Catalog Tools',
    'Data Lineage Tools', 'Data Masking Tools', 'Data Encryption Tools',
    'Data Backup and Recovery Tools', 'Cloud Storage', 'Object Storage', 'Block Storage',
    'File Storage', 'SAN', 'NAS', 'Storage Tiers', 'Cold Storage', 'Warm Storage',
    'Data Retention Policies', 'Data Disposal', 'Data Destruction', 'Shredding',
    'Degaussing', 'Physical Destruction', 'Data Forensics', 'Incident Response',
    'SIEM', 'SOAR', 'Threat Intelligence', 'Vulnerability Management',
    'Penetration Testing', 'Ethical Hacking', 'Security Awareness Training', 'Phishing',
    'Malware', 'Ransomware', 'Security Monitoring', 'Logging', 'Auditing',
    'Compliance Auditing', 'Regulatory Compliance', 'Data Legislation',
    'Privacy Impact Assessment', 'Risk Management', 'Business Continuity Planning',
    'Green Computing', 'Energy Efficiency', 'Thermal Management', 'Cooling Systems',
    'Virtualization', 'Hypervisors', 'Virtual Machines', 'Containers',
    'Cloud Native Applications', 'Workflow Management', 'Apache Airflow', 'Luigi',
    'Azkaban', 'Data Flow', 'ETL vs ELT', 'Change Data Capture', 'Query Optimization',
    'Cost-Based Optimization', 'Rule-Based Optimization', 'Data Formats', 'YAML',
    'ProtoBuf', 'Thrift', 'Data Serialization', 'Data Deserialization', 'Marshaling',
    'Unmarshaling', 'Operational Data Store', 'Staging Area', 'Sandbox Environment',
    'Data Environments', 'Version Control', 'Branching Strategies', 'Trunk-Based Development',
    'Code Review', 'Pull Requests', 'Merge Requests', 'Build Automation', 'Test Automation',
    'Unit Testing', 'Integration Testing', 'System Testing', 'Acceptance Testing',
    'Regression Testing', 'Test-Driven Development', 'Behavior-Driven Development',
    'API Contracts', 'SLAs', 'SLOs', 'KPIs', 'Metrics', 'Monitoring', 'Alerting',
    'Observability', 'Tracing', 'Metrics Collection', 'Dashboards', 'Reports',
    'Choropleth Maps', 'Bar Charts', 'Line Charts', 'Pie Charts', 'Interactive Visualizations',
    'Natural Language Query', 'Voice Interfaces', 'Chatbots', 'Virtual Assistants',
    'Cognitive Computing', 'Knowledge Graphs', 'Ontologies', 'Taxonomies', 'RDF',
    'SPARQL', 'OWL', 'Open Data', 'Data Sharing', 'FAIR Principles', 'Data Repositories',
    'Data Archives', 'Data Curation', 'Data Lifecycle', 'Data Management Plan',
    'Metadata Standards', 'Dublin Core', 'Data Citation', 'Data Journals', 'Open Science',
    'Data Collaborations', 'Data Consortia', 'Data Partnerships', 'Data Commons',
    'Data Cooperatives', 'Data Crowdsourcing', 'Citizen Science','algorithm'
]

lowercase_terms = [term.lower() for term in domain_terms]

In [None]:
import tqdm
from transformers import pipeline
from pymongo import MongoClient
from collections import Counter
import matplotlib.pyplot as plt

# Загрузка дообученной модели и токенизатора
model_path = "./trained_scibert_ner_model"
ner_pipeline = pipeline("ner", model=model_path, tokenizer=model_path, aggregation_strategy="simple")

# Список исключаемых сущностей
EXCLUDED_ENTITIES = {"data", "big data", "drivers", "dedicated", "driver"}

# Удаление субтокенов, коротких, повторяющихся и нерелевантных сущностей
def postprocess_entities(entities):
    """
    Удаляет некорректные, короткие, повторяющиеся и нерелевантные сущности.
    """
    seen = set()
    processed = [
        {"text": entity["text"].replace("##", "").strip().lower(), "label": entity["label"]}
        for entity in entities
    ]
    return [
        e for e in processed
        if len(e["text"]) > 3 and e["text"] not in EXCLUDED_ENTITIES and (e["text"], e["label"]) not in seen and not seen.add((e["text"], e["label"]))
    ]

# Извлечение сущностей
def extract_entities_ner(text):
    """
    Извлекает и обрабатывает сущности из текста.
    """
    results = ner_pipeline(text)
    entities = [{"text": res["word"], "label": res["entity_group"]} for res in results]
    return postprocess_entities(entities)

# Подсчет статистики
def compute_statistics(entities):
    """
    Вычисляет статистику по извлеченным сущностям.
    """
    total_entities = len(entities)
    unique_entities = set(entity["text"] for entity in entities)
    category_counts = Counter(entity["label"] for entity in entities)
    avg_length = sum(len(entity["text"]) for entity in entities) / total_entities if total_entities else 0
    min_length = min((len(entity["text"]) for entity in entities), default=0)
    max_length = max((len(entity["text"]) for entity in entities), default=0)

    return {
        "total": total_entities,
        "unique": len(unique_entities),
        "category_counts": category_counts,
        "avg_length": avg_length,
        "min_length": min_length,
        "max_length": max_length,
    }

# Вывод статистики
def print_statistics(stats):
    """
    Выводит статистику на экран.
    """
    print("\n=== Общая статистика ===")
    print(f"Всего извлечено сущностей: {stats['total']}")
    print(f"Уникальных сущностей: {stats['unique']}")
    for category, count in stats['category_counts'].items():
        print(f" - {category}: {count} сущностей")
    print(f"Средняя длина сущности: {stats['avg_length']:.2f} символов")
    print(f"Минимальная длина: {stats['min_length']} символов")
    print(f"Максимальная длина: {stats['max_length']} символов")
    print("=======================\n")

# Построение графика распределения категорий
def plot_category_distribution(category_counts, title="Распределение категорий"):
    """
    Визуализация распределения категорий.
    """
    plt.figure(figsize=(10, 6))
    plt.bar(category_counts.keys(), category_counts.values(), color='skyblue')
    plt.title(title, fontsize=14)
    plt.ylabel('Количество сущностей', fontsize=12)
    plt.xlabel('Категории', fontsize=12)
    plt.xticks(rotation=45)
    plt.show()

# Анализ документов из MongoDB
def process_documents(limit=10):
    """
    Анализирует документы из MongoDB и выводит обработанные сущности со статистикой.
    """
    client = MongoClient('localhost', 27017)
    collection = client['VKR1']['proccessed_for_topic_modelling']
    cursor = collection.find({}, {"_id": 1, "abstract_bert_emb": 1}).limit(limit)

    all_entities = []

    for doc in tqdm.tqdm(cursor, desc="NER-анализ"):
        text = doc.get("abstract_bert_emb", "")
        if not text or len(text) <= 50:
            continue

        entities = extract_entities_ner(text)
        all_entities.extend(entities)
        print(f"Документ ID {doc['_id']}:")
        if entities:
            for entity in entities:
                print(f" - Сущность: {entity['text']}, Категория: {entity['label']}")
            stats = compute_statistics(entities)
            print_statistics(stats)
        else:
            print(" - Сущности не найдены.")
        print("-" * 50)

    # Итоговая статистика
    print("\n=== Итоговая статистика по всем документам ===")
    total_stats = compute_statistics(all_entities)
    print_statistics(total_stats)

    # Построение графика
    plot_category_distribution(total_stats['category_counts'], title="Итоговое распределение категорий")

    # Частотный анализ сущностей
    entity_frequencies = Counter(entity["text"] for entity in all_entities)
    print("Топ 10 самых частых сущностей:")
    for entity, freq in entity_frequencies.most_common(10):
        print(f"{entity}: {freq} раз(а)")

# Запуск анализа документов
process_documents(limit=30)