In [1]:
class LDAInputs:
    def __init__(self):
        # 1. 文本语料
        self.documents = [
            "反射望远镜使用曲面镜形成图像",
            "牛顿在1668年发明了反射望远镜", 
            "主要类型包括牛顿式、卡塞格林式",
            "反射望远镜没有色差问题",
            "折射望远镜使用透镜而不是镜子"
        ]
        
        # 2. 预处理参数
        self.preprocessing_params = {
            'remove_stopwords': True,
            'tokenization': True,
            'lemmatization': True,
            'min_word_length': 2,
            'max_df': 0.95,  # 忽略出现在95%以上文档中的词
            'min_df': 2,     # 忽略出现少于2次的词
        }
        
        # 3. LDA超参数
        self.lda_params = {
            'n_topics': 3,           # 主题数量
            'max_iter': 1000,        # 迭代次数
            'random_state': 42,      # 随机种子
            'learning_method': 'batch'
        }

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import defaultdict

class LDATreeProcess:
    def __init__(self, documents, n_topics=3):
        self.documents = documents
        self.n_topics = n_topics
        self.vectorizer = None
        self.lda_model = None
        self.topic_tree = None
    
    def preprocess_text(self):
        """步骤1: 文本预处理和向量化"""
        self.vectorizer = CountVectorizer(
            max_df=0.95, 
            min_df=2,
            stop_words='english',  # 中文需要自定义停用词
            token_pattern=r'\b[a-zA-Z]{3,}\b'  # 匹配至少3个字母的单词
        )
        
        # 创建文档-词项矩阵
        self.document_term_matrix = self.vectorizer.fit_transform(self.documents)
        self.feature_names = self.vectorizer.get_feature_names_out()
        
        print("文档-词项矩阵形状:", self.document_term_matrix.shape)
        print("特征词数量:", len(self.feature_names))
    
    def apply_lda(self):
        """步骤2: 应用LDA模型"""
        self.lda_model = LatentDirichletAllocation(
            n_components=self.n_topics,
            max_iter=1000,
            random_state=42,
            learning_method='batch'
        )
        
        # 训练LDA模型
        self.lda_model.fit(self.document_term_matrix)
        
        # 获取文档-主题分布
        self.document_topic_dist = self.lda_model.transform(self.document_term_matrix)
        
        # 获取主题-词项分布
        self.topic_word_dist = self.lda_model.components_
    
    def analyze_topics(self):
        """步骤3: 分析主题结果"""
        self.topic_keywords = {}
        
        for topic_idx, topic in enumerate(self.topic_word_dist):
            # 获取每个主题最重要的词
            top_keyword_indices = topic.argsort()[:-10:-1]  # 取前10个关键词
            top_keywords = [self.feature_names[i] for i in top_keyword_indices]
            
            self.topic_keywords[topic_idx] = {
                'keywords': top_keywords,
                'keyword_weights': topic[top_keyword_indices]
            }
            
            print(f"主题 {topic_idx}: {', '.join(top_keywords)}")
    
    def build_topic_hierarchy(self):
        """步骤4: 构建主题层次树"""
        # 创建根节点
        root = {
            'node_id': 'knowledge_root',
            'content': '文档主题知识',
            'node_type': 'root',
            'children': [],
            'level': 0,
            'topic_distribution': None
        }
        
        # 为每个主题创建分支节点
        for topic_idx in range(self.n_topics):
            topic_info = self.topic_keywords[topic_idx]
            
            topic_node = {
                'node_id': f'topic_{topic_idx}',
                'content': f"主题 {topic_idx}: {' '.join(topic_info['keywords'][:3])}",
                'node_type': 'topic_branch',
                'keywords': topic_info['keywords'],
                'children': [],
                'level': 1,
                'topic_coherence': self.calculate_topic_coherence(topic_idx)
            }
            
            # 将文档分配到对应的主题下
            self._assign_documents_to_topic(topic_idx, topic_node)
            
            root['children'].append(topic_node)
        
        self.topic_tree = root
        return root
    
    def _assign_documents_to_topic(self, topic_idx, topic_node):
        """将文档分配到主题节点下"""
        for doc_idx, doc_topic_dist in enumerate(self.document_topic_dist):
            dominant_topic = np.argmax(doc_topic_dist)
            
            if dominant_topic == topic_idx:
                confidence = doc_topic_dist[topic_idx]
                
                doc_node = {
                    'node_id': f'doc_{doc_idx}',
                    'content': self.documents[doc_idx],
                    'node_type': 'document_leaf',
                    'topic_confidence': confidence,
                    'children': [],
                    'level': 2
                }
                
                topic_node['children'].append(doc_node)
    
    def calculate_topic_coherence(self, topic_idx):
        """计算主题一致性分数"""
        # 简化的主题一致性计算
        keywords = self.topic_keywords[topic_idx]['keywords'][:5]
        return len(keywords) / 5.0  # 简化计算

In [3]:
class LDAOutputs:
    def __init__(self, lda_process):
        self.process = lda_process
        self.extract_outputs()
    
    def extract_outputs(self):
        """提取LDA分析的所有输出"""
        self.outputs = {
            # 1. 主题模型输出
            'topic_model': {
                'topic_keywords': self.process.topic_keywords,
                'document_topic_distribution': self.process.document_topic_dist,
                'topic_word_distribution': self.process.topic_word_dist
            },
            
            # 2. 树结构输出
            'knowledge_tree': self.process.topic_tree,
            
            # 3. 统计信息
            'statistics': {
                'n_topics': self.process.n_topics,
                'n_documents': len(self.process.documents),
                # 'topic_coherence_scores': self._get_topic_coherence_scores(),
                # 'document_assignment': self._get_document_assignments()
            },
            
            # 4. 可视化数据
            'visualization_data': {
                # 'topic_relationships': self._analyze_topic_relationships(),
                # 'keyword_networks': self._build_keyword_networks()
            }
        }
    
    def get_tree_structure(self):
        """获取最终的树结构"""
        return self.outputs['knowledge_tree']
    
    
    def print_tree_summary(self):
        """打印树结构摘要"""
        tree = self.outputs['knowledge_tree']
        print("=== LDA生成的树结构摘要 ===")
        print(f"根节点: {tree['content']}")
        print(f"主题分支数量: {len(tree['children'])}")
        
        for topic_node in tree['children']:
            print(f"\n主题 {topic_node['node_id']}:")
            print(f"  关键词: {', '.join(topic_node['keywords'][:5])}")
            print(f"  包含文档: {len(topic_node['children'])}个")
            print(f"  主题一致性: {topic_node['topic_coherence']:.3f}")

In [4]:
def complete_lda_tree_workflow():
    # 输入数据
    documents = [
        "A reflecting telescope uses curved mirrors to form an image",
        "Isaac Newton invented the reflecting telescope in 1668",
        "Main types include Newtonian and Cassegrain designs", 
        "Reflecting telescopes eliminate chromatic aberration",
        "Refracting telescopes use lenses instead of mirrors",
        "The Hubble Space Telescope is a famous reflector",
        "Mirror materials evolved from metal to glass with coatings",
        "Large telescopes use segmented mirror designs"
    ]
    
    # 中间处理过程
    lda_processor = LDATreeProcess(documents, n_topics=3)
    
    print("步骤1: 文本预处理...")
    lda_processor.preprocess_text()
    
    print("\n步骤2: 训练LDA模型...")
    lda_processor.apply_lda()
    
    print("\n步骤3: 分析主题...")
    lda_processor.analyze_topics()
    
    print("\n步骤4: 构建主题树...")
    topic_tree = lda_processor.build_topic_hierarchy()
    
    # 输出结果
    output_analyzer = LDAOutputs(lda_processor)
    output_analyzer.print_tree_summary()
    
    return topic_tree

# 执行完整流程
final_knowledge_tree = complete_lda_tree_workflow()

步骤1: 文本预处理...
文档-词项矩阵形状: (8, 7)
特征词数量: 7

步骤2: 训练LDA模型...

步骤3: 分析主题...
主题 0: telescope, reflecting, mirrors, telescopes, designs, mirror, use
主题 1: mirror, designs, use, telescopes, reflecting, mirrors, telescope
主题 2: telescopes, use, mirrors, reflecting, designs, mirror, telescope

步骤4: 构建主题树...
=== LDA生成的树结构摘要 ===
根节点: 文档主题知识
主题分支数量: 3

主题 topic_0:
  关键词: telescope, reflecting, mirrors, telescopes, designs
  包含文档: 3个
  主题一致性: 1.000

主题 topic_1:
  关键词: mirror, designs, use, telescopes, reflecting
  包含文档: 3个
  主题一致性: 1.000

主题 topic_2:
  关键词: telescopes, use, mirrors, reflecting, designs
  包含文档: 2个
  主题一致性: 1.000


In [5]:
!pip install bertopic

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting bertopic
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/98/05/2d6b305391efff89c2b4cf19cf847f971ca163eb5c149d0d2ffac0a9c7ed/bertopic-0.17.3-py3-none-any.whl (153 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a3/ef/32c8a0b3dc6e6c4e433b85b30c3723d8eb48d115c0185b82ab89e1a0ef89/hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m5.4 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hCollecting umap-learn>=0.5.0 (from bertopic)
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/6b/b1/c24deeda9baf1fd491aaad941ed89e0fed6c583a117fd7b79e0a33a1e6c0/umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/3f/93/023955c26b0ce614342d11cc0652f1e45e3239

In [6]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
 
docs = [
    "A reflecting telescope (also called a reflector) is a telescope that uses a single or a combination of curved mirrors that reflect light and form an image. The reflecting telescope was invented in the 17th century by Isaac Newton as an alternative to the refracting telescope which, at that time, was a design that suffered from severe chromatic aberration. Although reflecting telescopes produce other types of optical aberrations, it is a design that allows for very large diameter objectives. Almost all of the major telescopes used in astronomy research are reflectors. Reflecting telescopes come in many design variations and may employ extra optical elements to improve image quality or place the image in a mechanically advantageous position. Since reflecting telescopes use mirrors, the design is sometimes referred to as a catoptric telescope.",
    "From the time of Newton to the 1800s, the mirror itself was made of metal usually speculum metal. This type included Newton's first designs and even the largest telescopes of the 19th century, the Leviathan of Parsonstown with a 1.8 meter wide metal mirror. In the 19th century a new method using a block of glass coated with very thin layer of silver began to become more popular by the turn of the century. Common telescopes which led to the Crossley and Harvard reflecting telescopes, which helped establish a better reputation for reflecting telescopes as the metal mirror designs were noted for their drawbacks. Chiefly the metal mirrors only reflected about of the light and the metal would tarnish. After multiple polishings and tarnishings, the mirror could lose its precise figuring needed.",
    "Reflecting telescopes became extraordinarily popular for astronomy and many famous telescopes, such as the Hubble Space Telescope, and popular amateur models use this design. In addition, the reflection telescope principle was applied to other electromagnetic wavelengths, and for example, X-ray telescopes also use the reflection principle to make image-forming optics.",
    "## History. The idea that curved mirrors behave like lenses dates back at least to Alhazen's 11th century treatise on optics, works that had been widely disseminated in Latin translations in early modern Europe. Soon after the invention of the refracting telescope, Galileo, Giovanni Francesco Sagredo, and others, spurred on by their knowledge of the principles of curved mirrors, discussed the idea of building a telescope using a mirror as the image forming objective. There were reports that the Bolognese Cesare Caravaggi had constructed one around 1626 and the Italian professor Niccolò Zucchi, in a later work, wrote that he had experimented with a concave bronze mirror in 1616, but said it did not produce a satisfactory image. The potential advantages of using parabolic mirrors, primarily reduction of spherical aberration with no chromatic aberration, led to many proposed designs for reflecting telescopes. The most notable being James Gregory, who published an innovative design for a ‘reflecting’ telescope in 1663. It would be ten years (1673), before the experimental scientist Robert Hooke was able to build this type of telescope, which became known as the Gregorian telescope.",
    "Five years after Gregory designed his telescope and five years before Hooke built the first such Gregorian telescope, Isaac Newton in 1668 built his own reflecting telescope, which is generally acknowledged as the first reflecting telescope. It used a spherically ground metal primary mirror and a small diagonal mirror in an optical configuration that has come to be known as the Newtonian telescope.",
    "Despite the theoretical advantages of the reflector design, the difficulty of construction and the poor performance of the speculum metal mirrors being used at the time meant it took over 100 years for them to become popular. Many of the advances in reflecting telescopes included the perfection of parabolic mirror fabrication in the 18th century, silver coated glass mirrors in the 19th century (built by Léon Foucault in 1858), long-lasting aluminum coatings in the 20th century, segmented mirrors to allow larger diameters, and active optics to compensate for gravitational deformation. A mid-20th century innovation was catadioptric telescopes such as the Schmidt camera, which use both a spherical mirror and a lens (called a corrector plate) as primary optical elements, mainly used for wide-field imaging without spherical aberration.",
    "The late 20th century has seen the development of adaptive optics and lucky imaging to overcome the problems of seeing, and reflecting telescopes are ubiquitous on space telescopes and many types of spacecraft imaging devices.",
    "## Technical considerations. A curved primary mirror is the reflector telescope's basic optical element that creates an image at the focal plane. The distance from the mirror to the focal plane is called the focal length. Film or a digital sensor may be located here to record the image, or a secondary mirror may be added to modify the optical characteristics and/or redirect the light to film, digital sensors, or an eyepiece for visual observation.",
    "The primary mirror in most modern telescopes is composed of a solid glass cylinder whose front surface has been ground to a spherical or parabolic shape. A thin layer of aluminum is vacuum deposited onto the mirror, forming a highly reflective first surface mirror.",
    "Some telescopes use primary mirrors which are made differently. Molten glass is rotated to make its surface paraboloidal, and is kept rotating while it cools and solidifies. (See Rotating furnace.) The resulting mirror shape approximates a desired paraboloid shape that requires minimal grinding and polishing to reach the exact figure needed.",
    "## Optical errors. Reflecting telescopes, just like any other optical system, do not produce 'perfect' images. The need to image objects at distances up to infinity, view them at different wavelengths of light, along with the requirement to have some way to view the image the primary mirror produces, means there is always some compromise in a reflecting telescope's optical design.",
    "Because the primary mirror focuses light to a common point in front of its own reflecting surface almost all reflecting telescope designs have a secondary mirror, film holder, or detector near that focal point partially obstructing the light from reaching the primary mirror. Not only does this cause some reduction in the amount of light the system collects, it also causes a loss in contrast in the image due to diffraction effects of the obstruction as well as diffraction spikes caused by most secondary support structures.",
    "The use of mirrors avoids chromatic aberration but they produce other types of aberrations. A simple spherical mirror cannot bring light from a distant object to a common focus since the reflection of light rays striking the mirror near its edge do not converge with those that reflect from nearer the center of the mirror, a defect called spherical aberration. To avoid this problem most reflecting telescopes use parabolic shaped mirrors, a shape that can focus all the light to a common focus.",
    "Parabolic mirrors work well with objects near the center of the image they produce, (light traveling parallel to the mirror's optical axis), but towards the edge of that same field of view they suffer from off axis aberrations: Coma – an aberration where point sources (stars) at the center of the image are focused to a point but typically appears as 'comet-like' radial smudges that get worse towards the edges of the image. Field curvature – The best image plane is in general curved, which may not correspond to the detector's shape and leads to a focus error across the field. It is sometimes corrected by a field flattening lens. Astigmatism – an azimuthal variation of focus around the aperture causing point source images off-axis to appear elliptical. Astigmatism is not usually a problem in a narrow field of view, but in a wide field image it gets rapidly worse and varies quadratically with field angle. Distortion – Distortion does not affect image quality (sharpness) but does affect object shapes. It is sometimes corrected by image processing.",
    "There are reflecting telescope designs that use modified mirror surfaces (such as the Ritchey–Chrétien telescope) or some form of correcting lens (such as catadioptric telescopes) that correct some of these aberrations.",
    "## Use in astronomical research. Nearly all large research-grade astronomical telescopes are reflectors. There are several reasons for this: Reflectors work in a wider spectrum of light since certain wavelengths are absorbed when passing through glass elements like those found in a refractor or in a catadioptric telescope. In a lens the entire volume of material has to be free of imperfection and inhomogeneities, whereas in a mirror, only one surface has to be perfectly polished. Light of different wavelengths travels through a medium other than vacuum at different speeds. This causes chromatic aberration. Reducing this to acceptable levels usually involves a combination of two or three aperture sized lenses. The cost of such systems therefore scales significantly with aperture size. An image obtained from a mirror does not suffer from chromatic aberration to begin with, and the cost of the mirror scales much more modestly with its size.",
    "There are structural problems involved in manufacturing and manipulating large-aperture lenses. Since a lens can only be held in place by its edge, the center of a large lens will sag due to gravity, distorting the image it produces. The largest practical lens size in a refracting telescope is around 1 meter. In contrast, a mirror can be supported by the whole side opposite its reflecting face, allowing for reflecting telescope designs that can overcome gravitational sag. The largest reflector designs currently exceed 10 meters in diameter.",
    "## Reflecting telescope designs. ## Gregorian. The Gregorian telescope, described by Scottish astronomer and mathematician James Gregory in his 1663 book 'Optica Promota', employs a concave secondary mirror that reflects the image back through a hole in the primary mirror. This produces an upright image, useful for terrestrial observations. Some small spotting scopes are still built this way. There are several large modern telescopes that use a Gregorian configuration such as the Vatican Advanced Technology Telescope, the Magellan telescopes, the Large Binocular Telescope, and the Giant Magellan Telescope.",
    "## Newtonian. The Newtonian telescope was the first successful reflecting telescope, completed by Isaac Newton in 1668. It usually has a paraboloid primary mirror but at focal ratios of about f/10 or longer a spherical primary mirror can be sufficient for high visual resolution. A flat secondary mirror reflects the light to a focal plane at the side of the top of the telescope tube. It is one of the simplest and least expensive designs for a given size of primary, and is popular with amateur telescope makers as a home-build project.",
    "## The Cassegrain design and its variations. The cassegrain telescope (sometimes called the 'Classic Cassegrain') was first published in a 1672 design attributed to Laurent Cassegrain. It has a parabolic primary mirror, and a hyperbolic secondary mirror that reflects the light back down through a hole in the primary. The folding and diverging effect of the secondary mirror creates a telescope with a long focal length while having a short tube length.",
    "## Ritchey–Chrétien. The Ritchey–Chrétien telescope, invented by George Willis Ritchey and Henri Chrétien in the early 1910s, is a specialized Cassegrain reflector which has two hyperbolic mirrors (instead of a parabolic primary). It is free of coma and spherical aberration at a nearly flat focal plane if the primary and secondary curvature are properly figured, making it well suited for wide field and photographic observations. Almost every professional reflector telescope in the world is of the Ritchey–Chrétien design.",
    "## Three-mirror anastigmat. Including a third curved mirror allows correction of the remaining distortion, astigmatism, from the Ritchey–Chrétien design. This allows much larger fields of view.",
    "## Dall–Kirkham. The Dall–Kirkham Cassegrain telescope's design was created by Horace Dall in 1928 and took on the name in an article published in 'Scientific American' in 1930 following discussion between amateur astronomer Allan Kirkham and Albert G. Ingalls. It uses a concave elliptical primary mirror and a convex spherical secondary. While this system is easier to grind than a classic Cassegrain or Ritchey–Chrétien system, it does not correct for off-axis coma.",
    "Field curvature is actually less than a classical Cassegrain. Because this is less noticeable at longer focal ratios, Dall–Kirkhams are seldom faster than f/15.",
    "## Off-axis designs. There are several designs that try to avoid obstructing the incoming light by eliminating the secondary or moving any secondary element off the primary mirror's optical axis, commonly called off-axis optical systems.",
    "## Herschelian. The Herschelian reflector is named after William Herschel, who used this design to build very large telescopes including the 40-foot telescope in 1789. In the Herschelian reflector the primary mirror is tilted so the observer's head does not block the incoming light. Although this introduces geometrical aberrations, Herschel employed this design to avoid the use of a Newtonian secondary mirror since the speculum metal mirrors of that time tarnished quickly and could only achieve 60% reflectivity.",
    "## Schiefspiegler. A variant of the Cassegrain, the Schiefspiegler telescope ('skewed' or 'oblique reflector') uses tilted mirrors to avoid the secondary mirror casting a shadow on the primary. However, while eliminating diffraction patterns this leads to an increase in coma and astigmatism. These defects become manageable at large focal ratios — most Schiefspieglers use f/15 or longer, which tends to restrict useful observation to the Moon and planets.",
    "## Stevick-Paul. Stevick-Paul telescopes are off-axis versions of Paul 3-mirror systems with an added flat diagonal mirror. A convex secondary mirror is placed just to the side of the light entering the telescope, and positioned afocally so as to send parallel light on to the tertiary.",
    "## Yolo. The Yolo was developed by Arthur S. Leonard in the mid-1960s. Like the Schiefspiegler, it is an unobstructed, tilted reflector telescope.",
    "## Liquid-mirror telescopes. One design of telescope uses a rotating mirror consisting of a liquid metal in a tray that is spun at constant speed. As the tray spins, the liquid forms a paraboloidal surface of essentially unlimited size. This allows making very big telescope mirrors (over 6 metres), but unfortunately they cannot be steered, as they always point vertically.",
    "## Focal planes. ## Prime focus. In a 'prime focus' design no secondary optics are used, the image is accessed at the focal point of the primary mirror. At the focal point is some type of structure for holding a film plate or electronic detector. In the past, in very large telescopes, an observer would sit inside the telescope in an 'observing cage' to directly view the image or operate a camera. Nowadays CCD cameras allow for remote operation of the telescope from almost anywhere in the world. The space available at prime focus is severely limited by the need to avoid obstructing the incoming light. Radio telescopes often have a prime focus design. The mirror is replaced by a metal surface for reflecting radio waves, and the observer is an antenna.",
    "## Cassegrain focus. For telescopes built to the Cassegrain design or other related designs, the image is formed behind the primary mirror, at the focal point of the secondary mirror. An observer views through the rear of the telescope, or a camera or other instrument is mounted on the rear.",
    "## Nasmyth and coudé focus. ## Nasmyth. The Nasmyth design is similar to the Cassegrain except the light is not directed through a hole in the primary mirror; instead, a third mirror reflects the light to the side of the telescope to allow for the mounting of heavy instruments.",
    "## Coudé. Adding further optics to a Nasmyth-style telescope to deliver the light (usually through the declination axis) to a fixed focus point that does not move as the telescope is reoriented gives a coudé focus (from the French word for elbow). The coudé focus gives a narrower field of view than a Nasmyth focus and is used with very heavy instruments that do not need a wide field of view.",
    "## Fibre-fed spectrographs. For instruments requiring very high stability, or that are very large and cumbersome, it is desirable to mount the instrument on a rigid structure, rather than moving it with the telescope. Whilst transmission of the full field of view would require a standard coudé focus, spectroscopy typically involves the measurement of only a few discrete objects, such as stars or galaxies. It is therefore feasible to collect light from these objects with optical fibers at the telescope, placing the instrument at an arbitrary distance from the telescope. Examples of fiber-fed spectrographs include the planet-hunting spectrographs HARPS or ESPRESSO."
]

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,35,-1_the_of_to_in,"[the, of, to, in, mirror, telescope, and, is, ...",[## History. The idea that curved mirrors beha...


In [8]:
topic_model.get_topic(0)


False