In [None]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.8/493.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0


In [None]:
import pandas as pd
import pandas as pd
from konlpy.tag import Okt
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel
import re
import gensim.corpora as corpora
import spacy
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import silhouette_score
import numpy as np

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive. The
# data-loading cell reads its inputs from /content/drive/MyDrive/URP, so this
# cell has to run first. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the community questions and give the two Korean text columns
# ('제목' = title, '질문' = question body) English names in one chained step.
data = (
    pd.read_csv('/content/drive/MyDrive/URP/Questions_old.csv')
    .rename(columns={'제목': 'title', '질문': 'questions'})
)

# Korean stop-word list.
# NOTE(review): read_csv treats the first line as a header by default, so if
# stopwords-ko.txt is a plain one-word-per-line list its first word becomes
# the column name instead of a row — confirm, and consider header=None.
stop_words = pd.read_csv('/content/drive/MyDrive/URP/stopwords-ko.txt')

# Show the resulting column names as a quick sanity check of the rename.
print(data.columns)


Index(['문서 번호', 'title', 'questions', '위치 고유번호', '조회수', '질문 날짜'], dtype='object')


In [None]:
# Step 1: Define preprocessing function for cleaning Korean text
def preprocess_text_korean(text):
    """
    Clean raw text so that only Korean and English words remain.

    Every character that is not a Hangul syllable (U+AC00 to U+D7A3), an ASCII
    letter, or whitespace is replaced by a space. Runs of whitespace are then
    collapsed to a single space and the result is stripped.

    Parameters
    ----------
    text : str or missing value
        Raw text. NaN / None is treated as empty; any other non-string value
        is converted with str() first.

    Returns
    -------
    str
        The cleaned text, or "" for missing input.
    """
    if pd.isna(text):  # Handle NaN values
        return ""
    # Replace disallowed characters first: each one becomes a space, so the
    # whitespace collapse has to run afterwards, otherwise removed digits and
    # punctuation leave runs of spaces behind in the "cleaned" text.
    text = re.sub(r'[^\uAC00-\uD7A3a-zA-Z\s]', ' ', str(text))
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Step 2: Combine title and questions columns for preprocessing
# Missing titles/questions are treated as empty strings. Without fillna, a NaN
# in either column turns the whole concatenation into NaN, and the cleaning
# step then maps that row to "" — throwing away the text that WAS present.
data['combined_text'] = data['title'].fillna('') + " " + data['questions'].fillna('')

# Step 3: Apply text preprocessing to the combined text
data['cleaned_text'] = data['combined_text'].apply(preprocess_text_korean)

# Step 4: Define tokenization function using Okt tokenizer

# Common words removed from the extracted nouns. Built once instead of on
# every call; the contents are unchanged.
_COMMON_WORDS = frozenset({
    '질문', '곳', '제', '요', '좀', '어디', '저', '분', '수', '중',
    '정도', '시', '있나요', '알려주세요', '있을까요', '하는데', '혹시',
})

# Shared tokenizer, created on first use (see _get_okt).
_okt_tokenizer = None


def _get_okt():
    """Return the shared Okt tokenizer, constructing it on the first call only."""
    global _okt_tokenizer
    if _okt_tokenizer is None:
        _okt_tokenizer = Okt()
    return _okt_tokenizer


def preprocess_korean(text):
    """
    Tokenize Korean text into nouns while removing common stopwords.

    Parameters
    ----------
    text : object
        Text to tokenize; converted with str() before tokenization.

    Returns
    -------
    list of str
        Nouns reported by Okt, in order, excluding single-character tokens and
        the words in _COMMON_WORDS.
    """
    # The tokenizer is reused across calls: this function runs once per
    # document, and building a new Okt() each time is wasted work.
    return [
        noun
        for noun in _get_okt().nouns(str(text))
        if len(noun) > 1 and noun not in _COMMON_WORDS
    ]

# Step 5: Analyze community topics using LDA
def analyze_community_topics(data, num_topics=6):
    """
    Fit an LDA topic model on the cleaned questions and summarise each topic.

    Steps:
    - Tokenize data['cleaned_text'] and keep documents with more than 3 tokens
    - Build a dictionary and bag-of-words corpus, dropping documents whose
      tokens were all removed by the dictionary's frequency filter
    - Train the LDA model
    - Compute a cosine silhouette score over the document-topic vectors,
      using each document's dominant topic as its cluster label
    - Collect the top terms and example questions for each topic

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'cleaned_text' column.
    num_topics : int, default 6
        Number of LDA topics.

    Returns
    -------
    topics_info : list of dict
        One dict per topic, in topic-id order, with keys
        'terms' (the 10 highest-weighted (term, weight) pairs) and
        'examples' (up to 3 (text, probability) pairs, highest probability
        first, restricted to documents with probability > 0.4 for the topic).
    silhouette_avg : float
        The silhouette score, or NaN when it is undefined (fewer than two
        distinct dominant topics, or as many clusters as documents).

    Raises
    ------
    ValueError
        If no document is left to train on after tokenization and filtering.
    """

    # Step 5.1: Process documents for topic modeling
    docs = []
    raw_docs = []
    for text in data['cleaned_text']:
        if pd.notna(text):
            tokens = preprocess_korean(text)
            if len(tokens) > 3:
                docs.append(tokens)
                raw_docs.append(text)

    # Step 5.2: Create dictionary and corpus for LDA
    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=5, no_above=0.3)

    # A document whose tokens were all filtered out has an empty bag of words.
    # It carries no evidence about any topic, so keeping it would only add
    # uninformative points to the clustering evaluation below.
    corpus = []
    kept_raw_docs = []
    for text, tokens in zip(raw_docs, docs):
        bow = dictionary.doc2bow(tokens)
        if bow:
            corpus.append(bow)
            kept_raw_docs.append(text)

    if not corpus:
        raise ValueError(
            "No documents left for topic modeling after tokenization and "
            "dictionary filtering."
        )

    # Step 5.3: Train the LDA model
    # LdaModel is the name this notebook imports from gensim.models; the
    # `models` module itself is never imported.
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=20,
        alpha='symmetric'
    )

    # Step 5.4: Calculate silhouette score to evaluate topic clustering
    # Fill a dense (documents x topics) matrix by topic id. gensim can omit
    # topics whose probability falls below its internal floor even with
    # minimum_probability=0.0, so rows are not guaranteed to have num_topics
    # entries; indexing by id keeps every row aligned.
    topic_vectors = np.zeros((len(corpus), num_topics))
    for doc_id, bow in enumerate(corpus):
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
            topic_vectors[doc_id, topic_id] = prob

    # Assign each document to its dominant topic
    dominant_topics = np.argmax(topic_vectors, axis=1)

    # silhouette_score needs at least 2 clusters and fewer clusters than samples
    n_labels = len(set(dominant_topics.tolist()))
    if 2 <= n_labels <= len(corpus) - 1:
        silhouette_avg = silhouette_score(topic_vectors, dominant_topics, metric='cosine')
    else:
        silhouette_avg = float('nan')

    # Step 5.5: Extract top terms and example documents for each topic
    # The matrix computed above is reused, so inference runs once per document
    # rather than once per (topic, document) pair.
    topics_info = []
    for topic_id in range(num_topics):
        # Get the top terms for the topic
        top_terms = lda_model.show_topic(topic_id, topn=10)

        # Documents strongly associated with this topic, strongest first
        topic_probs = topic_vectors[:, topic_id]
        ranked_ids = sorted(
            (doc_id for doc_id in range(len(corpus)) if topic_probs[doc_id] > 0.4),
            key=lambda doc_id: topic_probs[doc_id],
            reverse=True,
        )
        examples = [
            (kept_raw_docs[doc_id], float(topic_probs[doc_id]))
            for doc_id in ranked_ids[:3]
        ]

        # Append the topic information
        topics_info.append({
            'terms': top_terms,
            'examples': examples
        })

    return topics_info, silhouette_avg

# Step 6: Run topic analysis on the dataset
# The score is bound to `silhouette_avg`, not `silhouette_score`: the latter is
# the scikit-learn function that analyze_community_topics calls, and rebinding
# it to a number makes any later run of the analysis fail with a TypeError
# (the float is not callable).
topics_info, silhouette_avg = analyze_community_topics(data)

# Step 7: Display the results
print(f"Silhouette Score: {silhouette_avg:.3f}\n")
print("Community Question Topics:\n")
for idx, topic in enumerate(topics_info, 1):
    print(f"\nTopic {idx}:")
    print("Key Terms:")
    # One "term(weight)" entry per top term, weights shown to 3 decimals
    print(", ".join(f"{term}({prob:.3f})" for term, prob in topic['terms']))

    print("\nExample Questions:")
    for text, prob in topic['examples']:
        # Show at most the first 100 characters of each example question
        print(f"- [{prob:.2f}] {text[:100]}...")
    print("-" * 80)

Silhouette Score: 0.596

Community Question Topics:


Topic 1:
Key Terms:
전주(0.051), 임플란트(0.051), 치과(0.046), 시술(0.034), 김제(0.018), 기숙사(0.017), 추천(0.016), 익산(0.015), 병원(0.014), 여수(0.013)

Example Questions:
- [0.96] 질문김제 백구면에 임플란트 시술 중 환자의 구강 상태에 맞    김제 백구면에 임플란트 시술 중 환자의 구강 상태에 맞춘 맞춤형 치료를 제공하는 치과가 있나요...
- [0.96] 질문김제시 요촌동       번지의 영문 표기 Daum지도에서 살펴보니 전라북도 김제시 요촌동       번지 는 향교길 골목길에 접해있는 단독주택입니다  김제시 요촌동      ...
- [0.95] 질문전주 삼천동에서 임플란트 시술 후 환자 맞춤형 회복 계    전주 삼천동에서 임플란트 시술 후 환자 맞춤형 회복 계획을 제공하는 치과는 어디인가요...
--------------------------------------------------------------------------------

Topic 2:
Key Terms:
고등학교(0.043), 학교(0.034), 내신(0.021), 지역(0.016), 여고(0.015), 학원(0.015), 중학교(0.013), 전학(0.012), 학생(0.010), 학년(0.010)

Example Questions:
- [0.98] 질문부산 사직고등학교 이사벨고등학교 저는 중  여학생인데요 중학교는 널널하고 빡센편은 아닌 학교에 진학중이에요 내신성적 퍼센테이지는        정도입니당 여고는 부산진여고 성모여...
- [0.97] 질문군산여고랑 영광여고에 대해 알려주세요 군산여고와 영광여고중에서 어디를 갈지 고민하는 중 입니다  이 두 학교에 대해서 질문이 있습니다    군산여고 내신따기어렵다는데 얼마나 어...
- [0.97] 질문연천중 중 여학생 해운대