## 1. Data collection

In [1]:
import requests
import pandas as pd
import time
from datetime import datetime
import html # for decoding HTML entities

STACK_API_KEY = 'rl_dcgdAD45Dx2PKtP3xdw9FFoB2' 
# -------------------------------------------------------------

if STACK_API_KEY is None: 
    print("*"*60)
    print("Warning: STACK_API_KEY not configured.")
    print("Strongly recommend registering and using an API Key for higher request quota.")
    print("Visit https://stackapps.com/apps/oauth/register to get a Key.")
    print("Running without a Key can easily trigger 429 Too Many Requests errors.")
    print("*"*60)
    # time.sleep(5)

TAG_TO_FETCH = 'nlp'
START_YEAR = 2008
CURRENT_YEAR = datetime.now().year
OUTPUT_FILENAME = f"{TAG_TO_FETCH}_stackoverflow_posts_collected.csv"
API_BASE_URL = "https://api.stackexchange.com/2.3"
API_SITE = "stackoverflow"
MAX_RETRIES_ON_429 = 3 
INITIAL_WAIT_ON_429 = 60 

# --- Helper functions ---

def decode_html_entities(text):
    """Decode HTML entities in text, e.g., &amp; -> &"""
    if isinstance(text, str):
        return html.unescape(text)
    return text

# --- Main data retrieval functions ---

def fetch_questions(tag, start_year, end_year, api_key=None):
    """Get question data for specified tag and year range, with retry logic"""
    all_questions = []
    question_ids = set()
    question_filter = 'withbody' 

    print(f"Starting to fetch questions tagged [{tag}] from {start_year} to {end_year}")

    for year in range(start_year, end_year + 1):
        print(f"\nFetching questions for year {year}...")
        from_timestamp = int(datetime(year, 1, 1).timestamp())
        to_timestamp = int(datetime(year, 12, 31, 23, 59, 59).timestamp())

        page = 1
        has_more = True
        retries_429 = 0 
        current_wait_429 = INITIAL_WAIT_ON_429

        while has_more:
            print(f"  Fetching page {page} for year {year} (Attempt {retries_429 + 1})...")
            params = {
                'order': 'desc', 'sort': 'creation', 'tagged': tag,
                'site': API_SITE, 'pagesize': 100, 'page': page,
                'filter': question_filter, 'fromdate': from_timestamp,
                'todate': to_timestamp
            }
            if api_key:
                params['key'] = api_key

            data = None 
            try:
                response = requests.get(f"{API_BASE_URL}/questions", params=params, timeout=45) # Longer timeout

                if response.status_code == 200:
                    retries_429 = 0 
                    current_wait_429 = INITIAL_WAIT_ON_429 
                    data = response.json()

                    if 'backoff' in data:
                        wait_time = data['backoff']
                        print(f"  API requested backoff: Sleeping for {wait_time} seconds.")
                        time.sleep(wait_time + 1)
                    else:
                        time.sleep(1.1) 

                elif response.status_code == 429:
                    print(f"  Error: Received status code 429 (Too Many Requests).")
                    if retries_429 < MAX_RETRIES_ON_429:
                        retries_429 += 1
                        print(f"  Waiting for {current_wait_429} seconds before retry ({retries_429}/{MAX_RETRIES_ON_429})...")
                        time.sleep(current_wait_429)
                        current_wait_429 *= 2 
                        continue 
                    else:
                        print(f"  Max retries ({MAX_RETRIES_ON_429}) reached for page {page}, year {year}. Stopping for this year.")
                        has_more = False 
                        data = None
                else:
                    print(f"  Error: Received status code {response.status_code}")
                    print(f"  Response content: {response.text[:500]}")
                    print(f"  Stopping fetch for year {year}.")
                    has_more = False 
                    data = None
                    time.sleep(5) 

                # --- Process successfully retrieved data ---
                if data is not None and has_more:
                    items = data.get('items', [])
                    if not items:
                        print(f"  No more items found for year {year} on page {page}.")
                        has_more = False
                    else:
                        new_items_count = 0
                        for item in items:
                            if item['question_id'] not in question_ids:
                               item['title'] = decode_html_entities(item.get('title'))
                               item['body'] = decode_html_entities(item.get('body'))
                               question_data = {
                                   'question_id': item.get('question_id'), 'title': item.get('title'),
                                   'body': item.get('body'), 'tags': item.get('tags', []),
                                   'accepted_answer_id': item.get('accepted_answer_id'),
                                   'creation_date': item.get('creation_date'),
                                   'view_count': item.get('view_count', 0), 'link': item.get('link'),
                                   'is_answered': item.get('is_answered', False)
                               }
                               all_questions.append(question_data)
                               question_ids.add(item['question_id'])
                               new_items_count += 1

                        print(f"  Collected {new_items_count} new items from page {page}. Total unique items: {len(all_questions)}")
                        has_more = data.get('has_more', False)
                        page += 1

                    if not has_more:
                         print(f"  No more pages for year {year}.")

                    if data.get('quota_remaining', 1) <= 1:
                        print("  Warning: Low API quota remaining. Sleeping for 60 seconds.")
                        time.sleep(60)

            except requests.exceptions.RequestException as e:
                print(f"  Network error fetching page {page} for year {year}: {e}")
                print("  Waiting for 60 seconds before potentially retrying...")
                time.sleep(60)
                # Simple handling: can choose to retry (continue) or abandon current year (break/has_more=False)
                # Here we choose to abandon the current year to simplify logic
                print(f"  Stopping fetch for year {year} due to network error.")
                has_more = False

        # Brief pause after completing a year
        print(f"Finished fetching for year {year}. Pausing briefly...")
        time.sleep(5)

    print(f"\nFinished fetching questions. Total unique questions collected: {len(all_questions)}")
    return all_questions


def fetch_answer_bodies(answer_ids, api_key=None):
    """Batch retrieve answer bodies based on answer ID list, with retry logic"""
    print(f"\nStarting to fetch bodies for {len(answer_ids)} accepted answers...")
    answer_bodies = {}
    ids_to_fetch = list(answer_ids)
    answer_filter = 'withbody'
    batch_size = 100

    for i in range(0, len(ids_to_fetch), batch_size):
        chunk = ids_to_fetch[i : i + batch_size]
        ids_str = ";".join(map(str, chunk))
        retries_429 = 0
        current_wait_429 = INITIAL_WAIT_ON_429
        current_batch_success = False

        while retries_429 <= MAX_RETRIES_ON_429 and not current_batch_success:
            print(f"  Fetching batch {i // batch_size + 1}/{(len(ids_to_fetch) + batch_size - 1) // batch_size} (IDs: {chunk[0]}...) (Attempt {retries_429 + 1})")
            params = {
                'site': API_SITE, 'filter': answer_filter,
                'pagesize': batch_size
            }
            if api_key:
                params['key'] = api_key

            data = None
            try:
                response = requests.get(f"{API_BASE_URL}/answers/{ids_str}", params=params, timeout=45)

                if response.status_code == 200:
                    current_batch_success = True 
                    data = response.json()
                    if 'backoff' in data:
                        wait_time = data['backoff']
                        print(f"  API requested backoff: Sleeping for {wait_time} seconds.")
                        time.sleep(wait_time + 1)
                    else:
                        time.sleep(1.1)

                elif response.status_code == 429:
                    print(f"  Error: Received status code 429 (Too Many Requests).")
                    retries_429 += 1
                    if retries_429 <= MAX_RETRIES_ON_429:
                        print(f"  Waiting for {current_wait_429} seconds before retry ({retries_429}/{MAX_RETRIES_ON_429})...")
                        time.sleep(current_wait_429)
                        current_wait_429 *= 2
                        # No need for continue, the while condition will handle retries
                    else:
                        print(f"  Max retries ({MAX_RETRIES_ON_429}) reached for answer batch starting with ID {chunk[0]}. Skipping batch.")
                        # Break out of inner while loop, process next batch
                        break
                else:
                    print(f"  Error: Received status code {response.status_code} fetching answers.")
                    print(f"  Response content: {response.text[:500]}")
                    print(f"  Skipping this batch of answers.")
                    # Break out of inner while loop
                    break

                # --- Process successfully retrieved data ---
                if data is not None and current_batch_success:
                    items = data.get('items', [])
                    if items:
                        fetched_count = 0
                        for item in items:
                            answer_bodies[item['answer_id']] = decode_html_entities(item.get('body'))
                            fetched_count +=1
                        print(f"  Fetched {fetched_count} answer bodies in this batch.")
                    else:
                        print("  No items returned for this batch of answer IDs.")

                    if data.get('quota_remaining', 1) <= 1:
                        print("  Warning: Low API quota remaining. Sleeping for 60 seconds.")
                        time.sleep(60)

            except requests.exceptions.RequestException as e:
                print(f"  Network error fetching answer batch starting with ID {chunk[0]}: {e}")
                print("  Waiting for 60 seconds before potentially retrying...")
                time.sleep(60)
                # Simple handling: retry (if retries_429 not at limit) or abandon
                retries_429 += 1
                if retries_429 > MAX_RETRIES_ON_429:
                    print(f"  Skipping batch due to repeated network errors.")
                    break # Break out of inner while loop

    print(f"\nFinished fetching answer bodies. Found bodies for {len(answer_bodies)} answers.")
    return answer_bodies


# --- Main program ---
if __name__ == "__main__":
    # 1. Get all questions
    questions = fetch_questions(TAG_TO_FETCH, START_YEAR, CURRENT_YEAR, STACK_API_KEY)

    if not questions:
        print("No questions collected. Exiting.")
        exit()

    print(f"\nTotal unique questions collected: {len(questions)}")
    if len(questions) < 20000:
        print(f"Warning: Collected {len(questions)} posts, which is less than the target of 20,000.")


    # 2. Extract accepted answer IDs that need body fetching
    accepted_answer_ids = set(q['accepted_answer_id'] for q in questions if q.get('accepted_answer_id'))
    print(f"Found {len(accepted_answer_ids)} unique accepted answer IDs to fetch.")

    # 3. Get answer bodies
    answer_bodies_map = fetch_answer_bodies(accepted_answer_ids, STACK_API_KEY) if accepted_answer_ids else {}

    # 4. Build DataFrame
    print("\nBuilding final DataFrame...")
    df = pd.DataFrame(questions)
    df['creation_date'] = pd.to_datetime(df['creation_date'], unit='s')
    df['accepted_answer_body'] = df['accepted_answer_id'].map(answer_bodies_map)

    missing_bodies_count = df['accepted_answer_id'].notna().sum() - df['accepted_answer_body'].notna().sum()
    if missing_bodies_count > 0:
         print(f"Info: Could not fetch bodies for {missing_bodies_count} accepted answers (may be deleted/inaccessible).")

    # 5. Select and organize final columns
    final_columns = [
        'question_id', 'title', 'body', 'tags', 'accepted_answer_id',
        'accepted_answer_body', 'creation_date', 'view_count', 'link', 'is_answered'
    ]
    final_columns = [col for col in final_columns if col in df.columns]
    df_final = df[final_columns]

    print("\nFinal DataFrame structure:")
    print(df_final.info())
    print("\nSample data:")
    print(df_final.head())

    # 6. Save to CSV
    try:
        df_final.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8')
        print(f"\nSuccessfully saved collected data to {OUTPUT_FILENAME}")
    except Exception as e:
        print(f"\nError saving data to CSV: {e}")

Starting to fetch questions tagged [nlp] from 2008 to 2025

Fetching questions for year 2008...
  Fetching page 1 for year 2008 (Attempt 1)...
  Collected 45 new items from page 1. Total unique items: 45
  No more pages for year 2008.
Finished fetching for year 2008. Pausing briefly...

Fetching questions for year 2009...
  Fetching page 1 for year 2009 (Attempt 1)...
  Collected 100 new items from page 1. Total unique items: 145
  Fetching page 2 for year 2009 (Attempt 1)...
  Collected 61 new items from page 2. Total unique items: 206
  No more pages for year 2009.
Finished fetching for year 2009. Pausing briefly...

Fetching questions for year 2010...
  Fetching page 1 for year 2010 (Attempt 1)...
  Collected 100 new items from page 1. Total unique items: 306
  Fetching page 2 for year 2010 (Attempt 1)...
  Collected 100 new items from page 2. Total unique items: 406
  Fetching page 3 for year 2010 (Attempt 1)...
  Collected 99 new items from page 3. Total unique items: 505
  No mor

## 2. Data Pre-processing

In [2]:
import nltk

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import html
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_data(csv_file):
    """
    Perform preprocessing on the dataset
    
    Parameters:
    csv_file (str): Path to the CSV file
    
    Returns:
    pandas.DataFrame: Preprocessed dataframe
    """
    print("Starting data preprocessing...")
    
    # Read the CSV file
    df = pd.read_csv(csv_file)
    print(f"Original dataset contains {df.shape[0]} rows and {df.shape[1]} columns")
    
    # 1. HTML decoding - decode HTML entities (e.g., &amp; -> &)
    print("Performing HTML decoding...")
    for col in ['title', 'body', 'accepted_answer_body']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: html.unescape(x) if isinstance(x, str) else x)
    
    # 2. Remove HTML tags - remove <p>, <code>, etc. tags
    print("Removing HTML tags...")
    def remove_html_tags(text):
        if not isinstance(text, str):
            return text
        return re.sub(r'<.*?>', ' ', text)
    
    for col in ['title', 'body', 'accepted_answer_body']:
        if col in df.columns:
            df[col] = df[col].apply(remove_html_tags)
    
    # 3. Convert to lowercase
    print("Converting to lowercase...")
    for col in ['title', 'body', 'accepted_answer_body']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
    
    # 4. Remove punctuation and special characters
    print("Removing punctuation and special characters...")
    def remove_punctuation(text):
        if not isinstance(text, str):
            return text
        # Create a translation table to map all punctuation to spaces
        translator = str.maketrans('', '', string.punctuation)
        # Apply the translation table
        return text.translate(translator)
    
    for col in ['title', 'body', 'accepted_answer_body']:
        if col in df.columns:
            df[col + '_clean'] = df[col].apply(remove_punctuation)
    
    # 5. Tokenization and stopword removal
    print("Performing tokenization and stopword removal...")
    stop_words = set(stopwords.words('english'))
    
    def tokenize_and_remove_stopwords(text):
        if not isinstance(text, str):
            return []
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        return filtered_tokens
    
    for col in ['title', 'body', 'accepted_answer_body']:
        if col in df.columns:
            df[col + '_tokens'] = df[col + '_clean'].apply(tokenize_and_remove_stopwords)
    
    # 6. Process tags column - if tags is a string representation of a list, convert to actual list
    print("Processing tags column...")
    if 'tags' in df.columns:
        df['tags_list'] = df['tags'].apply(lambda x: eval(x) if isinstance(x, str) and x.startswith('[') else x)
    
    print("Data preprocessing complete!")
    return df

# Use the function
if __name__ == "__main__":
    preprocessed_df = preprocess_data("nlp_stackoverflow_posts_collected.csv")
    # Save the preprocessed data
    preprocessed_df.to_csv("nlp_stackoverflow_posts_preprocessed.csv", index=False)
    print("Preprocessed data saved to 'nlp_stackoverflow_posts_preprocessed.csv'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting data preprocessing...
Original dataset contains 13649 rows and 10 columns
Performing HTML decoding...
Removing HTML tags...
Converting to lowercase...
Removing punctuation and special characters...
Performing tokenization and stopword removal...
Processing tags column...
Data preprocessing complete!
Preprocessed data saved to 'nlp_stackoverflow_posts_preprocessed.csv'


## 3.  Word Cloud

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from PIL import Image
import os

def create_wordcloud(csv_file, output_folder='visualizations'):
    """
    Generate a word cloud for post titles
    
    Parameters:
    csv_file (str): Path to the preprocessed CSV file
    output_folder (str): Output folder
    """
    print("Starting word cloud creation...")
    
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Read the preprocessed data
    df = pd.read_csv(csv_file)
    
    if 'title_tokens' in df.columns:
        text_column = 'title_tokens'
        # Join token lists into strings
        text_data = df[text_column].apply(lambda x: ' '.join(eval(x)) if isinstance(x, str) else '').str.cat(sep=' ')
    elif 'title_clean' in df.columns:
        text_column = 'title_clean'
        text_data = ' '.join(df[text_column].dropna().astype(str))
    else:
        text_column = 'title'
        text_data = ' '.join(df[text_column].dropna().astype(str))
    
    print(f"Using '{text_column}' column to generate word cloud...")
    
    
    # Create the word cloud
    wordcloud = WordCloud(
        width=800, 
        height=400,
        background_color='white',
        max_words=200,
        # mask=mask,  # Uncomment to use custom shape
        contour_width=1,
        contour_color='steelblue',
        colormap='viridis'
    ).generate(text_data)
    
    # Save the word cloud image
    wordcloud_path = os.path.join(output_folder, 'nlp_title_wordcloud.png')
    wordcloud.to_file(wordcloud_path)
    
    # Display the word cloud using matplotlib
    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title('Common Terms in Stack Overflow NLP Post Titles', fontsize=20)
    plt.tight_layout(pad=0)
    
    # Save the matplotlib image
    plt_path = os.path.join(output_folder, 'nlp_title_wordcloud_plt.png')
    plt.savefig(plt_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    print(f"Word cloud saved to '{wordcloud_path}' and '{plt_path}'")
    
    print("Creating frequency chart of the top 20 common words...")
    word_freq = {}
    for tokens in df[text_column]:
        if isinstance(tokens, str):
            try:
                token_list = eval(tokens) if tokens.startswith('[') else tokens.split()
                for word in token_list:
                    if len(word) > 2:  # Ignore words that are too short
                        word_freq[word] = word_freq.get(word, 0) + 1
            except:
                continue
    
    # Get the top 20 most common words
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]
    words = [item[0] for item in top_words]
    freqs = [item[1] for item in top_words]
    
    # Create a bar chart
    plt.figure(figsize=(12, 8))
    plt.barh(words, freqs, color='skyblue')
    plt.xlabel('Frequency', fontsize=12)
    plt.ylabel('Word', fontsize=12)
    plt.title('Top 20 Most Common Terms in Stack Overflow NLP Posts', fontsize=16)
    plt.tight_layout()
    
    # Save the frequency chart
    freq_path = os.path.join(output_folder, 'nlp_top_words_frequency.png')
    plt.savefig(freq_path, dpi=300, bbox_inches='tight')
    plt.close()
    
    print(f"Word frequency chart saved to '{freq_path}'")
    print("Word cloud visualization complete!")
    
    return wordcloud_path, freq_path

# Use the function
if __name__ == "__main__":
    create_wordcloud("nlp_stackoverflow_posts_preprocessed.csv")

Starting word cloud creation...
Using 'title_tokens' column to generate word cloud...
Word cloud saved to 'visualizations/nlp_title_wordcloud.png' and 'visualizations/nlp_title_wordcloud_plt.png'
Creating frequency chart of the top 20 common words...
Word frequency chart saved to 'visualizations/nlp_top_words_frequency.png'
Word cloud visualization complete!


## 4. Posts Categorization

In [5]:
import pandas as pd
import re
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import os
import json

def categorize_posts(csv_file, output_folder='categorization'):
    """
    Categorize NLP posts
    
    Parameters:
    csv_file (str): Path to the preprocessed CSV file
    output_folder (str): Output folder
    
    Returns:
    pandas.DataFrame: DataFrame containing categorization results
    """
    print("Starting post categorization...")
    
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Read the preprocessed data
    df = pd.read_csv(csv_file)
    print(f"Read {df.shape[0]} posts for categorization")
    
    # Initialize categorization columns
    df['category'] = None
    df['subcategory'] = None
    df['classification_reason'] = None
    
    # Define categorization rules
    
    # 1. Implementation Issues
    implementation_patterns = [
        r'\bhow to\b', r'\bhow do i\b', r'\bhow can i\b', r'\bhow would i\b',
        r'\bhow should i\b', r'\bimplementation\b', r'\bimplement\b'
    ]
    
    # 2. Understanding Issues
    understanding_patterns = [
        r'\bwhat is\b', r'\bwhat are\b', r'\bwhy\b', r'\bwhat does\b',
        r'\bmean\b', r'\bmeaning\b', r'\bdefinition\b', r'\bunderstand\b', r'\bexplain\b'
    ]
    
    # 3. Task-based categories
    nlp_tasks = {
        'text_similarity': [
            r'similar', r'similarity', r'distance', r'compare text', r'match', 
            r'semantic similarity', r'document similarity', r'text comparison'
        ],
        'tokenization': [
            r'tokeniz', r'token', r'split text', r'segment', r'word breaking',
            r'sentence boundary', r'sentence splitting'
        ],
        'stemming_lemmatization': [
            r'stem', r'lemma', r'root word', r'word form', r'inflect',
            r'morphological', r'normalization'
        ],
        'language_identification': [
            r'language detect', r'identify language', r'determine language',
            r'language recognition', r'language identification'
        ],
        'sentiment_analysis': [
            r'sentiment', r'opinion', r'emotion', r'feeling', r'polarity',
            r'positive negative', r'attitude', r'tone'
        ],
        'topic_modeling': [
            r'topic model', r'lda', r'latent dirichlet', r'topic extraction',
            r'theme', r'document clustering', r'topic classification'
        ],
        'entity_recognition': [
            r'ner', r'named entity', r'entity extraction', r'entity recognition',
            r'entity detect', r'information extraction'
        ],
        'text_summarization': [
            r'summar', r'extract key', r'condense', r'digest', r'abstract',
            r'text reduction', r'key information'
        ],
        'pos_tagging': [
            r'pos tag', r'part of speech', r'pos', r'syntactic category',
            r'grammatical tag', r'word class'
        ],
        'machine_translation': [
            r'translat', r'language conversion', r'cross-lingual', r'multilingual',
            r'language transfer'
        ]
    }
    
    # 4. Technology-based categories
    technologies = {
        'spacy': [r'spacy', r'spacyjs'],
        'nltk': [r'nltk', r'natural language toolkit'],
        'transformers': [
            r'transformer', r'bert', r'gpt', r'huggingface', r't5', 
            r'roberta', r'xlnet', r'distilbert'
        ],
        'word_embeddings': [
            r'embedding', r'word2vec', r'glove', r'fasttext', r'word vector',
            r'distributed representation', r'semantic vector'
        ],
        'tensorflow_keras': [r'tensorflow', r'keras', r'tf\.', r'tf2'],
        'pytorch': [r'pytorch', r'torch', r'nn\.'],
        'gensim': [r'gensim'],
        'stanford_nlp': [r'stanford', r'corenlp', r'stanford parser'],
        'openai': [r'openai', r'gpt-3', r'chatgpt', r'davinci']
    }
    
    # Apply categorization rules to each post
    categorized_count = 0
    
    for idx, row in df.iterrows():
        if idx % 1000 == 0:
            print(f"Processed {idx} posts...")
        
        # Get title and content
        title = str(row['title']).lower() if pd.notna(row['title']) else ""
        body = str(row['body']).lower() if pd.notna(row['body']) else ""
        
        # Combine text for categorization
        text = f"{title} {body}"
        
        # 1. Check if it's an implementation issue
        for pattern in implementation_patterns:
            if re.search(pattern, text):
                df.at[idx, 'category'] = 'implementation_issues'
                df.at[idx, 'classification_reason'] = f"Matched implementation pattern: {pattern}"
                
                for task, task_patterns in nlp_tasks.items():
                    for task_pattern in task_patterns:
                        if re.search(task_pattern, text):
                            df.at[idx, 'subcategory'] = task
                            break
                    if pd.notna(df.at[idx, 'subcategory']):
                        break
                        
                if pd.isna(df.at[idx, 'subcategory']):
                    for tech, tech_patterns in technologies.items():
                        for tech_pattern in tech_patterns:
                            if re.search(tech_pattern, text):
                                df.at[idx, 'subcategory'] = tech
                                break
                        if pd.notna(df.at[idx, 'subcategory']):
                            break
                
                categorized_count += 1
                break
        
        # 2. If not an implementation issue, check if it's an understanding issue
        if pd.isna(df.at[idx, 'category']):
            for pattern in understanding_patterns:
                if re.search(pattern, text):
                    df.at[idx, 'category'] = 'understanding_issues'
                    df.at[idx, 'classification_reason'] = f"Matched understanding pattern: {pattern}"
                    
                    # Check if there's a subcategory (task or technology)
                    for task, task_patterns in nlp_tasks.items():
                        for task_pattern in task_patterns:
                            if re.search(task_pattern, text):
                                df.at[idx, 'subcategory'] = task
                                break
                        if pd.notna(df.at[idx, 'subcategory']):
                            break
                            
                    if pd.isna(df.at[idx, 'subcategory']):
                        for tech, tech_patterns in technologies.items():
                            for tech_pattern in tech_patterns:
                                if re.search(tech_pattern, text):
                                    df.at[idx, 'subcategory'] = tech
                                    break
                            if pd.notna(df.at[idx, 'subcategory']):
                                break
                    
                    categorized_count += 1
                    break
        
        # 3. If none of the above, check task-related categories
        if pd.isna(df.at[idx, 'category']):
            for task, patterns in nlp_tasks.items():
                for pattern in patterns:
                    if re.search(pattern, text):
                        df.at[idx, 'category'] = 'nlp_task'
                        df.at[idx, 'subcategory'] = task
                        df.at[idx, 'classification_reason'] = f"Matched NLP task: {task}, pattern: {pattern}"
                        categorized_count += 1
                        break
                if pd.notna(df.at[idx, 'category']):
                    break
        
        # 4. If none of the above, check technology-related categories
        if pd.isna(df.at[idx, 'category']):
            for tech, patterns in technologies.items():
                for pattern in patterns:
                    if re.search(pattern, text):
                        df.at[idx, 'category'] = 'nlp_technology'
                        df.at[idx, 'subcategory'] = tech
                        df.at[idx, 'classification_reason'] = f"Matched technology term: {tech}, pattern: {pattern}"
                        categorized_count += 1
                        break
                if pd.notna(df.at[idx, 'category']):
                    break
    
    print(f"Categorization complete! Categorized a total of {categorized_count} posts")
    
    if categorized_count < 100:
        print(f"Warning: Only {categorized_count} posts were categorized, less than the required 100")
    
    # Count categories
    category_counts = df['category'].value_counts()
    subcategory_counts = df['subcategory'].value_counts()
    
    print("\nCategory distribution:")
    print(category_counts)
    print("\nSubcategory distribution:")
    print(subcategory_counts)
    
    categories_under_10 = [cat for cat, count in category_counts.items() if count < 10 and pd.notna(cat)]
    if categories_under_10:
        print(f"\nWarning: The following categories contain fewer than 10 posts: {categories_under_10}")
    
    # Save categorization results
    categorized_df = df[~df['category'].isna()]
    categorized_df.to_csv(os.path.join(output_folder, 'nlp_posts_categorized.csv'), index=False)
    print(f"Categorization results saved to '{os.path.join(output_folder, 'nlp_posts_categorized.csv')}'")
    
    # Create categorization visualizations
    # 1. Main categories pie chart
    plt.figure(figsize=(10, 8))
    category_counts.plot.pie(autopct='%1.1f%%', startangle=90, fontsize=12, figsize=(10, 8))
    plt.title('Distribution of Main Categories for NLP Posts', fontsize=16)
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, 'category_distribution_pie.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # 2. Subcategories bar chart
    plt.figure(figsize=(12, 10))
    subcategory_counts.plot.barh(fontsize=12, figsize=(12, 10))
    plt.title('Distribution of Subcategories for NLP Posts', fontsize=16)
    plt.xlabel('Number of Posts', fontsize=12)
    plt.ylabel('Subcategory', fontsize=12)
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, 'subcategory_distribution_bar.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Save categorization rules for reference
    classification_rules = {
        'implementation_patterns': implementation_patterns,
        'understanding_patterns': understanding_patterns,
        'nlp_tasks': nlp_tasks,
        'technologies': technologies
    }
    
    with open(os.path.join(output_folder, 'classification_rules.json'), 'w', encoding='utf-8') as f:
        json.dump(classification_rules, f, indent=4, ensure_ascii=False)
    
    print(f"Classification rules saved to '{os.path.join(output_folder, 'classification_rules.json')}'")
    print("Post categorization complete!")
    
    return df

if __name__ == "__main__":
    categorized_df = categorize_posts("nlp_stackoverflow_posts_preprocessed.csv")

Starting post categorization...
Read 13649 posts for categorization
Processed 0 posts...
Processed 1000 posts...
Processed 2000 posts...
Processed 3000 posts...
Processed 4000 posts...
Processed 5000 posts...
Processed 6000 posts...
Processed 7000 posts...
Processed 8000 posts...
Processed 9000 posts...
Processed 10000 posts...
Processed 11000 posts...
Processed 12000 posts...
Processed 13000 posts...
Categorization complete! Categorized a total of 12833 posts

Category distribution:
category
implementation_issues    6208
nlp_task                 3851
understanding_issues     2210
nlp_technology            564
Name: count, dtype: int64

Subcategory distribution:
subcategory
tokenization               3204
text_similarity            2832
entity_recognition         1407
pos_tagging                1105
stemming_lemmatization      686
sentiment_analysis          442
word_embeddings             355
transformers                334
spacy                       262
nltk                        2