In [1]:
"""
This script loads the processed MBTI data and performs two experiments:
1. Ontology-based features: Uses WordNet to extract hypernyms as semantic features, then trains Logistic Regression.
2. Topic modeling: Uses LDA to extract topic distributions as features, then trains Logistic Regression.
Evaluates both on the binary MBTI dimensions (IE, NS, FT, JP) using the test set.
"""

import logging
import os
from typing import List, Tuple

import joblib
import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ensure every NLTK resource this notebook relies on is available:
#   - wordnet / omw-1.4: hypernym lookups through nltk.corpus.wordnet
#   - punkt / punkt_tab: tokenizer models behind word_tokenize
#   - averaged_perceptron_tagger / averaged_perceptron_tagger_eng: model behind pos_tag
# Both the older and the newer resource ids are requested because the names
# changed between NLTK releases; an id the installed release does not know is
# reported by the downloader but does not raise.
for _nltk_resource in (
    'wordnet',
    'omw-1.4',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource, quiet=True)

def get_hypernyms(word: str, pos: str = 'n') -> List[str]:
    """
    Look up the direct (first-level) hypernyms of a word in WordNet.

    Args:
        word: Word to look up.
        pos: WordNet part-of-speech code forwarded to ``wn.synsets``
            (callers in this file pass 'n' or 'v').

    Returns:
        The de-duplicated lemma names of the direct hypernym synsets of every
        sense of ``word``, sorted alphabetically. Empty when WordNet has no
        synset for ``word``/``pos`` or none of its senses has a hypernym.
    """
    lemma_names = {
        lemma.name()
        for synset in wn.synsets(word, pos=pos)
        for hypernym in synset.hypernyms()
        for lemma in hypernym.lemmas()
    }
    # A set of str iterates in an order that depends on the interpreter's hash
    # seed; sorting makes the returned list identical from run to run.
    return sorted(lemma_names)

In [3]:
def ontology_features(texts: pd.Series, max_features: int = 1000) -> Tuple[np.ndarray, "CountVectorizer"]:
    """
    Extract ontology-based features using WordNet hypernyms.

    Each text is tokenized and POS-tagged; every token tagged as a noun or a
    verb is lowercased and replaced by its direct WordNet hypernyms. The unique
    hypernyms of a text form one 'semantic bag-of-words' document, and a
    CountVectorizer is fitted on those documents.

    Args:
        texts: One text per entry.
        max_features: Vocabulary size limit passed to CountVectorizer.

    Returns:
        A ``(features, vectorizer)`` pair: the dense count matrix with one row
        per text, and the fitted CountVectorizer so the same vocabulary can be
        applied to other hypernym documents.
    """
    # (lowercased word, WordNet POS) -> hypernym lemmas. The same tokens recur
    # across texts, so each pair is resolved through WordNet only once per call.
    hypernym_cache = {}
    hypernym_docs = []
    for text in tqdm(texts, desc="Extracting hypernyms"):
        doc_hypernyms = set()
        for word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
            if tag.startswith('N'):
                wn_pos = 'n'
            elif tag.startswith('V'):
                wn_pos = 'v'
            else:
                continue  # only nouns and verbs contribute hypernyms
            key = (word.lower(), wn_pos)
            if key not in hypernym_cache:
                hypernym_cache[key] = get_hypernyms(key[0], pos=wn_pos)
            doc_hypernyms.update(hypernym_cache[key])
        # Unique hypernyms per doc; sorted so the intermediate documents are
        # reproducible (the resulting counts do not depend on token order).
        hypernym_docs.append(' '.join(sorted(doc_hypernyms)))

    vectorizer = CountVectorizer(max_features=max_features)
    features = vectorizer.fit_transform(hypernym_docs).toarray()
    logger.info(f"Ontology features shape: {features.shape}")
    return features, vectorizer

In [4]:
def topic_modeling_features(texts: pd.Series, n_topics: int = 20, max_features: int = 5000,
                            random_state: int = 42
                            ) -> Tuple[np.ndarray, Tuple["TfidfVectorizer", "LatentDirichletAllocation"]]:
    """
    Extract per-document topic distributions using LDA.

    The texts are vectorized with TF-IDF and an LDA model is fitted on the
    resulting matrix.

    Args:
        texts: One text per entry.
        n_topics: Number of LDA components (columns of the returned matrix).
        max_features: Vocabulary size limit passed to TfidfVectorizer.
        random_state: Seed passed to LatentDirichletAllocation.

    Returns:
        A ``(features, (tfidf, lda))`` pair: the topic matrix returned by
        ``lda.fit_transform`` and the fitted vectorizer/LDA objects, so that
        other texts can be projected with ``tfidf.transform`` followed by
        ``lda.transform``.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)

    # NOTE(review): LDA is fitted on TF-IDF weights here, whereas LDA is usually
    # fitted on raw term counts (CountVectorizer) - confirm this is intended.
    features = lda.fit_transform(tfidf.fit_transform(texts))
    logger.info(f"LDA features shape: {features.shape}")
    return features, (tfidf, lda)



In [5]:
def train_evaluate_binary(X_train: np.ndarray, X_test: np.ndarray, train_df: pd.DataFrame, test_df: pd.DataFrame,
                          feature_type: str, use_smote: bool = True, model_dir: str = '../models') -> dict:
    """
    Train and evaluate one Logistic Regression per MBTI dimension.

    For each of 'IE', 'NS', 'FT' and 'JP' the matching column of ``train_df``
    is used as the training target and the matching column of ``test_df`` as
    the evaluation target. The fitted classifier is saved with joblib and its
    accuracy, weighted F1 and classification report are logged.

    Args:
        X_train: Training feature matrix, row-aligned with ``train_df``.
        X_test: Test feature matrix, row-aligned with ``test_df``.
        train_df: Frame holding the four dimension columns for training.
        test_df: Frame holding the four dimension columns for evaluation.
        feature_type: Label used in log lines and in saved model file names.
        use_smote: When True, oversample the training data with SMOTE before
            fitting; the test data is never resampled.
        model_dir: Directory the fitted models are written to; it is created
            if it does not exist yet.

    Returns:
        Mapping of dimension name to ``{'accuracy': ..., 'f1': ...}``.
    """
    # joblib.dump does not create missing parent directories, so make sure the
    # target directory exists before the first model is written.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    results = {}
    for dim in ['IE', 'NS', 'FT', 'JP']:
        y_train = train_df[dim]
        y_test = test_df[dim]

        if use_smote:
            smote = SMOTE(random_state=42)
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        else:
            X_train_res, y_train_res = X_train, y_train

        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train_res, y_train_res)

        # Persist the fitted model for this dimension / feature type
        model_path = os.path.join(model_dir, f'binary_{dim}_{feature_type}.pkl')
        joblib.dump(clf, model_path)

        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        results[dim] = {'accuracy': acc, 'f1': f1}

        logger.info(f"{feature_type} - {dim} - Accuracy: {acc:.4f}, F1: {f1:.4f}")
        logger.info(classification_report(y_test, y_pred))

    return results

In [6]:
def main():
    """
    Run both experiments end to end.

    Loads the processed train/test frames, builds ontology (WordNet hypernym)
    features and LDA topic features from the 'cleaned_posts' column, trains and
    evaluates the per-dimension classifiers for each feature set, and logs the
    two sets of results one after the other.
    """
    # Load the processed train/test splits
    train_df = pd.read_pickle('../data/processed/train.pkl')
    test_df = pd.read_pickle('../data/processed/test.pkl')
    logger.info(f"Loaded train ({len(train_df)}) and test ({len(test_df)}) data.")

    # Both feature extractors read the 'cleaned_posts' column
    X_train_text, X_test_text = train_df['cleaned_posts'], test_df['cleaned_posts']

    # 1. Ontology-based model
    logger.info("Generating ontology features...")
    train_ontology, ontology_vectorizer = ontology_features(X_train_text)

    # Build the test-side hypernym documents with the same noun/verb rule that
    # ontology_features applies, then reuse the vocabulary fitted on train.
    test_docs = []
    for post in tqdm(X_test_text, desc="Test hypernyms"):
        collected = [
            lemma
            for token, tag in pos_tag(word_tokenize(post))
            if tag.startswith(('N', 'V'))
            for lemma in get_hypernyms(token.lower(), pos='n' if tag.startswith('N') else 'v')
        ]
        test_docs.append(' '.join(set(collected)))
    test_ontology = ontology_vectorizer.transform(test_docs).toarray()

    ontology_results = train_evaluate_binary(train_ontology, test_ontology, train_df, test_df, "ontology")

    # 2. Topic modeling-based model
    logger.info("Generating topic modeling features...")
    train_topics, (topic_tfidf, topic_lda) = topic_modeling_features(X_train_text)
    test_topics = topic_lda.transform(topic_tfidf.transform(X_test_text))

    topic_results = train_evaluate_binary(train_topics, test_topics, train_df, test_df, "topics")

    # Compare results: one header line followed by one line per dimension
    summaries = (
        ("Ontology Results:", ontology_results),
        ("Topic Modeling Results:", topic_results),
    )
    for header, per_dim in summaries:
        logger.info(header)
        for dim, res in per_dim.items():
            logger.info(f"{dim}: Acc={res['accuracy']:.4f}, F1={res['f1']:.4f}")


if __name__ == "__main__":
    main()

2025-11-22 14:44:31,662 - INFO - Loaded train (6940) and test (1735) data.
2025-11-22 14:44:31,663 - INFO - Generating ontology features...
Extracting hypernyms: 100%|██████████| 6940/6940 [09:40<00:00, 11.95it/s]
2025-11-22 14:54:24,450 - INFO - Ontology features shape: (6940, 1000)
Test hypernyms: 100%|██████████| 1735/1735 [02:20<00:00, 12.38it/s]
2025-11-22 14:56:55,167 - INFO - ontology - IE - Accuracy: 0.6427, F1: 0.6497
2025-11-22 14:56:55,180 - INFO -               precision    recall  f1-score   support

           0       0.26      0.29      0.27       401
           1       0.78      0.75      0.76      1334

    accuracy                           0.64      1735
   macro avg       0.52      0.52      0.52      1735
weighted avg       0.66      0.64      0.65      1735

2025-11-22 14:57:02,857 - INFO - ontology - NS - Accuracy: 0.7389, F1: 0.7587
2025-11-22 14:57:02,871 - INFO -               precision    recall  f1-score   support

           0       0.20      0.29      0.24