In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from typing import List, Tuple
import os
import joblib
from tqdm import tqdm

In [2]:
# Read the processed train and test frames from their pickle files
print("üìÇ Loading processed data...")
train_df, test_df = (
    pd.read_pickle('../data/processed/train.pkl'),
    pd.read_pickle('../data/processed/test.pkl'),
)

# Names of the preprocessing variants to compare; each one is used further
# down as the suffix of the token / bigram / trigram column names
variants = ['without_lemma', 'with_lemma', 'with_lemma_pos', 'with_dep_tree', 'with_chunking']

📂 Loading processed data...


In [3]:
# Helpers and entry point that turn token / n-gram sequences into TF-IDF input text
def _is_missing(seq) -> bool:
    """Return True when ``seq`` is None, has no length (e.g. a NaN cell), or is empty.

    Uses ``len`` instead of truthiness so that numpy arrays, whose truth value
    is ambiguous, are handled without raising.
    """
    if seq is None:
        return True
    try:
        return len(seq) == 0
    except TypeError:
        # Scalars such as float('nan') carry no tokens at all
        return True


def _render_item(item) -> str:
    """Render a single token or n-gram as one underscore-joined string.

    Strings are returned unchanged; any other iterable (tuple, list, array)
    has its members rendered recursively and joined with ``_``; everything
    else is converted with ``str``.
    """
    if isinstance(item, str):
        return item
    try:
        members = iter(item)
    except TypeError:
        return str(item)
    return '_'.join(_render_item(member) for member in members)


def prepare_text(tokens: List, ngrams_b: List[Tuple], ngrams_t: List[Tuple], use_ngrams: bool = False) -> str:
    """
    Convert tokens and, optionally, n-grams into one string for TF-IDF.

    Args:
        tokens: Sequence of tokens. Each token may be a string or a
            tuple/list of parts (joined with ``_``). The format is resolved
            per token, so mixed sequences are accepted.
        ngrams_b: Sequence of bigrams; each is joined with ``_``.
        ngrams_t: Sequence of trigrams; each is joined with ``_``.
        use_ngrams: When True, append the rendered bigrams and trigrams
            after the token text.

    Returns:
        Space-separated feature string. An empty string is returned whenever
        ``tokens`` is missing or empty, even if n-grams were supplied.
    """
    if _is_missing(tokens):
        return ''

    text = ' '.join(_render_item(token) for token in tokens)
    if not use_ngrams:
        return text

    pieces = [text]
    for grams in (ngrams_b, ngrams_t):
        if not _is_missing(grams):
            pieces.append(' '.join(_render_item(gram) for gram in grams))

    # Skip empty pieces so a missing bigram list does not leave a double space
    return ' '.join(piece for piece in pieces if piece).strip()

In [4]:
# Metric containers filled in by the training loop:
# multi-class scores are keyed by variant, binary scores by dimension and then variant
results_multi = dict()
results_binary = dict(IE={}, NS={}, FT={}, JP={})

In [5]:
# Train and evaluate, for every preprocessing variant, one multi-class model
# on `type` and one binary model per dimension (IE, NS, FT, JP).

# One-time setup hoisted out of the loop: register `progress_apply` on pandas
# objects and make sure the directory for saved pipelines exists.
tqdm.pandas()
os.makedirs('models', exist_ok=True)


def _variant_texts(df, var):
    """Build one TF-IDF input string per row of `df` from the columns of variant `var`.

    Reads the `tokens_<var>`, `Bigrams_<var>` and `Trigrams_<var>` columns and
    passes them to `prepare_text` with n-grams enabled.
    """
    return df.progress_apply(
        lambda row: prepare_text(
            row[f'tokens_{var}'], row[f'Bigrams_{var}'], row[f'Trigrams_{var}'], use_ngrams=True
        ),
        axis=1,
    )


def _fit_and_score(X_train, y_train, X_test, y_test, label, model_path):
    """Fit a TF-IDF + logistic-regression pipeline, print test metrics and save it.

    Args:
        X_train, y_train: Training texts and labels.
        X_test, y_test: Held-out texts and labels used for the printed metrics.
        label: Prefix of the printed accuracy/F1 line.
        model_path: Destination passed to `joblib.dump` for the fitted pipeline.

    Returns:
        Tuple `(pipeline, y_pred, accuracy, weighted_f1)`.
    """
    # `multi_class` is intentionally not passed: it is deprecated in recent
    # scikit-learn releases, and with the lbfgs solver a multinomial loss is
    # already what is used for multi-class targets.
    # NOTE(review): the input text already contains pre-joined bigrams/trigrams,
    # and ngram_range=(1, 3) builds further n-grams on top of them - confirm
    # that this double expansion is intended.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 3))),
        ('clf', LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')),
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the score used for never-predicted classes (0)
    # while silencing the undefined-metric warning
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"{label} - Accuracy: {acc:.4f}, F1: {f1:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    joblib.dump(pipeline, model_path)
    return pipeline, y_pred, acc, f1


for var in variants:
    print(f"\nüîç Processing variant: {var}")

    # Prepare train and test texts (the `text` column is overwritten per variant)
    print("üìù Preparing text features...")
    train_df['text'] = _variant_texts(train_df, var)
    test_df['text'] = _variant_texts(test_df, var)

    X_train = train_df['text']
    X_test = test_df['text']

    # Multi-class (16 types)
    print("üß† Training multi-class model with balanced class weights...")
    y_train_multi = train_df['type']
    y_test_multi = test_df['type']

    pipeline_multi, y_pred_multi, acc_multi, f1_multi = _fit_and_score(
        X_train, y_train_multi, X_test, y_test_multi,
        label='Multi-class', model_path=f'models/multi_{var}.pkl',
    )
    results_multi[var] = {'accuracy': acc_multi, 'f1': f1_multi}

    # Binary classifiers for each dimension
    for dim in ['IE', 'NS', 'FT', 'JP']:
        print(f"üß† Training binary model for {dim} with balanced class weights...")
        y_train_bin = train_df[dim]
        y_test_bin = test_df[dim]

        pipeline_bin, y_pred_bin, acc_bin, f1_bin = _fit_and_score(
            X_train, y_train_bin, X_test, y_test_bin,
            label=dim, model_path=f'models/binary_{dim}_{var}.pkl',
        )
        results_binary[dim][var] = {'accuracy': acc_bin, 'f1': f1_bin}


🔍 Processing variant: without_lemma
📝 Preparing text features...


100%|██████████| 6940/6940 [00:02<00:00, 2337.82it/s]
100%|██████████| 1735/1735 [00:00<00:00, 3589.66it/s]


🧠 Training multi-class model with balanced class weights...




Multi-class - Accuracy: 0.4732, F1: 0.4866
              precision    recall  f1-score   support

        ENFJ       0.24      0.45      0.31        38
        ENFP       0.41      0.42      0.41       135
        ENTJ       0.22      0.43      0.29        46
        ENTP       0.50      0.47      0.49       137
        ESFJ       0.33      0.22      0.27         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.20      0.12      0.15         8
        ESTP       0.11      0.11      0.11        18
        INFJ       0.64      0.41      0.50       294
        INFP       0.65      0.55      0.59       366
        INTJ       0.52      0.44      0.48       218
        INTP       0.60      0.57      0.59       261
        ISFJ       0.27      0.42      0.33        33
        ISFP       0.23      0.44      0.30        54
        ISTJ       0.20      0.32      0.25        41
        ISTP       0.39      0.60      0.47        67

    accuracy                         

100%|██████████| 6940/6940 [00:03<00:00, 2209.52it/s]
100%|██████████| 1735/1735 [00:00<00:00, 2151.45it/s]


🧠 Training multi-class model with balanced class weights...




Multi-class - Accuracy: 0.4749, F1: 0.4887
              precision    recall  f1-score   support

        ENFJ       0.23      0.42      0.30        38
        ENFP       0.40      0.44      0.42       135
        ENTJ       0.24      0.46      0.31        46
        ENTP       0.49      0.49      0.49       137
        ESFJ       0.29      0.22      0.25         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.25      0.12      0.17         8
        ESTP       0.19      0.17      0.18        18
        INFJ       0.65      0.41      0.50       294
        INFP       0.66      0.57      0.61       366
        INTJ       0.53      0.43      0.47       218
        INTP       0.61      0.57      0.59       261
        ISFJ       0.24      0.36      0.29        33
        ISFP       0.21      0.43      0.28        54
        ISTJ       0.18      0.27      0.21        41
        ISTP       0.40      0.58      0.47        67

    accuracy                         

100%|██████████| 6940/6940 [00:04<00:00, 1436.53it/s]
100%|██████████| 1735/1735 [00:01<00:00, 1478.74it/s]


🧠 Training multi-class model with balanced class weights...




Multi-class - Accuracy: 0.4703, F1: 0.4820
              precision    recall  f1-score   support

        ENFJ       0.24      0.45      0.31        38
        ENFP       0.38      0.42      0.40       135
        ENTJ       0.23      0.48      0.31        46
        ENTP       0.49      0.45      0.47       137
        ESFJ       0.33      0.22      0.27         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.20      0.12      0.15         8
        ESTP       0.15      0.11      0.13        18
        INFJ       0.64      0.40      0.50       294
        INFP       0.63      0.55      0.59       366
        INTJ       0.52      0.44      0.48       218
        INTP       0.60      0.55      0.57       261
        ISFJ       0.27      0.42      0.33        33
        ISFP       0.25      0.48      0.33        54
        ISTJ       0.19      0.29      0.23        41
        ISTP       0.39      0.61      0.47        67

    accuracy                         

100%|██████████| 6940/6940 [00:03<00:00, 2216.17it/s]
100%|██████████| 1735/1735 [00:01<00:00, 1723.96it/s]


🧠 Training multi-class model with balanced class weights...




Multi-class - Accuracy: 0.3585, F1: 0.3836
              precision    recall  f1-score   support

        ENFJ       0.18      0.47      0.26        38
        ENFP       0.37      0.36      0.36       135
        ENTJ       0.12      0.37      0.18        46
        ENTP       0.43      0.36      0.39       137
        ESFJ       0.14      0.22      0.17         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.17      0.12      0.14         8
        ESTP       0.05      0.11      0.07        18
        INFJ       0.58      0.32      0.41       294
        INFP       0.63      0.42      0.50       366
        INTJ       0.41      0.27      0.32       218
        INTP       0.49      0.42      0.45       261
        ISFJ       0.14      0.30      0.19        33
        ISFP       0.17      0.35      0.23        54
        ISTJ       0.13      0.27      0.17        41
        ISTP       0.26      0.43      0.32        67

    accuracy                         

100%|██████████| 6940/6940 [00:03<00:00, 1962.59it/s]
100%|██████████| 1735/1735 [00:00<00:00, 1760.68it/s]


🧠 Training multi-class model with balanced class weights...




Multi-class - Accuracy: 0.3804, F1: 0.3979
              precision    recall  f1-score   support

        ENFJ       0.17      0.42      0.25        38
        ENFP       0.30      0.30      0.30       135
        ENTJ       0.15      0.37      0.21        46
        ENTP       0.40      0.37      0.39       137
        ESFJ       0.17      0.33      0.22         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.50      0.25      0.33         8
        ESTP       0.10      0.17      0.12        18
        INFJ       0.54      0.31      0.39       294
        INFP       0.59      0.41      0.48       366
        INTJ       0.45      0.37      0.41       218
        INTP       0.53      0.48      0.50       261
        ISFJ       0.19      0.39      0.26        33
        ISFP       0.20      0.43      0.27        54
        ISTJ       0.20      0.29      0.24        41
        ISTP       0.33      0.48      0.39        67

    accuracy                         

In [6]:
# Summarise the collected metrics: one section for the multi-class model,
# followed by one section per binary dimension
summary_sections = [("\nüìä Comparison of Multi-class Results:", results_multi)]
summary_sections.extend(
    (f"\nüìä Comparison of Binary {dim} Results:", results_binary[dim])
    for dim in ['IE', 'NS', 'FT', 'JP']
)

for heading, per_variant in summary_sections:
    print(heading)
    for var, res in per_variant.items():
        print(f"{var}: Accuracy={res['accuracy']:.4f}, F1={res['f1']:.4f}")

print("‚úÖ Training and evaluation complete! Models saved in models/ directory.")


📊 Comparison of Multi-class Results:
without_lemma: Accuracy=0.4732, F1=0.4866
with_lemma: Accuracy=0.4749, F1=0.4887
with_lemma_pos: Accuracy=0.4703, F1=0.4820
with_dep_tree: Accuracy=0.3585, F1=0.3836
with_chunking: Accuracy=0.3804, F1=0.3979

📊 Comparison of Binary IE Results:
without_lemma: Accuracy=0.7833, F1=0.7917
with_lemma: Accuracy=0.7741, F1=0.7832
with_lemma_pos: Accuracy=0.7712, F1=0.7804
with_dep_tree: Accuracy=0.7366, F1=0.7508
with_chunking: Accuracy=0.7280, F1=0.7412

📊 Comparison of Binary NS Results:
without_lemma: Accuracy=0.8092, F1=0.8263
with_lemma: Accuracy=0.8069, F1=0.8237
with_lemma_pos: Accuracy=0.8092, F1=0.8267
with_dep_tree: Accuracy=0.7810, F1=0.8040
with_chunking: Accuracy=0.7793, F1=0.8010

📊 Comparison of Binary FT Results:
without_lemma: Accuracy=0.8058, F1=0.8060
with_lemma: Accuracy=0.8017, F1=0.8020
with_lemma_pos: Accuracy=0.8012, F1=0.8015
with_dep_tree: Accuracy=0.7741, F1=0.7744
with_chunking: Accuracy=0.7700, F1=0.7704

üìä Comp