In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from typing import List, Tuple
import os
import joblib
from tqdm import tqdm

In [None]:
# Read the pre-processed train/test splits.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
print("üìÇ Loading processed data...")
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/processed/train.pkl',
        '../data/processed/test.pkl',
    )
)

# Preprocessing variants to compare; each name is used as a column suffix
# (tokens_<variant>, Bigrams_<variant>, Trigrams_<variant>) further down.
variants = ['without_lemma', 'with_lemma', 'with_lemma_pos', 'with_dep_tree', 'with_chunking']

üìÇ Loading processed data...


In [None]:
# Function to prepare text from tokens/ngrams
def _as_items(seq) -> list:
    """Return ``seq`` as a plain list, or ``[]`` when it is not a usable sequence.

    Handles the cell values a pandas object column can hold: ``None``,
    ``float('nan')`` (missing value), lists, tuples and numpy arrays.
    """
    if seq is None:
        return []
    if isinstance(seq, str):
        # A bare string is a single token, not a sequence of characters.
        return [seq] if seq else []
    try:
        return list(seq)
    except TypeError:
        # Non-iterable scalars such as NaN.
        return []


def _render_item(item) -> str:
    """Turn one token / n-gram into a single whitespace-free TF-IDF term.

    Strings are returned unchanged; any other iterable (tuple, list, array row)
    has its members stringified and joined with ``_``; everything else yields ``''``.
    """
    if isinstance(item, str):
        # Must be checked first: '_'.join("abc") would give "a_b_c".
        return item
    try:
        return '_'.join(str(part) for part in item)
    except TypeError:
        return ''


def _join_items(items) -> str:
    """Render every item and join the non-empty results with single spaces."""
    return ' '.join(term for term in map(_render_item, items) if term)


def prepare_text(tokens: List, ngrams_b: List[Tuple], ngrams_t: List[Tuple], use_ngrams: bool = False) -> str:
    """
    Convert tokens and optionally ngrams to a single string for TF-IDF.

    Args:
        tokens: Sequence of tokens. Each token is either a string or a
            tuple/list of parts (joined with ``_``). The type is decided per
            token, so mixed sequences are handled.
        ngrams_b: Sequence of bigrams; each one is joined with ``_``.
        ngrams_t: Sequence of trigrams; each one is joined with ``_``.
        use_ngrams: When True, the bigram and trigram terms are appended
            after the token terms.

    Returns:
        The space-separated terms, without empty segments. An empty string is
        returned when ``tokens`` is empty or missing (``None`` / NaN), even if
        n-grams were supplied.
    """
    token_items = _as_items(tokens)
    if not token_items:
        return ''

    text = _join_items(token_items)

    if use_ngrams:
        sections = (
            text,
            _join_items(_as_items(ngrams_b)),
            _join_items(_as_items(ngrams_t)),
        )
        # Skip empty sections so a missing bigram list does not leave a double space.
        text = ' '.join(section for section in sections if section).strip()

    return text

In [None]:
# Containers for evaluation scores.
# results_multi:  variant -> metrics for the 16-type classifier
# results_binary: dimension -> variant -> metrics for each binary classifier
results_multi = dict()
results_binary = {'IE': {}, 'NS': {}, 'FT': {}, 'JP': {}}

In [None]:
# Loop over variants
RANDOM_STATE = 42  # fixed seed so repeated runs of the notebook fit the same models


def _variant_text(frame, variant):
    """Build the TF-IDF input string of every row of ``frame`` for one variant.

    Reads the ``tokens_<variant>``, ``Bigrams_<variant>`` and
    ``Trigrams_<variant>`` columns and returns a Series of strings produced by
    ``prepare_text(..., use_ngrams=True)``. Requires ``tqdm.pandas()`` to have
    been called so that ``progress_apply`` is available.
    """
    return frame.progress_apply(
        lambda row: prepare_text(
            row[f'tokens_{variant}'],
            row[f'Bigrams_{variant}'],
            row[f'Trigrams_{variant}'],
            use_ngrams=True,
        ),
        axis=1,
    )


def _build_pipeline():
    """Create an unfitted TF-IDF -> LinearSVC pipeline (shared by all models)."""
    # NOTE(review): ngram_range=(1, 3) is applied to text that already contains
    # the precomputed bigram/trigram terms (use_ngrams=True above) -- confirm
    # that this double n-gram expansion is intended.
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 3))),
        ('clf', LinearSVC(class_weight='balanced', max_iter=10000,
                          random_state=RANDOM_STATE)),  # SVM classifier
    ])


def _fit_and_evaluate(X_train, y_train, X_test, y_test, label):
    """Fit a fresh pipeline, print its test metrics and return the results.

    Args:
        X_train, y_train: Training texts and labels.
        X_test, y_test: Held-out texts and labels.
        label: Prefix used in the printed metrics line.

    Returns:
        ``(pipeline, y_pred, scores)`` where ``scores`` is
        ``{'accuracy': ..., 'f1': ...}`` (weighted F1).
    """
    pipeline = _build_pipeline()
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same value as the default for classes that are
    # never predicted, but without an UndefinedMetricWarning per call.
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"{label} - Accuracy: {acc:.4f}, F1: {f1:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    return pipeline, y_pred, {'accuracy': acc, 'f1': f1}


# Loop-invariant setup: register progress_apply and create the output directory once.
tqdm.pandas()
os.makedirs('models', exist_ok=True)

for var in variants:
    print(f"\nüîç Processing variant: {var}")

    # Prepare train and test texts (the 'text' column is overwritten per variant)
    print("üìù Preparing text features...")
    train_df['text'] = _variant_text(train_df, var)
    test_df['text'] = _variant_text(test_df, var)

    X_train = train_df['text']
    X_test = test_df['text']

    # Multi-class (16 types)
    print("üß† Training multi-class model with balanced class weights...")
    y_train_multi = train_df['type']
    y_test_multi = test_df['type']

    pipeline_multi, y_pred_multi, results_multi[var] = _fit_and_evaluate(
        X_train, y_train_multi, X_test, y_test_multi, label='Multi-class'
    )

    # Save model
    joblib.dump(pipeline_multi, f'models/multi_{var}.pkl')

    # Binary classifiers for each dimension
    for dim in ['IE', 'NS', 'FT', 'JP']:
        print(f"üß† Training binary model for {dim} with balanced class weights...")
        y_train_bin = train_df[dim]
        y_test_bin = test_df[dim]

        pipeline_bin, y_pred_bin, results_binary[dim][var] = _fit_and_evaluate(
            X_train, y_train_bin, X_test, y_test_bin, label=dim
        )

        # Save model
        joblib.dump(pipeline_bin, f'models/binary_{dim}_{var}.pkl')


üîç Processing variant: without_lemma
üìù Preparing text features...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6940/6940 [00:01<00:00, 3574.41it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1735/1735 [00:00<00:00, 4061.45it/s]


üß† Training multi-class model with balanced class weights...
Multi-class - Accuracy: 0.3476, F1: 0.3511
              precision    recall  f1-score   support

        ENFJ       0.24      0.29      0.26        38
        ENFP       0.29      0.32      0.30       135
        ENTJ       0.09      0.15      0.12        46
        ENTP       0.32      0.39      0.35       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.41      0.34      0.37       294
        INFP       0.50      0.44      0.47       366
        INTJ       0.35      0.33      0.34       218
        INTP       0.44      0.46      0.45       261
        ISFJ       0.07      0.09      0.08        33
        ISFP       0.13      0.15      0.14        54
        ISTJ       0.09      0.07      0.08        41
        ISTP       0.27      

: 

In [None]:
# Compare results: one summary line per preprocessing variant, first for the
# 16-type model and then for each binary dimension.
def _print_comparison(heading, scores_by_variant):
    """Print ``heading`` followed by the accuracy/F1 of every variant in the mapping."""
    print(heading)
    for var in scores_by_variant:
        res = scores_by_variant[var]
        print(f"{var}: Accuracy={res['accuracy']:.4f}, F1={res['f1']:.4f}")


_print_comparison("\nüìä Comparison of Multi-class Results:", results_multi)

for dim in ('IE', 'NS', 'FT', 'JP'):
    _print_comparison(f"\nüìä Comparison of Binary {dim} Results:", results_binary[dim])

print("‚úÖ Training and evaluation complete! Models saved in models/ directory.")

: 

: 