In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import os
import joblib
from tqdm import tqdm
import warnings
from gensim.models import Word2Vec, FastText
import logging

# Silence every warning for the rest of the session: no category or module
# filter is given, so this applies to all libraries used below.
# NOTE(review): this also hides warnings that may matter when interpreting
# results (e.g. solver or deprecation warnings) - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Configure the root logger at INFO with "timestamp - level - message" lines.
# Because the root logger is configured, INFO records from imported libraries
# are emitted too (the gensim vocabulary/training lines in the cell output).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Logger used by the functions in this notebook for progress messages.
logger = logging.getLogger(__name__)

In [2]:

# Function to average word embeddings for a list of tokens
def average_embedding(tokens, model, embedding_size=100):
    """
    Compute the mean embedding vector for a list of tokens.

    Tokens for which ``token in model.wv`` is false are skipped.

    Parameters
    ----------
    tokens : iterable of str, or None / NaN
        Tokens of one document. A missing cell (``None`` or float NaN, as
        pandas stores missing values) is treated as an empty document
        instead of raising ``TypeError``.
    model : object exposing ``.wv``
        ``model.wv`` must support ``token in wv`` and ``wv[token]``.
    embedding_size : int, default 100
        Length of the zero vector returned when no token has a vector.
        Only used when ``model.wv`` does not expose ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the found vectors, or a zero vector when
        none of the tokens has a vector.
    """
    keyed_vectors = model.wv

    # A missing token cell would otherwise fail in the loop below.
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        tokens = ()

    valid_embeddings = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if not valid_embeddings:
        # Prefer the model's own dimensionality so the fallback always has the
        # same length as real rows, even if the caller's embedding_size is
        # out of sync with the model; rows of differing length cannot be stacked.
        size = getattr(keyed_vectors, 'vector_size', embedding_size)
        return np.zeros(size)

    return np.mean(valid_embeddings, axis=0)


In [3]:

# Function to prepare embeddings for a dataframe column
def prepare_embeddings(df, variant, model, embedding_size=100):
    """
    Turn one token column of a dataframe into an array of averaged embeddings.

    Reads the column named ``tokens_<variant>`` from ``df``, computes the
    averaged embedding of each row's tokens with ``average_embedding`` and
    stacks the results, in row order, with ``np.array``. Progress is shown
    with a tqdm bar.
    """
    token_column = df[f'tokens_{variant}']
    row_vectors = [
        average_embedding(row_tokens, model, embedding_size)
        for row_tokens in tqdm(token_column)
    ]
    return np.array(row_vectors)


In [4]:
# Main training and evaluation function
def train_and_evaluate(embedding_type='word2vec', variant='with_lemma', embedding_size=100, epochs=10):
    """
    Train an embedding model and one binary classifier per dimension, then evaluate.

    Steps:
      1. Load the pickled train/test frames from ``../data/processed``.
      2. Train a Word2Vec or FastText model on the train tokens only and save it
         under ``../models`` (the directory is created if it is missing).
      3. Average the token vectors of every train and test row.
      4. For each of IE, NS, FT, JP: oversample the training set with SMOTE,
         fit a LogisticRegression, report accuracy / weighted F1 on the
         untouched test set and save the classifier.
      5. Score each classifier again on a SMOTE-resampled copy of the test set.
      6. Print both sets of metrics side by side.

    Parameters
    ----------
    embedding_type : str, default 'word2vec'
        'word2vec' or 'fasttext'.
    variant : str, default 'with_lemma'
        Suffix of the token column to use (``tokens_<variant>``).
    embedding_size : int, default 100
        Dimensionality of the trained embedding vectors.
    epochs : int, default 10
        Number of training epochs for the embedding model.

    Returns
    -------
    dict
        ``{'original': {dim: {'accuracy', 'f1'}}, 'balanced': {dim: {...}}}``
        so the metrics can be compared across runs instead of only printed.

    Raises
    ------
    ValueError
        If ``embedding_type`` is not 'word2vec' or 'fasttext'. Checked before
        any data is loaded so a typo fails immediately.
    """
    dimensions = ['IE', 'NS', 'FT', 'JP']
    model_dir = '../models'

    # Fail fast: validate the cheap argument before the slow data load.
    if embedding_type not in ('word2vec', 'fasttext'):
        raise ValueError("Unsupported embedding_type. Use 'word2vec' or 'fasttext'.")

    # Load data
    # NOTE(review): read_pickle can execute arbitrary code; only load files
    # produced by this project's own preprocessing.
    logger.info("üìÇ Loading processed data...")
    train_df = pd.read_pickle('../data/processed/train.pkl')
    test_df = pd.read_pickle('../data/processed/test.pkl')

    # Prepare sentences (list of token lists) for embedding training.
    # Only the training split is used, so the test split does not influence
    # the embedding model.
    sentences = train_df[f'tokens_{variant}'].tolist()

    # Train embedding model
    logger.info(f"üß† Training {embedding_type.upper()} model on {variant} tokens...")
    embedding_cls = Word2Vec if embedding_type == 'word2vec' else FastText
    model = embedding_cls(sentences, vector_size=embedding_size, window=5, min_count=1, workers=4, epochs=epochs)

    # Save embedding model; create the output directory first so a fresh
    # checkout does not fail after training has already completed.
    os.makedirs(model_dir, exist_ok=True)
    model_path = f'../models/{embedding_type}_{variant}.model'
    model.save(model_path)
    logger.info(f"‚úÖ {embedding_type.upper()} model saved to {model_path}")

    # Generate embeddings for train and test
    logger.info("üìù Generating embeddings for train...")
    X_train_emb = prepare_embeddings(train_df, variant, model, embedding_size)

    logger.info("üìù Generating embeddings for test...")
    X_test_emb = prepare_embeddings(test_df, variant, model, embedding_size)

    # Results storage
    results_binary = {dim: {} for dim in dimensions}
    results_binary_balanced = {dim: {} for dim in dimensions}
    # Fitted classifiers are kept in memory so the second evaluation pass
    # does not have to read them back from disk.
    classifiers = {}

    # Binary dimensions training
    for dim in dimensions:
        logger.info(f"üß† Training binary model for {dim} with SMOTE...")
        y_train_bin = train_df[dim]
        y_test_bin = test_df[dim]

        # Apply SMOTE to train only; the test set stays untouched here.
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train_emb, y_train_bin)

        # Train logistic regression
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train_res, y_train_res)
        classifiers[dim] = clf

        # Predict on original test
        y_pred_bin = clf.predict(X_test_emb)
        acc_bin = accuracy_score(y_test_bin, y_pred_bin)
        f1_bin = f1_score(y_test_bin, y_pred_bin, average='weighted')
        results_binary[dim] = {'accuracy': acc_bin, 'f1': f1_bin}

        print(f"{dim} - Accuracy: {acc_bin:.4f}, F1: {f1_bin:.4f}")
        print(classification_report(y_test_bin, y_pred_bin))

        # Save model
        clf_path = f'../models/binary_{dim}_{embedding_type}_{variant}.pkl'
        joblib.dump(clf, clf_path)

    # Balanced test evaluation
    # NOTE(review): SMOTE adds synthetic minority samples, so these scores are
    # measured partly on generated points and are not a held-out estimate;
    # the "original test" numbers above are the ones to report.
    logger.info("\nüîç Evaluating on SMOTE-balanced test...")
    for dim in dimensions:
        y_test_bin = test_df[dim]
        clf = classifiers[dim]

        # Apply SMOTE to test embeddings
        smote_test = SMOTE(random_state=42)
        X_test_res, y_test_res = smote_test.fit_resample(X_test_emb, y_test_bin)

        # Predict on balanced test
        y_pred_bin = clf.predict(X_test_res)

        acc_bin = accuracy_score(y_test_res, y_pred_bin)
        f1_bin = f1_score(y_test_res, y_pred_bin, average='weighted')

        results_binary_balanced[dim] = {'accuracy': acc_bin, 'f1': f1_bin}
        print(f"{dim} - Accuracy (on SMOTE-balanced test): {acc_bin:.4f}, F1: {f1_bin:.4f}")
        print(classification_report(y_test_res, y_pred_bin))

    # Compare results
    for dim in dimensions:
        print(f"\nüìä Binary {dim} Results:")
        print(f"Original Test: Accuracy={results_binary[dim]['accuracy']:.4f}, F1={results_binary[dim]['f1']:.4f}")
        print(f"Balanced Test: Accuracy={results_binary_balanced[dim]['accuracy']:.4f}, F1={results_binary_balanced[dim]['f1']:.4f}")

    logger.info("‚úÖ Training and evaluation complete!")

    return {'original': results_binary, 'balanced': results_binary_balanced}


In [5]:
if __name__ == "__main__":
    # Run the identical pipeline once per embedding backend,
    # Word2Vec first and FastText second.
    for backend in ('word2vec', 'fasttext'):
        train_and_evaluate(embedding_type=backend, variant='with_lemma', embedding_size=100, epochs=10)

2025-11-29 09:36:16,624 - INFO - üìÇ Loading processed data...
2025-11-29 09:36:36,847 - INFO - üß† Training WORD2VEC model on with_lemma tokens...
2025-11-29 09:36:36,901 - INFO - collecting all words and their counts
2025-11-29 09:36:36,903 - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-11-29 09:36:37,996 - INFO - collected 82324 word types from a corpus of 4133474 raw words and 6940 sentences
2025-11-29 09:36:37,997 - INFO - Creating a fresh vocabulary
2025-11-29 09:36:38,392 - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 82324 unique words (100.00% of original 82324, drops 0)', 'datetime': '2025-11-29T09:36:38.391933', 'gensim': '4.4.0', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:28:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2025-11-29 09:36:38,395 - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 4133474 word corpus (100.

IE - Accuracy: 0.6870, F1: 0.7091
              precision    recall  f1-score   support

           0       0.40      0.67      0.50       401
           1       0.87      0.69      0.77      1334

    accuracy                           0.69      1735
   macro avg       0.64      0.68      0.64      1735
weighted avg       0.76      0.69      0.71      1735



2025-11-29 09:37:25,591 - INFO - üß† Training binary model for FT with SMOTE...


NS - Accuracy: 0.6824, F1: 0.7298
              precision    recall  f1-score   support

           0       0.25      0.66      0.37       240
           1       0.93      0.69      0.79      1495

    accuracy                           0.68      1735
   macro avg       0.59      0.67      0.58      1735
weighted avg       0.83      0.68      0.73      1735



2025-11-29 09:37:25,811 - INFO - üß† Training binary model for JP with SMOTE...


FT - Accuracy: 0.7550, F1: 0.7554
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       796
           1       0.79      0.75      0.77       939

    accuracy                           0.76      1735
   macro avg       0.75      0.76      0.75      1735
weighted avg       0.76      0.76      0.76      1735

JP - Accuracy: 0.6104, F1: 0.6145


2025-11-29 09:37:26,003 - INFO - 
üîç Evaluating on SMOTE-balanced test...


              precision    recall  f1-score   support

           0       0.71      0.61      0.65      1048
           1       0.51      0.61      0.56       687

    accuracy                           0.61      1735
   macro avg       0.61      0.61      0.60      1735
weighted avg       0.63      0.61      0.61      1735

IE - Accuracy (on SMOTE-balanced test): 0.6960, F1: 0.6960
              precision    recall  f1-score   support

           0       0.69      0.70      0.70      1334
           1       0.70      0.69      0.69      1334

    accuracy                           0.70      2668
   macro avg       0.70      0.70      0.70      2668
weighted avg       0.70      0.70      0.70      2668

NS - Accuracy (on SMOTE-balanced test): 0.6729, F1: 0.6729
              precision    recall  f1-score   support

           0       0.68      0.66      0.67      1495
           1       0.67      0.69      0.68      1495

    accuracy                           0.67      2990
   macro a

2025-11-29 09:37:26,229 - INFO - ‚úÖ Training and evaluation complete!


JP - Accuracy (on SMOTE-balanced test): 0.6169, F1: 0.6169
              precision    recall  f1-score   support

           0       0.62      0.61      0.61      1048
           1       0.61      0.63      0.62      1048

    accuracy                           0.62      2096
   macro avg       0.62      0.62      0.62      2096
weighted avg       0.62      0.62      0.62      2096


üìä Binary IE Results:
Original Test: Accuracy=0.6870, F1=0.7091
Balanced Test: Accuracy=0.6960, F1=0.6960

üìä Binary NS Results:
Original Test: Accuracy=0.6824, F1=0.7298
Balanced Test: Accuracy=0.6729, F1=0.6729

üìä Binary FT Results:
Original Test: Accuracy=0.7550, F1=0.7554
Balanced Test: Accuracy=0.7614, F1=0.7614

üìä Binary JP Results:
Original Test: Accuracy=0.6104, F1=0.6145
Balanced Test: Accuracy=0.6169, F1=0.6169


2025-11-29 09:37:33,826 - INFO - üìÇ Loading processed data...
2025-11-29 09:37:57,574 - INFO - üß† Training FASTTEXT model on with_lemma tokens...
2025-11-29 09:37:57,579 - INFO - collecting all words and their counts
2025-11-29 09:37:57,579 - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-11-29 09:37:58,676 - INFO - collected 82324 word types from a corpus of 4133474 raw words and 6940 sentences
2025-11-29 09:37:58,677 - INFO - Creating a fresh vocabulary
2025-11-29 09:37:59,134 - INFO - FastText lifecycle event {'msg': 'effective_min_count=1 retains 82324 unique words (100.00% of original 82324, drops 0)', 'datetime': '2025-11-29T09:37:59.134285', 'gensim': '4.4.0', 'python': '3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:28:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2025-11-29 09:37:59,135 - INFO - FastText lifecycle event {'msg': 'effective_min_count=1 leaves 4133474 word corpus (100.

IE - Accuracy: 0.6755, F1: 0.6990
              precision    recall  f1-score   support

           0       0.38      0.67      0.49       401
           1       0.87      0.68      0.76      1334

    accuracy                           0.68      1735
   macro avg       0.63      0.67      0.63      1735
weighted avg       0.76      0.68      0.70      1735



2025-11-29 09:40:20,335 - INFO - üß† Training binary model for FT with SMOTE...


NS - Accuracy: 0.6795, F1: 0.7271
              precision    recall  f1-score   support

           0       0.25      0.64      0.36       240
           1       0.92      0.69      0.79      1495

    accuracy                           0.68      1735
   macro avg       0.58      0.66      0.57      1735
weighted avg       0.83      0.68      0.73      1735



2025-11-29 09:40:20,660 - INFO - üß† Training binary model for JP with SMOTE...


FT - Accuracy: 0.7493, F1: 0.7497
              precision    recall  f1-score   support

           0       0.71      0.76      0.73       796
           1       0.78      0.74      0.76       939

    accuracy                           0.75      1735
   macro avg       0.75      0.75      0.75      1735
weighted avg       0.75      0.75      0.75      1735



2025-11-29 09:40:20,879 - INFO - 
üîç Evaluating on SMOTE-balanced test...
2025-11-29 09:40:21,019 - INFO - ‚úÖ Training and evaluation complete!


JP - Accuracy: 0.6023, F1: 0.6063
              precision    recall  f1-score   support

           0       0.69      0.61      0.65      1048
           1       0.50      0.59      0.54       687

    accuracy                           0.60      1735
   macro avg       0.60      0.60      0.59      1735
weighted avg       0.62      0.60      0.61      1735

IE - Accuracy (on SMOTE-balanced test): 0.6822, F1: 0.6822
              precision    recall  f1-score   support

           0       0.68      0.69      0.68      1334
           1       0.68      0.68      0.68      1334

    accuracy                           0.68      2668
   macro avg       0.68      0.68      0.68      2668
weighted avg       0.68      0.68      0.68      2668

NS - Accuracy (on SMOTE-balanced test): 0.6722, F1: 0.6722
              precision    recall  f1-score   support

           0       0.68      0.66      0.67      1495
           1       0.67      0.69      0.68      1495

    accuracy                  