In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import os
import joblib
from tqdm import tqdm
import warnings
from gensim.models import Word2Vec, FastText
import logging

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# used below (gensim / sklearn / imblearn) — confirm that is intended.
warnings.filterwarnings('ignore')
# Configure root logging at INFO level with "timestamp - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used for progress messages by the functions below.
logger = logging.getLogger(__name__)

In [None]:

# Function to average word embeddings for a list of tokens
def average_embedding(tokens, model, embedding_size=100):
    """
    Compute the mean embedding vector for a list of tokens.

    Parameters
    ----------
    tokens : iterable
        Tokens of a single document.
    model : object exposing ``.wv``
        Embedding model; ``model.wv`` must support ``token in model.wv`` and
        ``model.wv[token]``.
    embedding_size : int, default 100
        Fallback length of the zero vector. Only used when ``model.wv`` does
        not expose a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        1-D array holding the element-wise mean of the vectors of all tokens
        for which ``token in model.wv`` is true, or an all-zero vector when no
        token passes that test.

    Notes
    -----
    NOTE(review): for gensim FastText models the membership test may succeed
    for out-of-vocabulary tokens as well (vectors are built from character
    n-grams), so such tokens would be averaged rather than skipped — confirm
    against the gensim version in use.
    """
    keyed_vectors = model.wv
    found = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not found:
        # Size the fallback from the model itself so that it always lines up
        # with the non-empty rows; a caller-supplied embedding_size that
        # disagrees with the model would otherwise yield rows of different
        # lengths that cannot be stacked into one 2-D feature matrix.
        size = getattr(keyed_vectors, 'vector_size', embedding_size)
        return np.zeros(size)
    return np.mean(found, axis=0)


In [None]:

# Function to prepare embeddings for a dataframe column
def prepare_embeddings(df, variant, model, embedding_size=100):
    """
    Build one averaged embedding per row of ``df``.

    Reads the ``tokens_<variant>`` column, passes each entry to
    ``average_embedding`` together with ``model`` and ``embedding_size``, and
    stacks the resulting vectors into a single NumPy array (one row per
    dataframe row). Progress is displayed with ``tqdm``.
    """
    token_column = df[f'tokens_{variant}']
    return np.array([
        average_embedding(row_tokens, model, embedding_size)
        for row_tokens in tqdm(token_column)
    ])


In [None]:
# Main training and evaluation function
def train_and_evaluate(embedding_type='word2vec', variant='with_lemma', embedding_size=100, epochs=10):
    """
    Train an embedding model plus one binary classifier per dimension and
    evaluate them.

    Pipeline:
      1. Load ``../data/processed/train.pkl`` and ``../data/processed/test.pkl``.
      2. Train a Word2Vec or FastText model on the train ``tokens_<variant>``
         column and save it to ``../models/<embedding_type>_<variant>.model``.
      3. Turn every train/test row into an averaged embedding.
      4. For each of the columns ``IE``, ``NS``, ``FT``, ``JP``: oversample the
         training set with SMOTE, fit a logistic regression, score it on the
         untouched test set and save it to
         ``../models/binary_<dim>_<embedding_type>_<variant>.pkl``.
      5. Score each classifier again on a SMOTE-resampled copy of the test set.
      6. Print a side-by-side summary.

    Parameters
    ----------
    embedding_type : {'word2vec', 'fasttext'}
        Which gensim model to train.
    variant : str
        Suffix of the token column to use (``tokens_<variant>``).
    embedding_size : int
        Dimensionality of the trained embedding vectors.
    epochs : int
        Number of training epochs for the embedding model.

    Returns
    -------
    dict
        ``{'original': {...}, 'balanced': {...}}`` where each inner dict maps a
        dimension name to ``{'accuracy': float, 'f1': float}`` (weighted F1).

    Raises
    ------
    ValueError
        If ``embedding_type`` is neither ``'word2vec'`` nor ``'fasttext'``.
    """
    dims = ['IE', 'NS', 'FT', 'JP']
    models_dir = '../models'
    trainers = {'word2vec': Word2Vec, 'fasttext': FastText}

    # Fail fast on a bad argument, before any data is loaded or trained on.
    if embedding_type not in trainers:
        raise ValueError("Unsupported embedding_type. Use 'word2vec' or 'fasttext'.")

    # Load data
    # NOTE: read_pickle unpickles arbitrary objects; only load files produced
    # by this project's own preprocessing step.
    logger.info("📂 Loading processed data...")
    train_df = pd.read_pickle('../data/processed/train.pkl')
    test_df = pd.read_pickle('../data/processed/test.pkl')

    # Prepare sentences (list of token lists) for embedding training
    sentences = train_df[f'tokens_{variant}'].tolist()

    # Train embedding model (embeddings are fitted on the training split only)
    logger.info(f"🧠 Training {embedding_type.upper()} model on {variant} tokens...")
    model = trainers[embedding_type](
        sentences, vector_size=embedding_size, window=5, min_count=1, workers=4, epochs=epochs
    )

    # Make sure the output directory exists; saving would otherwise fail on a
    # fresh checkout where ../models has not been created yet.
    os.makedirs(models_dir, exist_ok=True)

    # Save embedding model
    model_path = f'{models_dir}/{embedding_type}_{variant}.model'
    model.save(model_path)
    logger.info(f"✅ {embedding_type.upper()} model saved to {model_path}")

    # Generate embeddings for train and test
    logger.info("📝 Generating embeddings for train...")
    X_train_emb = prepare_embeddings(train_df, variant, model, embedding_size)

    logger.info("📝 Generating embeddings for test...")
    X_test_emb = prepare_embeddings(test_df, variant, model, embedding_size)

    # Results storage
    results_binary = {dim: {} for dim in dims}
    results_binary_balanced = {dim: {} for dim in dims}
    # Fitted classifiers are kept in memory so the second evaluation pass does
    # not have to read them back from disk.
    classifiers = {}

    # Binary dimensions training
    for dim in dims:
        logger.info(f"🧠 Training binary model for {dim} with SMOTE...")
        y_train_bin = train_df[dim]
        y_test_bin = test_df[dim]

        # Apply SMOTE to train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train_emb, y_train_bin)

        # Train logistic regression
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train_res, y_train_res)
        classifiers[dim] = clf

        # Predict on original test
        y_pred_bin = clf.predict(X_test_emb)
        acc_bin = accuracy_score(y_test_bin, y_pred_bin)
        f1_bin = f1_score(y_test_bin, y_pred_bin, average='weighted')
        results_binary[dim] = {'accuracy': acc_bin, 'f1': f1_bin}

        print(f"{dim} - Accuracy: {acc_bin:.4f}, F1: {f1_bin:.4f}")
        print(classification_report(y_test_bin, y_pred_bin))

        # Save model
        clf_path = f'{models_dir}/binary_{dim}_{embedding_type}_{variant}.pkl'
        joblib.dump(clf, clf_path)

    # Balanced test evaluation
    # NOTE(review): the test set is resampled with SMOTE here, so these scores
    # are computed on a mix of real and SMOTE-generated rows. Treat them as a
    # diagnostic; the "original test" numbers above are the held-out estimate.
    logger.info("\n🔍 Evaluating on SMOTE-balanced test...")
    for dim in dims:
        y_test_bin = test_df[dim]
        clf = classifiers[dim]

        # Apply SMOTE to test embeddings
        smote_test = SMOTE(random_state=42)
        X_test_res, y_test_res = smote_test.fit_resample(X_test_emb, y_test_bin)

        # Predict on balanced test
        y_pred_bin = clf.predict(X_test_res)

        acc_bin = accuracy_score(y_test_res, y_pred_bin)
        f1_bin = f1_score(y_test_res, y_pred_bin, average='weighted')

        results_binary_balanced[dim] = {'accuracy': acc_bin, 'f1': f1_bin}
        print(f"{dim} - Accuracy (on SMOTE-balanced test): {acc_bin:.4f}, F1: {f1_bin:.4f}")
        print(classification_report(y_test_res, y_pred_bin))

    # Compare results
    for dim in dims:
        print(f"\n📊 Binary {dim} Results:")
        print(f"Original Test: Accuracy={results_binary[dim]['accuracy']:.4f}, F1={results_binary[dim]['f1']:.4f}")
        print(f"Balanced Test: Accuracy={results_binary_balanced[dim]['accuracy']:.4f}, F1={results_binary_balanced[dim]['f1']:.4f}")

    logger.info("✅ Training and evaluation complete!")

    # Hand the metrics back so callers can compare runs programmatically
    # instead of re-reading the printed output.
    return {'original': results_binary, 'balanced': results_binary_balanced}


In [None]:
if __name__ == "__main__":
    # Run the full pipeline once per embedding backend, Word2Vec first and
    # FastText second, with otherwise identical settings.
    for backend in ('word2vec', 'fasttext'):
        train_and_evaluate(embedding_type=backend, variant='with_lemma', embedding_size=100, epochs=10)