In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from typing import List, Tuple
import os
import joblib
from tqdm import tqdm

In [2]:
# Read the pre-processed train/test frames from disk.
# NOTE: read_pickle executes arbitrary code embedded in the file, so these
# paths must only ever point at pickles produced by this project.
print("üìÇ Loading processed data...")
train_df, test_df = map(
    pd.read_pickle,
    ('../data/processed/train.pkl', '../data/processed/test.pkl'),
)

# Feature variants to compare. Each name is used as the suffix of the
# tokens_/Bigrams_/Trigrams_ columns looked up by the training loop.
variants = [
    'without_lemma',
    'with_lemma',
    'with_lemma_pos',
    'with_dep_tree',
    'with_chunking',
]

üìÇ Loading processed data...


In [3]:
# Helpers + entry point that turn token / n-gram columns into a TF-IDF-ready string
def _is_missing(seq) -> bool:
    """Return True for None, scalars without a length (e.g. float NaN) and empty containers.

    Uses len() instead of truthiness so NumPy arrays do not raise
    "truth value of an array is ambiguous".
    """
    return seq is None or not hasattr(seq, '__len__') or len(seq) == 0


def _as_term(token) -> str:
    """Render one token as a single whitespace-free-joined term.

    Strings pass through unchanged; tuples/lists are joined with '_' after
    stringifying each member, so non-string members no longer raise TypeError.
    """
    if isinstance(token, str):
        return token
    if isinstance(token, (tuple, list)):
        return '_'.join(map(str, token))
    return str(token)


def _join_grams(grams) -> str:
    """Join every n-gram with '_' and separate the n-grams with spaces ('' when missing)."""
    if _is_missing(grams):
        return ''
    return ' '.join('_'.join(map(str, gram)) for gram in grams)


def prepare_text(tokens: List, ngrams_b: List[Tuple], ngrams_t: List[Tuple], use_ngrams: bool = False) -> str:
    """
    Convert tokens and optionally n-grams to a single string for TF-IDF.

    Args:
        tokens: Sequence of tokens. The type of the first element selects the
            format: ``str`` tokens are joined with spaces; ``tuple`` tokens are
            first collapsed to ``'a_b'`` terms. Any other element type yields
            an empty token text.
        ngrams_b: Sequence of bigram tuples (only read when ``use_ngrams``).
        ngrams_t: Sequence of trigram tuples (only read when ``use_ngrams``).
        use_ngrams: When True, append the '_'-joined bigrams and trigrams.

    Returns:
        The assembled text, or ``''`` when ``tokens`` is missing or empty
        (n-grams are ignored in that case).

    Missing inputs (None, NaN, empty list/array) are treated as empty, and
    tuple members are stringified before joining, so the function no longer
    raises on those inputs.
    """
    if _is_missing(tokens):
        return ''

    # The first element decides how the whole sequence is interpreted.
    first = tokens[0]
    if isinstance(first, (str, tuple)):
        text = ' '.join(_as_term(tok) for tok in tokens)
    else:
        text = ''

    if use_ngrams:
        bigrams_str = _join_grams(ngrams_b)
        trigrams_str = _join_grams(ngrams_t)
        # Same template as before: only the outer whitespace is stripped, so an
        # empty bigram string still leaves a double space between the parts.
        text = f"{text} {bigrams_str} {trigrams_str}".strip()

    return text

In [4]:
# Containers for evaluation metrics.
# results_binary maps dimension -> {variant -> {'accuracy': ..., 'f1': ...}};
# results_multi is the variant-keyed counterpart read by the comparison cell.
results_multi = {}
results_binary = {
    'IE': {},
    'NS': {},
    'FT': {},
    'JP': {},
}

In [5]:
# Train one TF-IDF + SMOTE + LinearSVC model per (variant, dimension) pair,
# record test accuracy / weighted F1 in results_binary and persist each model.

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before the first model is written.
os.makedirs('models', exist_ok=True)

# Register DataFrame.progress_apply once instead of on every loop iteration.
tqdm.pandas()


def _build_variant_text(df, var):
    """Return a Series with the prepare_text() string for every row of df.

    Reads the tokens_{var}, Bigrams_{var} and Trigrams_{var} columns and always
    includes the n-grams.
    """
    return df.progress_apply(
        lambda row: prepare_text(
            row[f'tokens_{var}'],
            row[f'Bigrams_{var}'],
            row[f'Trigrams_{var}'],
            use_ngrams=True,
        ),
        axis=1,
    )


for var in variants:
    print(f"\nüîç Processing variant: {var}")

    # Prepare train and test texts (the 'text' column is overwritten per variant)
    print("üìù Preparing text features...")
    train_df['text'] = _build_variant_text(train_df, var)
    test_df['text'] = _build_variant_text(test_df, var)

    X_train = train_df['text']
    X_test = test_df['text']

    # The vectorizer depends only on the variant's text, not on the target
    # dimension, so fit it once here rather than once per dimension.
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
    # Dense conversion kept so SMOTE and the classifier receive the same input
    # representation as before.
    X_train_tfidf = tfidf.fit_transform(X_train).toarray()
    X_test_tfidf = tfidf.transform(X_test).toarray()

    # Binary classifiers for each dimension
    for dim in ['IE', 'NS', 'FT', 'JP']:
        print(f"üß† Training binary model for {dim} with balanced class weights...")
        y_train_bin = train_df[dim]
        y_test_bin = test_df[dim]

        # Oversample the minority class on the training split only; the test
        # split is left untouched.
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train_bin)

        # random_state pins the solver's data shuffling so reruns are repeatable.
        clf = LinearSVC(class_weight='balanced', max_iter=10000, random_state=42)  # SVM classifier
        clf.fit(X_train_res, y_train_res)
        y_pred_bin = clf.predict(X_test_tfidf)

        acc_bin = accuracy_score(y_test_bin, y_pred_bin)
        f1_bin = f1_score(y_test_bin, y_pred_bin, average='weighted')

        results_binary[dim][var] = {'accuracy': acc_bin, 'f1': f1_bin}
        print(f"{dim} - Accuracy: {acc_bin:.4f}, F1: {f1_bin:.4f}")
        print(classification_report(y_test_bin, y_pred_bin))

        # Save the fitted vectorizer together with the classifier so the pair
        # can be reloaded for inference.
        joblib.dump((tfidf,clf), f'models/svm_binary_{dim}_{var}.pkl')


üîç Processing variant: without_lemma
üìù Preparing text features...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6940/6940 [00:01<00:00, 4399.33it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1735/1735 [00:00<00:00, 3843.54it/s]


üß† Training binary model for IE with balanced class weights...
IE - Accuracy: 0.7695, F1: 0.7730
              precision    recall  f1-score   support

           0       0.50      0.55      0.52       401
           1       0.86      0.84      0.85      1334

    accuracy                           0.77      1735
   macro avg       0.68      0.69      0.69      1735
weighted avg       0.78      0.77      0.77      1735

üß† Training binary model for NS with balanced class weights...
NS - Accuracy: 0.8334, F1: 0.8350
              precision    recall  f1-score   support

           0       0.40      0.42      0.41       240
           1       0.91      0.90      0.90      1495

    accuracy                           0.83      1735
   macro avg       0.65      0.66      0.66      1735
weighted avg       0.84      0.83      0.83      1735

üß† Training binary model for FT with balanced class weights...
FT - Accuracy: 0.7666, F1: 0.7667
              precision    recall  f1-score   sup

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6940/6940 [00:05<00:00, 1308.86it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1735/1735 [00:03<00:00, 437.78it/s] 


üß† Training binary model for IE with balanced class weights...
IE - Accuracy: 0.7637, F1: 0.7680
              precision    recall  f1-score   support

           0       0.49      0.55      0.52       401
           1       0.86      0.83      0.84      1334

    accuracy                           0.76      1735
   macro avg       0.67      0.69      0.68      1735
weighted avg       0.77      0.76      0.77      1735

üß† Training binary model for NS with balanced class weights...
NS - Accuracy: 0.8271, F1: 0.8308
              precision    recall  f1-score   support

           0       0.39      0.43      0.41       240
           1       0.91      0.89      0.90      1495

    accuracy                           0.83      1735
   macro avg       0.65      0.66      0.65      1735
weighted avg       0.83      0.83      0.83      1735

üß† Training binary model for FT with balanced class weights...
FT - Accuracy: 0.7718, F1: 0.7720
              precision    recall  f1-score   sup

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6940/6940 [00:17<00:00, 404.00it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1735/1735 [00:03<00:00, 507.64it/s]


üß† Training binary model for IE with balanced class weights...
IE - Accuracy: 0.7683, F1: 0.7712
              precision    recall  f1-score   support

           0       0.50      0.54      0.52       401
           1       0.86      0.84      0.85      1334

    accuracy                           0.77      1735
   macro avg       0.68      0.69      0.68      1735
weighted avg       0.77      0.77      0.77      1735

üß† Training binary model for NS with balanced class weights...
NS - Accuracy: 0.8248, F1: 0.8290
              precision    recall  f1-score   support

           0       0.38      0.43      0.40       240
           1       0.91      0.89      0.90      1495

    accuracy                           0.82      1735
   macro avg       0.64      0.66      0.65      1735
weighted avg       0.83      0.82      0.83      1735

üß† Training binary model for FT with balanced class weights...
FT - Accuracy: 0.7683, F1: 0.7686
              precision    recall  f1-score   sup

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6940/6940 [00:00<00:00, 9610.91it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1735/1735 [00:00<00:00, 8794.65it/s]


üß† Training binary model for IE with balanced class weights...
IE - Accuracy: 0.6801, F1: 0.6861
              precision    recall  f1-score   support

           0       0.33      0.37      0.35       401
           1       0.80      0.77      0.79      1334

    accuracy                           0.68      1735
   macro avg       0.57      0.57      0.57      1735
weighted avg       0.69      0.68      0.69      1735

üß† Training binary model for NS with balanced class weights...
NS - Accuracy: 0.7424, F1: 0.7597
              precision    recall  f1-score   support

           0       0.19      0.27      0.23       240
           1       0.87      0.82      0.85      1495

    accuracy                           0.74      1735
   macro avg       0.53      0.54      0.54      1735
weighted avg       0.78      0.74      0.76      1735

üß† Training binary model for FT with balanced class weights...
FT - Accuracy: 0.6530, F1: 0.6535
              precision    recall  f1-score   sup

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6940/6940 [00:11<00:00, 610.28it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1735/1735 [00:02<00:00, 597.81it/s]


üß† Training binary model for IE with balanced class weights...
IE - Accuracy: 0.7591, F1: 0.7617
              precision    recall  f1-score   support

           0       0.48      0.51      0.50       401
           1       0.85      0.83      0.84      1334

    accuracy                           0.76      1735
   macro avg       0.67      0.67      0.67      1735
weighted avg       0.76      0.76      0.76      1735

üß† Training binary model for NS with balanced class weights...
NS - Accuracy: 0.8098, F1: 0.8147
              precision    recall  f1-score   support

           0       0.33      0.38      0.36       240
           1       0.90      0.88      0.89      1495

    accuracy                           0.81      1735
   macro avg       0.62      0.63      0.62      1735
weighted avg       0.82      0.81      0.81      1735

üß† Training binary model for FT with balanced class weights...
FT - Accuracy: 0.7579, F1: 0.7580
              precision    recall  f1-score   sup

In [6]:
# Print accuracy and F1 per variant: first the multi-class table, then one
# table per binary dimension.
report_sections = [("\nüìä Comparison of Multi-class Results:", results_multi)]
for dim in ['IE', 'NS', 'FT', 'JP']:
    report_sections.append((f"\nüìä Comparison of Binary {dim} Results:", results_binary[dim]))

for heading, scores_by_variant in report_sections:
    print(heading)
    # An empty table prints only its heading.
    for var, res in scores_by_variant.items():
        print(f"{var}: Accuracy={res['accuracy']:.4f}, F1={res['f1']:.4f}")

print("‚úÖ Training and evaluation complete! Models saved in models/ directory.")


üìä Comparison of Multi-class Results:

üìä Comparison of Binary IE Results:
without_lemma: Accuracy=0.7695, F1=0.7730
with_lemma: Accuracy=0.7637, F1=0.7680
with_lemma_pos: Accuracy=0.7683, F1=0.7712
with_dep_tree: Accuracy=0.6801, F1=0.6861
with_chunking: Accuracy=0.7591, F1=0.7617

üìä Comparison of Binary NS Results:
without_lemma: Accuracy=0.8334, F1=0.8350
with_lemma: Accuracy=0.8271, F1=0.8308
with_lemma_pos: Accuracy=0.8248, F1=0.8290
with_dep_tree: Accuracy=0.7424, F1=0.7597
with_chunking: Accuracy=0.8098, F1=0.8147

üìä Comparison of Binary FT Results:
without_lemma: Accuracy=0.7666, F1=0.7667
with_lemma: Accuracy=0.7718, F1=0.7720
with_lemma_pos: Accuracy=0.7683, F1=0.7686
with_dep_tree: Accuracy=0.6530, F1=0.6535
with_chunking: Accuracy=0.7579, F1=0.7580

üìä Comparison of Binary JP Results:
without_lemma: Accuracy=0.6836, F1=0.6851
with_lemma: Accuracy=0.6830, F1=0.6840
with_lemma_pos: Accuracy=0.6893, F1=0.6906
with_dep_tree: Accuracy=0.5816, F1=0.5844
with_chunking