In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from typing import List, Tuple
import os
import joblib
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed train/test splits from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files should only
# ever be ones produced by this project's own preprocessing step.
print("üìÇ Loading processed data...")
train_df = pd.read_pickle('../data/processed/train.pkl')
test_df = pd.read_pickle('../data/processed/test.pkl')

# Variant suffixes. The training loop uses each one to select the
# `tokens_<variant>`, `Bigrams_<variant>` and `Trigrams_<variant>` columns
# and to name the saved model files.
variants = [
    'without_lemma',
    'with_lemma',
    'with_lemma_pos',
    'with_dep_tree',
    'with_chunking'
]

üìÇ Loading processed data...


In [3]:
# Flatten one document's tokens (and optionally its n-grams) into a TF-IDF input string
def prepare_text(tokens: List, ngrams_b: List[Tuple], ngrams_t: List[Tuple], use_ngrams: bool = False) -> str:
    """
    Build the whitespace-separated string that is fed to the TF-IDF vectorizer.

    Args:
        tokens: Document tokens. Only the first element's type is inspected:
            plain strings are joined with spaces, tuples are first collapsed
            to ``part_part_...``; any other type contributes no token text.
        ngrams_b: Bigram tuples; each is collapsed to ``a_b``.
        ngrams_t: Trigram tuples; each is collapsed to ``a_b_c``.
        use_ngrams: When True, append the bigram and trigram strings after
            the token text.

    Returns:
        The assembled string, or ``''`` when ``tokens`` is empty/falsy
        (n-grams are ignored in that case).
    """
    # A document without tokens yields no text at all, n-grams included.
    if not tokens:
        return ''

    def _underscore_join(groups) -> str:
        # Collapse every tuple to "x_y_..." and separate the results with spaces.
        return ' '.join('_'.join(parts) for parts in groups) if groups else ''

    head = tokens[0]
    if isinstance(head, str):
        base = ' '.join(tokens)
    elif isinstance(head, tuple):
        base = _underscore_join(tokens)
    else:
        base = ''

    if not use_ngrams:
        return base

    # Empty segments keep their separating space, so only the outer
    # whitespace is trimmed.
    return ' '.join((base, _underscore_join(ngrams_b), _underscore_join(ngrams_t))).strip()

In [4]:
# Containers for evaluation scores
# Multi-class scores, keyed by variant name.
results_multi = dict()
# Binary scores: one independent dict per dimension, each keyed by variant name.
results_binary = {'IE': {}, 'NS': {}, 'FT': {}, 'JP': {}}

In [None]:
# Loop over variants: for every preprocessing variant, build the text features,
# vectorize them once, then train and evaluate one binary classifier per dimension.

# Register DataFrame.progress_apply once; re-registering on every variant is redundant.
tqdm.pandas()

# joblib.dump does not create missing directories, so make sure the output
# directory exists before training starts instead of failing at the first save.
os.makedirs('../models', exist_ok=True)

for var in variants:
    print(f"\nüîç Processing variant: {var}")

    # Prepare train and test texts
    print("üìù Preparing text features...")

    def _row_to_text(row):
        """Build the TF-IDF input string for one row from the current variant's columns."""
        return prepare_text(
            row[f'tokens_{var}'], row[f'Bigrams_{var}'], row[f'Trigrams_{var}'], use_ngrams=True
        )

    # The 'text' column is overwritten on every variant.
    train_df['text'] = train_df.progress_apply(_row_to_text, axis=1)
    test_df['text'] = test_df.progress_apply(_row_to_text, axis=1)

    X_train = train_df['text']
    X_test = test_df['text']

    # NOTE: the multi-class (16 types) experiment below is disabled, so this cell
    # never writes to `results_multi`.
    # Multi-class (16 types)
    # print("üß† Training multi-class model with balanced class weights...")
    # y_train_multi = train_df['type']
    # y_test_multi = test_df['type']

    # pipeline_multi = Pipeline([
    #     ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,3))),
    #     ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, class_weight='balanced'))
    # ])
    # params = {'clf__C': [0.1, 1, 10], 'clf__solver': ['lbfgs', 'liblinear']}
    # grid = GridSearchCV(pipeline_multi, params, cv=5, scoring='f1_weighted')
    # grid.fit(X_train, y_train_multi)

    # pipeline_multi.fit(X_train, y_train_multi)

    # y_pred_multi = pipeline_multi.predict(X_test)

    # acc_multi = accuracy_score(y_test_multi, y_pred_multi)
    # f1_multi = f1_score(y_test_multi, y_pred_multi, average='weighted')

    # results_multi[var] = {'accuracy': acc_multi, 'f1': f1_multi}
    # print(f"Multi-class - Accuracy: {acc_multi:.4f}, F1: {f1_multi:.4f}")
    # print(classification_report(y_test_multi, y_pred_multi))

    # Save model
    # os.makedirs('models', exist_ok=True)
    # joblib.dump(pipeline_multi, f'models/multi_{var}.pkl')

    # The TF-IDF features depend only on the variant's text, not on the target
    # dimension, so fit/transform once here and reuse them for all four classifiers.
    # The vocabulary is fitted on the training split only and then applied to test.
    # NOTE(review): .toarray() densifies both matrices; kept so results match the
    # previous runs — worth checking whether sparse input would be acceptable here.
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
    X_train_tfidf = tfidf.fit_transform(X_train).toarray()
    X_test_tfidf = tfidf.transform(X_test).toarray()

    # Binary classifiers for each dimension
    for dim in ['IE', 'NS', 'FT', 'JP']:
        print(f"üß† Training binary model for {dim} with SMOTE...")
        y_train_bin = train_df[dim]
        y_test_bin = test_df[dim]

        # Resample the training split only; the test split keeps its original distribution.
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train_bin)

        clf = LogisticRegression(solver='lbfgs', max_iter=1000)
        clf.fit(X_train_res, y_train_res)
        y_pred_bin = clf.predict(X_test_tfidf)

        acc_bin = accuracy_score(y_test_bin, y_pred_bin)
        f1_bin = f1_score(y_test_bin, y_pred_bin, average='weighted')

        results_binary[dim][var] = {'accuracy': acc_bin, 'f1': f1_bin}
        print(f"{dim} - Accuracy: {acc_bin:.4f}, F1: {f1_bin:.4f}")
        print(classification_report(y_test_bin, y_pred_bin))

        # Persist the fitted vectorizer and classifier together as one tuple.
        joblib.dump((tfidf, clf), f'../models/binary_{dim}_{var}.pkl')


üîç Processing variant: without_lemma
üìù Preparing text features...


100%|██████████| 6940/6940 [00:03<00:00, 2006.97it/s]
100%|██████████| 1735/1735 [00:00<00:00, 2529.57it/s]


üß† Training binary model for IE with SMOTE...


In [8]:
# Summarise the collected scores, one line per variant
def _print_scores(scores_by_variant):
    """Print an `Accuracy=… F1=…` line for every variant, in insertion order."""
    for var, res in scores_by_variant.items():
        print(f"{var}: Accuracy={res['accuracy']:.4f}, F1={res['f1']:.4f}")


# Multi-class scores first (prints only the header when nothing was recorded).
print("\nüìä Comparison of Multi-class Results:")
_print_scores(results_multi)

# Then one section per binary dimension.
for dim in ['IE', 'NS', 'FT', 'JP']:
    print(f"\nüìä Comparison of Binary {dim} Results:")
    _print_scores(results_binary[dim])

print("‚úÖ Training and evaluation complete! Models saved in models/ directory.")


üìä Comparison of Multi-class Results:
without_lemma: Accuracy=0.3527, F1=0.3671
with_lemma: Accuracy=0.3470, F1=0.3608
with_lemma_pos: Accuracy=0.3412, F1=0.3572
with_dep_tree: Accuracy=0.1516, F1=0.1692
with_chunking: Accuracy=0.2767, F1=0.2926

üìä Comparison of Binary IE Results:
without_lemma: Accuracy=0.7308, F1=0.7447
with_lemma: Accuracy=0.7297, F1=0.7436
with_lemma_pos: Accuracy=0.7222, F1=0.7368
with_dep_tree: Accuracy=0.6571, F1=0.6775
with_chunking: Accuracy=0.7037, F1=0.7196

üìä Comparison of Binary NS Results:
without_lemma: Accuracy=0.7856, F1=0.8067
with_lemma: Accuracy=0.7844, F1=0.8053
with_lemma_pos: Accuracy=0.7850, F1=0.8061
with_dep_tree: Accuracy=0.6928, F1=0.7291
with_chunking: Accuracy=0.7556, F1=0.7813

üìä Comparison of Binary FT Results:
without_lemma: Accuracy=0.7821, F1=0.7824
with_lemma: Accuracy=0.7879, F1=0.7882
with_lemma_pos: Accuracy=0.7775, F1=0.7779
with_dep_tree: Accuracy=0.6496, F1=0.6501
with_chunking: Accuracy=0.7487, F1=0.7490

üìä Comp