In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import optuna
import joblib
import multiprocessing
import json

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GroupKFold, StratifiedGroupKFold, GroupShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from collections import Counter
from utils import save_eval_results


N_JOBS = multiprocessing.cpu_count() - 1
SEED = 42
DB_PATH = "data/tfm.db" 
SUBTASK2_PATH = "new_data\\subtask2\\"

In [None]:
language = 'spa'
model_type = 'ml'
stemming = False
lemmatization = True
remove_duplicates = False
cased = False # Keep cased for DL models

data_config = f"lang_{language}_model_{model_type}_stem_{stemming}_lem_{lemmatization}_dup_{remove_duplicates}_cased_{cased}"
file_name = 'subtask2_all_aug'

db_file_name = '{}_{}.csv'.format(file_name, data_config)

if os.path.exists(SUBTASK2_PATH + db_file_name):
    full_data = pd.read_csv(SUBTASK2_PATH + db_file_name, encoding='utf-8')
    print('File found: ')
    print(full_data.info())
else:
    print('File not found...')    


original_data = full_data[full_data['is_augmented'] != True].copy()
original_data = original_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
full_data = full_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

original_data.info()

File found: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5840 entries, 0 to 5839
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5840 non-null   object
 1   label              5840 non-null   object
 2   lyrics             5840 non-null   object
 3   lyrics_clean       5840 non-null   object
 4   augmentation_type  5840 non-null   object
 5   is_augmented       5840 non-null   bool  
 6   lyrics_clean_ml    5840 non-null   object
dtypes: bool(1), object(6)
memory usage: 279.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1168 non-null   object
 1   label              1168 non-null   object
 2   lyrics             1168 non-null   object
 3   lyrics_clean       1168 non-null   object
 4   augmentat

In [7]:
cond_aug_nr = (full_data['is_augmented'] == True) & (full_data['label'] == 'NR')
cond_aug_s = (full_data['is_augmented'] == True) & (full_data['label'] == 'S')

full_data = full_data[~(cond_aug_nr | cond_aug_s)] 

In [None]:
full_training = True

data = original_data[['lyrics_clean_ml', 'id', 'label']].copy()
# y = original_data[['label', 'id']].copy()
groups = data['id']

if not full_training:
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    train_idx, test_idx = next(gss.split(data['lyrics_clean_ml'], data['label'], groups))

    train, test = data.iloc[train_idx], data.iloc[test_idx]
else:
    print('Full training!')
    train = original_data[['lyrics_clean_ml', 'id', 'label']].copy()

Full training!


In [None]:
train = pd.merge(train[['id']], full_data, on = "id", how = "inner").copy()
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
train

Unnamed: 0,id,label,lyrics,lyrics_clean,augmentation_type,is_augmented,lyrics_clean_ml
0,T2_TRAIN_0352,V,"[Letra de ""Más viejo que tú""]\nAnoche, anoche ...","[Letra de ""Más viejo que tú""]\nAnoche, anoche ...",back_translation_2,True,anoche anoche dormir anoche pina records soñar...
1,T2_TRAIN_0924,V,"\n[Letra de ""Kobe En LA""]\n\n[Coro]\nLa cadena...","[Letra de ""Kobe En LA""]\nLa cadena con la cara...",original,False,cadena cara jesucristo cuidar salir arrodilla ...
2,T2_TRAIN_0746,S,"\n[Letra de ""Me Fallaste""]\n\nNo, no sé cómo p...","[Letra de ""Me Fallaste""]\nNo, no sé cómo pude ...",original,False,cómo poder contener yo tiempo creas cuenta dar...
3,T2_TRAIN_0708,NR,s\nEl 17 de marzo\nA la ciudad de aguaprieta\n...,El de marzo\nA la ciudad de aguaprieta\nVino ...,original,False,marzo ciudad aguaprieta vino gente quiera veni...
4,T2_TRAIN_0482,V,Ve por el sillón y pónselo al burro\nPónselo a...,Ve por el sillón y pónselo al burro\nPónselo a...,aeda_2,True,sillón pon él él burro pon él él burro pon él ...
...,...,...,...,...,...,...,...
1991,T2_TRAIN_0251,NR,\nPersonas como tu quedan muy pocas ya\nLo sup...,Personas como tu quedan muy pocas ya\nLo supe ...,original,False,persona quedar saber dia conoci bese bastante ...
1992,T2_TRAIN_0976,NR,s\nQue mi novia si sabe\nComo se baila la cumb...,Que mi novia si sabe\nComo se baila la cumbia\...,original,False,novia saber baila cumbia sonar tambores invito...
1993,T2_TRAIN_1045,NR,"\n[Letra de ""La Mujer Que No Soñé""]\n\n[Verso ...","[Letra de ""La Mujer Que No Soñé""]\nLa de lente...",original,False,lent pasado moda aburrido intelectual preferir...
1994,T2_TRAIN_0137,V,"\nSuéltame gorila, no\nQue me sueltes gorila, ...","Suéltame gorila, no\nQue me sueltes gorila, no...",original,False,suéltame gorila soltar gorila suéltame gorila ...


In [9]:
X_train = train['lyrics_clean_ml']
y_train = train['label']

if not full_training:
    X_test = test['lyrics_clean_ml']
    y_test = test['label']
    groups_test = test['id']

groups_train = train['id']

In [None]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
if not full_training:
    y_test_enc = le.transform(y_test)

LABELS = le.classes_
NUM_LABELS = len(LABELS)
label2idx = {label: idx for idx, label in enumerate(LABELS)}
idx2label = {idx: label for label, idx in label2idx.items()}

print('Labels:', label2idx)

scoring = {
    'accuracy': 'accuracy',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

Labels: {'H': 0, 'NR': 1, 'S': 2, 'V': 3}


### Model Learning (HPO - Optuna)

### Random Forest

In [None]:
rf_pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier())
])
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

def objective(trial):
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    max_df = trial.suggest_float('vect_max_df', 0.5, 1.0)
    min_df = trial.suggest_int('vect_min_df', 1, 5)

    ngram_range_str = trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"])
    ngram_range = tuple(map(int, ngram_range_str.split(',')))

    
    use_idf = trial.suggest_categorical('tfidf_use_idf', [True, False])
    
    n_estimators = trial.suggest_int('rf_n_estimators', 100, 530)
    max_depth = trial.suggest_categorical('rf_max_depth', [None] + list(range(5, 55, 5)))
    min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 15)
    min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
    criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy', 'log_loss'])
    

    params = {
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__ngram_range': ngram_range,
        'tfidf__use_idf': use_idf,
        'clf__n_estimators': n_estimators,
        'clf__max_depth': max_depth,
        'clf__min_samples_split': min_samples_split,
        'clf__min_samples_leaf': min_samples_leaf,
        'clf__max_features': max_features,
        'clf__criterion': criterion,
        'clf__class_weight': 'balanced',  
        'clf__random_state': SEED      
    }
                
    rf_pipe = rf_pipeline.set_params(**params)
    
    scores = cross_val_score(
        rf_pipe,
        X_train,
        y_train_enc,
        cv=cv,
        scoring='f1_macro',
        n_jobs=N_JOBS
    )

    return np.mean(scores)

budget = 300
rf_study_name = 'rf_optimization_{}_{}'.format(file_name, data_config) + 'new_data'
storage_name = f"sqlite:///objects/task_2/studies/{rf_study_name}.db"

rf_study = optuna.create_study(
    study_name=rf_study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,  
    sampler=optuna.samplers.TPESampler(seed=SEED)  
)

rf_study.optimize(objective, n_trials=budget)


def expand_rf_params(optuna_params, seed):
    return {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(map(int, optuna_params['vect_ngram_range'].split(','))),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
        'clf__n_estimators': optuna_params['rf_n_estimators'],
        'clf__max_depth': optuna_params['rf_max_depth'],
        'clf__min_samples_split': optuna_params['rf_min_samples_split'],
        'clf__min_samples_leaf': optuna_params['rf_min_samples_leaf'],
        'clf__max_features': optuna_params['rf_max_features'],
        'clf__criterion': optuna_params['rf_criterion'],
        'clf__class_weight': 'balanced',
        'clf__random_state': seed
    }

best_params_expanded = expand_rf_params(rf_study.best_params, SEED)
rf_model = rf_pipeline.set_params(**best_params_expanded)

[I 2025-05-07 14:44:47,330] Using an existing study with name 'rf_optimization_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_Falsenew_data' instead of creating a new one.
[I 2025-05-07 14:44:51,092] Trial 150 finished with value: 0.4445184478984923 and parameters: {'vect_max_df': 0.7893814067999234, 'vect_min_df': 1, 'vect_ngram_range': '1,1', 'tfidf_use_idf': False, 'rf_n_estimators': 328, 'rf_max_depth': 25, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 10, 'rf_max_features': 'log2', 'rf_criterion': 'log_loss'}. Best is trial 113 with value: 0.4505759885868489.
[I 2025-05-07 14:44:53,836] Trial 151 finished with value: 0.4464704455073977 and parameters: {'vect_max_df': 0.8217743740401856, 'vect_min_df': 1, 'vect_ngram_range': '1,1', 'tfidf_use_idf': False, 'rf_n_estimators': 329, 'rf_max_depth': 25, 'rf_min_samples_split': 9, 'rf_min_samples_leaf': 9, 'rf_max_features': 'log2', 'rf_criterion': 'log_loss'}. Best is trial 113 with value: 0.4505759885868489.

In [None]:
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
cv_results = cross_validate(rf_model, X_train, y_train_enc, cv=cv, scoring=scoring, n_jobs=N_JOBS)

rf_model.fit(X_train, y_train_enc)

acc = np.mean(cv_results['test_accuracy'])
pre = np.mean(cv_results['test_precision_macro'])
rec = np.mean(cv_results['test_recall_macro'])
f1_ = np.mean(cv_results['test_f1_macro'])

rf_best_trial = rf_study.best_trial
print("Best trial Cross-Validation results on the training set for Random Forest:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {pre:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1:        {f1_:.4f}")

print("Best hyperparameters:")
for key, value in rf_best_trial.params.items():
    print(f"  {key}: {value}")

model_filename = 'objects/task_2/models/best_rf_{}_{}.pkl'.format(file_name, data_config)
joblib.dump(rf_model, model_filename)
print(f"Random Forest model saved as {model_filename}.")

task_val   = "task_2"
model_name = "Random Forest"
eval_type  = "inner"

save_eval_results(DB_PATH, db_file_name, task_val, model_name, eval_type, acc, pre, rec, f1_, rf_study_name, json.dumps(rf_study.best_params), language)

Best trial Cross-Validation results on the training set for Random Forest:
  Accuracy:  0.4755
  Precision: 0.4914
  Recall:    0.4948
  F1:        0.4577
Best hyperparameters:
  vect_max_df: 0.7359264039121303
  vect_min_df: 1
  vect_ngram_range: 1,1
  tfidf_use_idf: False
  rf_n_estimators: 336
  rf_max_depth: 40
  rf_min_samples_split: 6
  rf_min_samples_leaf: 8
  rf_max_features: log2
  rf_criterion: log_loss
Random Forest model saved as objects/task_2/models/best_rf_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_False.pkl.

Inserted CV results into 'eval' table in tfm.db.


### SVM (Best Model)

In [None]:
svm_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC())
])
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
def svm_objective(trial):
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    max_df = trial.suggest_float('vect_max_df', 0.5, 1.0)
    min_df = trial.suggest_int('vect_min_df', 1, 5)
    
    ngram_range_str = trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"])
    ngram_range = tuple(map(int, ngram_range_str.split(',')))
    
    use_idf = trial.suggest_categorical('tfidf_use_idf', [True, False])
    norm = trial.suggest_categorical('tfidf_norm', ['l1', 'l2', None]) 
    
    C = trial.suggest_float('svm_C', 1e-3, 10, log=True)
    kernel = trial.suggest_categorical('svm_kernel', ['linear', 'rbf', 'sigmoid', 'poly'])
    shrinking = trial.suggest_categorical('svm_shrinking', [True, False]) 
    
    params = {
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__ngram_range': ngram_range,
        'tfidf__use_idf': use_idf,
        'tfidf__norm': norm, 
        'clf__C': C,
        'clf__kernel': kernel,
        'clf__shrinking': shrinking, 
        'clf__class_weight': 'balanced',
        'clf__random_state': SEED  
    }
    
    if kernel != 'linear':
        gamma = trial.suggest_float('svm_gamma', 1e-4, 1, log=True)
        params['clf__gamma'] = gamma
    if kernel == 'poly':
        params['clf__degree'] = trial.suggest_int('svm_degree', 2, 5)
        params['clf__coef0'] = trial.suggest_float('svm_coef0', 0.0, 1.0)
    
    svm_pipe = svm_pipeline.set_params(**params)
    
    scores = cross_val_score(svm_pipe, X_train, y_train_enc, cv=cv, scoring='f1_macro', n_jobs=N_JOBS)
    return scores.mean()

svm_study_name = 'svm_optimization_{}_{}'.format(file_name,data_config) + '_full_training' if full_training else ''
storage_name = f"sqlite:///objects/task_2/studies/{svm_study_name}.db"

svm_study = optuna.create_study(
    study_name=svm_study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

budget = 350
svm_study.optimize(svm_objective, n_trials=budget)

best_params = svm_study.best_params

def expand_svm_params(optuna_params, seed):
    expanded = {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(map(int, optuna_params['vect_ngram_range'].split(','))),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
        'tfidf__norm': optuna_params['tfidf_norm'],
        'clf__C': optuna_params['svm_C'],
        'clf__kernel': optuna_params['svm_kernel'],
        'clf__shrinking': optuna_params['svm_shrinking'],
        'clf__class_weight': 'balanced',
        'clf__random_state': seed
    }

    if optuna_params['svm_kernel'] != 'linear':
        expanded['clf__gamma'] = optuna_params['svm_gamma']
    if optuna_params['svm_kernel'] == 'poly':
        expanded['clf__degree'] = optuna_params['svm_degree']
        expanded['clf__coef0'] = optuna_params['svm_coef0']

    return expanded

best_params_expanded = expand_svm_params(svm_study.best_params, SEED)
svm_model = svm_pipeline.set_params(**best_params_expanded)

[I 2025-05-12 01:32:29,119] Using an existing study with name 'svm_optimization_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_False_full_training' instead of creating a new one.
[I 2025-05-12 01:32:32,466] Trial 300 finished with value: 0.4045086706811808 and parameters: {'vect_max_df': 0.5008714720306161, 'vect_min_df': 5, 'vect_ngram_range': '1,1', 'tfidf_use_idf': True, 'tfidf_norm': 'l2', 'svm_C': 0.6662068962647606, 'svm_kernel': 'sigmoid', 'svm_shrinking': False, 'svm_gamma': 0.706621536364163}. Best is trial 239 with value: 0.4533098926349906.
[I 2025-05-12 01:32:35,575] Trial 301 finished with value: 0.43073700278111177 and parameters: {'vect_max_df': 0.5214426705353511, 'vect_min_df': 5, 'vect_ngram_range': '1,1', 'tfidf_use_idf': True, 'tfidf_norm': 'l2', 'svm_C': 0.37523607377210383, 'svm_kernel': 'sigmoid', 'svm_shrinking': False, 'svm_gamma': 0.9986641977048709}. Best is trial 239 with value: 0.4533098926349906.
[I 2025-05-12 01:32:38,685] Trial 30

In [None]:
if not full_training:
    group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    cv_results = cross_validate(svm_model, X_train, y_train_enc, cv=cv, scoring=scoring, n_jobs=N_JOBS)
    svm_model.fit(X_train, y_train_enc)

    acc = np.mean(cv_results['test_accuracy'])
    pre = np.mean(cv_results['test_precision_macro'])
    rec = np.mean(cv_results['test_recall_macro'])
    f1_ = np.mean(cv_results['test_f1_macro'])

    svm_best_trial = svm_study.best_trial
    print("Best trial Cross-Validation results on the training set for SVM:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {pre:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1:        {f1_:.4f}")


    print("Best hyperparameters:")
    for key, value in svm_best_trial.params.items():
        print("  {}: {}".format(key, value))


    task_val = "task_2"
    model_name   = "SVM"
    eval_type   = "inner"  

    save_eval_results(DB_PATH, db_file_name,task_val, model_name, eval_type,  acc, pre, rec, f1_, svm_study_name, json.dumps(svm_study.best_params), language)
else:
    svm_model.fit(X_train, y_train_enc)

    svm_best_trial = svm_study.best_trial

    print("Best hyperparameters:")
    for key, value in svm_best_trial.params.items():
        print("  {}: {}".format(key, value))

full_data_config =  data_config + '_full_training' if full_training else ''
model_filename = 'objects/task_2/models/best_svm_{}_{}.pkl'.format(file_name, full_data_config)
joblib.dump(svm_model, model_filename)
print("SVM model saved as {}.".format(model_filename))

Best hyperparameters:
  vect_max_df: 0.5109089106538789
  vect_min_df: 5
  vect_ngram_range: 1,1
  tfidf_use_idf: False
  tfidf_norm: l2
  svm_C: 0.9178685792438757
  svm_kernel: sigmoid
  svm_shrinking: False
  svm_gamma: 0.9896267404392098
SVM model saved as objects/task_2/models/best_svm_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_False_full_training.pkl.


### XGBoost

In [None]:
# Compute scale_pos_weight for XGBoost 
counter = Counter(y_train)
majority = max(counter.values())
minority = min(counter.values())
scale_pos_weight = majority / minority

In [None]:
xgb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
def xgb_objective(trial):
    
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    max_df = trial.suggest_float('vect_max_df', 0.5, 1.0)
    min_df = trial.suggest_int('vect_min_df', 1, 5)
    ngram_range_str = trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"])
    ngram_range = tuple(map(int, ngram_range_str.split(',')))
    
    use_idf = trial.suggest_categorical('tfidf_use_idf', [True, False])
    
    n_estimators = trial.suggest_int('xgb_n_estimators', 50, 500)
    max_depth = trial.suggest_int('xgb_max_depth', 3, 15)
    learning_rate = trial.suggest_float('xgb_learning_rate', 0.001, 0.3, log=True)
    subsample = trial.suggest_float('xgb_subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0)
    reg_alpha = trial.suggest_float('xgb_reg_alpha', 1e-8, 1.0, log=True)
    reg_lambda = trial.suggest_float('xgb_reg_lambda', 1e-8, 1.0, log=True)
    
    params = {
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__ngram_range': ngram_range,
        'tfidf__use_idf': use_idf,
        'clf__n_estimators': n_estimators,
        'clf__max_depth': max_depth,
        'clf__learning_rate': learning_rate,
        'clf__subsample': subsample,
        'clf__colsample_bytree': colsample_bytree,
        'clf__reg_alpha': reg_alpha,
        'clf__reg_lambda': reg_lambda,
        'clf__scale_pos_weight': scale_pos_weight, 
        'clf__random_state': SEED
    }
    
    xgb_pipe = xgb_pipeline.set_params(**params)
    
    scores = cross_val_score(xgb_pipe, X_train, y_train_enc, cv=cv, scoring='f1_macro', n_jobs=N_JOBS)
    return scores.mean()

xgb_study_name = 'xgb_optimization_{}_{}'.format(file_name, data_config) + 'new_data'
storage_name = f"sqlite:///objects/task_2/studies/{xgb_study_name}.db"
xgb_study = optuna.create_study(
    study_name=xgb_study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

budget = 220
xgb_study.optimize(xgb_objective, n_trials=budget)

# Function to expand flat Optuna parameters into pipeline keys
def expand_xgb_params(optuna_params, seed):
    return {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(map(int, optuna_params['vect_ngram_range'].split(','))),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
        'clf__n_estimators': optuna_params['xgb_n_estimators'],
        'clf__max_depth': optuna_params['xgb_max_depth'],
        'clf__learning_rate': optuna_params['xgb_learning_rate'],
        'clf__subsample': optuna_params['xgb_subsample'],
        'clf__colsample_bytree': optuna_params['xgb_colsample_bytree'],
        'clf__reg_alpha': optuna_params['xgb_reg_alpha'],
        'clf__reg_lambda': optuna_params['xgb_reg_lambda'],
        'clf__scale_pos_weight': scale_pos_weight,
        'clf__random_state': seed
    }

best_params_expanded = expand_xgb_params(xgb_study.best_params, SEED)
xgb_model = xgb_pipeline.set_params(**best_params_expanded)



[I 2025-05-07 16:23:41,602] Using an existing study with name 'xgb_optimization_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_Falsenew_data' instead of creating a new one.
[I 2025-05-07 16:24:22,364] Trial 121 finished with value: 0.3900209439867778 and parameters: {'vect_max_df': 0.7644239911797637, 'vect_min_df': 3, 'vect_ngram_range': '1,1', 'tfidf_use_idf': True, 'xgb_n_estimators': 243, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.03060526451756153, 'xgb_subsample': 0.6227723047312252, 'xgb_colsample_bytree': 0.9369172348466429, 'xgb_reg_alpha': 1.2747172406239288e-08, 'xgb_reg_lambda': 1.2252240982214502e-07}. Best is trial 12 with value: 0.4135725914320937.
[I 2025-05-07 16:25:04,173] Trial 122 finished with value: 0.39512113544105637 and parameters: {'vect_max_df': 0.7906283196730245, 'vect_min_df': 2, 'vect_ngram_range': '1,1', 'tfidf_use_idf': True, 'xgb_n_estimators': 310, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.019764713789736685, 'xgb_subsample': 

In [None]:
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
cv_results = cross_validate(xgb_model, X_train, y_train_enc, cv=cv, scoring=scoring, n_jobs=N_JOBS)

xgb_model.fit(X_train, y_train_enc)

acc = np.mean(cv_results['test_accuracy'])
pre = np.mean(cv_results['test_precision_macro'])
rec = np.mean(cv_results['test_recall_macro'])
f1_ = np.mean(cv_results['test_f1_macro'])

xgb_best_trial = xgb_study.best_trial
print("Best trial Cross-Validation results on the training set for XGBoost:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {pre:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1:        {f1_:.4f}")

print("Best hyperparameters:")
for key, value in xgb_best_trial.params.items():
    print(f"  {key}: {value}")

model_filename = 'objects/task_2/models/best_xgb_{}_{}.pkl'.format(
    file_name,
    data_config
)
joblib.dump(xgb_model, model_filename)
print(f"XGBoost model saved as {model_filename}.")

task_val   = "task_2"
model_name = "XGBoost"
eval_type  = "inner"

save_eval_results(DB_PATH, db_file_name, task_val, model_name, eval_type, acc, pre, rec, f1_, xgb_study_name, json.dumps(xgb_study.best_params),language)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best trial Cross-Validation results on the training set for XGBoost:
  Accuracy:  0.4670
  Precision: 0.4814
  Recall:    0.4756
  F1:        0.4201
Best hyperparameters:
  vect_max_df: 0.7101803218606985
  vect_min_df: 2
  vect_ngram_range: 1,1
  tfidf_use_idf: True
  xgb_n_estimators: 316
  xgb_max_depth: 3
  xgb_learning_rate: 0.11772720195035212
  xgb_subsample: 0.5762724384077174
  xgb_colsample_bytree: 0.9433095569536869
  xgb_reg_alpha: 0.0008035258701272992
  xgb_reg_lambda: 1.2637214986667974e-07
XGBoost model saved as objects/task_2/models/best_xgb_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_False.pkl.

Inserted CV results into 'eval' table in tfm.db.


### Logistic Regression

In [None]:
lr_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression())  
])
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
def lr_objective(trial):
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    max_df = trial.suggest_float('vect_max_df', 0.5, 1.0)
    min_df = trial.suggest_int('vect_min_df', 1, 5)
    ngram_range_str = trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"])
    ngram_range = tuple(map(int, ngram_range_str.split(',')))
    
    use_idf = trial.suggest_categorical('tfidf_use_idf', [True, False])
    
    C = trial.suggest_float('lr_C', 1e-4, 10, log=True)
    penalty = trial.suggest_categorical('lr_penalty', ['l2', 'l1', 'elasticnet', 'none'])
    
    if penalty == 'none':
        penalty_value = None
    else:
        penalty_value = penalty
    
    solver = trial.suggest_categorical('lr_solver', ['lbfgs', 'sag', 'saga'])
    
    if penalty in ['l1', 'elasticnet'] and solver != 'saga':
        raise optuna.TrialPruned("Solver must be 'saga' for l1 or elasticnet penalty.")
    
    params = {
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__ngram_range': ngram_range,
        'tfidf__use_idf': use_idf,
        'clf__C': C,
        'clf__penalty': penalty_value,
        'clf__solver': solver,
        'clf__class_weight': 'balanced',
        'clf__multi_class': 'multinomial', 
        'clf__random_state': SEED,
        'clf__max_iter': 2000
    }
    
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('lr_l1_ratio', 0.0, 1.0)
        params['clf__l1_ratio'] = l1_ratio
        
    lr_pipe = lr_pipeline.set_params(**params)
    scores = cross_val_score(lr_pipe, X_train, y_train_enc, cv=cv, scoring='f1_macro', n_jobs=N_JOBS)
    return np.mean(scores)

lr_study_name = 'lr_optimization_{}_{}'.format(file_name, data_config)
storage_name = f"sqlite:///objects/task_2/studies/{lr_study_name}.db"
lr_study = optuna.create_study(
    study_name=lr_study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

budget = 500
lr_study.optimize(lr_objective, n_trials=budget)

def expand_lr_params(optuna_params, seed):
    expanded = {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(map(int, optuna_params['vect_ngram_range'].split(','))),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
        'clf__C': optuna_params['lr_C'],
        'clf__penalty': None if optuna_params['lr_penalty'] == 'none' else optuna_params['lr_penalty'],
        'clf__solver': optuna_params['lr_solver'],
        'clf__class_weight': 'balanced',
        'clf__multi_class': 'multinomial',
        'clf__random_state': seed,
        'clf__max_iter': 2000
    }
    if optuna_params['lr_penalty'] == 'elasticnet':
        expanded['clf__l1_ratio'] = optuna_params['lr_l1_ratio']
    return expanded

best_params_expanded = expand_lr_params(lr_study.best_params, SEED)
lr_model = lr_pipeline.set_params(**best_params_expanded)

In [None]:
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
cv_results = cross_validate(lr_model, X_train, y_train_enc, cv=cv, scoring=scoring, n_jobs=N_JOBS)

lr_model.fit(X_train, y_train_enc)

acc = np.mean(cv_results['test_accuracy'])
pre = np.mean(cv_results['test_precision_macro'])
rec = np.mean(cv_results['test_recall_macro'])
f1_ = np.mean(cv_results['test_f1_macro'])

lr_best_trial = lr_study.best_trial
print("Best trial Cross-Validation results on the training set for Logistic Regression:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {pre:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1:        {f1_:.4f}")

print("Best hyperparameters:")
for key, value in lr_best_trial.params.items():
    print(f"  {key}: {value}")

model_filename = 'objects/task_2/models/best_lr_{}_{}.pkl'.format(
    file_name,
    data_config
)
joblib.dump(lr_model, model_filename)
print(f"Logistic Regression model saved as {model_filename}.")

task_val   = "task_2"
model_name = "Logistic Regression"
eval_type  = "inner"

save_eval_results(DB_PATH, db_file_name, task_val, model_name, eval_type, acc, pre, rec, f1_, lr_study_name, json.dumps(lr_study.best_params),language)

Best trial Cross-Validation results on the training set for Logistic Regression:
  Accuracy:  0.4530
  Precision: 0.4884
  Recall:    0.4629
  F1:        0.4552
Best hyperparameters:
  vect_max_df: 0.5076150727965348
  vect_min_df: 1
  vect_ngram_range: 1,1
  tfidf_use_idf: True
  lr_C: 0.0002877848038137215
  lr_penalty: l2
  lr_solver: lbfgs
Logistic Regression model saved as objects/task_2/models/best_lr_subtask2_all_aug_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_False.pkl.

Inserted CV results into 'eval' table in tfm.db.




## External Evaluation (held-out test set)

### Random Forest

In [30]:
# Outer evaluation for Random Forest
y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test_enc, y_pred_rf)
pre_rf = precision_score(y_test_enc, y_pred_rf, average='macro')
rec_rf = recall_score(y_test_enc, y_pred_rf, average='macro')
f1_rf = f1_score(y_test_enc, y_pred_rf, average='macro')

print("\nOuter evaluation results for Random Forest:")
print(f"  Accuracy:  {acc_rf:.4f}")
print(f"  Precision: {pre_rf:.4f}")
print(f"  Recall:    {rec_rf:.4f}")
print(f"  F1:        {f1_rf:.4f}")

save_eval_results(DB_PATH, db_file_name, 
                  task_val="task_2", 
                  model_name="Random Forest", 
                  eval_type="outer", 
                  acc=acc_rf, pre=pre_rf, rec=rec_rf, f1_=f1_rf, 
                  study_name=rf_study_name, 
                  params=json.dumps(rf_study.best_params),
                  lang=language)


Outer evaluation results for Random Forest:
  Accuracy:  0.5897
  Precision: 0.4165
  Recall:    0.3963
  F1:        0.3917

Inserted CV results into 'eval' table in tfm.db.


### SVM

In [42]:
# Outer evaluation for SVM
y_pred_svm = svm_model.predict(X_test)
acc_svm = accuracy_score(y_test_enc, y_pred_svm)
pre_svm = precision_score(y_test_enc, y_pred_svm, average='macro')
rec_svm = recall_score(y_test_enc, y_pred_svm, average='macro')
f1_svm = f1_score(y_test_enc, y_pred_svm, average='macro')

print("\nOuter evaluation results for SVM:")
print(f"  Accuracy:  {acc_svm:.4f}")
print(f"  Precision: {pre_svm:.4f}")
print(f"  Recall:    {rec_svm:.4f}")
print(f"  F1:        {f1_svm:.4f}")

save_eval_results(DB_PATH, db_file_name, 
                  task_val="task_2", 
                  model_name="SVM", 
                  eval_type="outer", 
                  acc=acc_svm, pre=pre_svm, rec=rec_svm, f1_=f1_svm, 
                  study_name=svm_study_name, 
                  params=json.dumps(svm_study.best_params),
                  lang=language)


Outer evaluation results for SVM:
  Accuracy:  0.5897
  Precision: 0.4166
  Recall:    0.4368
  F1:        0.4230

Inserted CV results into 'eval' table in tfm.db.


### XGBoost

In [17]:
# Outer evaluation for XGBoost
y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test_enc, y_pred_xgb)
pre_xgb = precision_score(y_test_enc, y_pred_xgb, average='macro')
rec_xgb = recall_score(y_test_enc, y_pred_xgb, average='macro')
f1_xgb = f1_score(y_test_enc, y_pred_xgb, average='macro')

print("\nOuter evaluation results for XGBoost:")
print(f"  Accuracy:  {acc_xgb:.4f}")
print(f"  Precision: {pre_xgb:.4f}")
print(f"  Recall:    {rec_xgb:.4f}")
print(f"  F1:        {f1_xgb:.4f}")

save_eval_results(DB_PATH, db_file_name, 
                  task_val="task_2", 
                  model_name="XGBoost", 
                  eval_type="outer", 
                  acc=acc_xgb, pre=pre_xgb, rec=rec_xgb, f1_=f1_xgb, 
                  study_name=xgb_study_name, 
                  params=json.dumps(xgb_study.best_params),
                  lang=language)


Outer evaluation results for XGBoost:
  Accuracy:  0.6410
  Precision: 0.4300
  Recall:    0.4191
  F1:        0.4178

Inserted CV results into 'eval' table in tfm.db.


### Logistic Regression

In [24]:
# Outer evaluation for Logistic Regression
y_pred_lr = lr_model.predict(X_test)
acc_lr = accuracy_score(y_test_enc, y_pred_lr)
pre_lr = precision_score(y_test_enc, y_pred_lr, average='macro')
rec_lr = recall_score(y_test_enc, y_pred_lr, average='macro')
f1_lr = f1_score(y_test_enc, y_pred_lr, average='macro')

print("Outer evaluation results for Logistic Regression:")
print(f"  Accuracy:  {acc_lr:.4f}")
print(f"  Precision: {pre_lr:.4f}")
print(f"  Recall:    {rec_lr:.4f}")
print(f"  F1:        {f1_lr:.4f}")

save_eval_results(DB_PATH, db_file_name, 
                  task_val="task_2", 
                  model_name="Logistic Regression", 
                  eval_type="outer", 
                  acc=acc_lr, pre=pre_lr, rec=rec_lr, f1_=f1_lr, 
                  study_name=lr_study_name, 
                  params=json.dumps(lr_study.best_params),
                  lang=language)


Outer evaluation results for Logistic Regression:
  Accuracy:  0.4829
  Precision: 0.4545
  Recall:    0.4513
  F1:        0.4045

Inserted CV results into 'eval' table in tfm.db.
