In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import optuna
import joblib
import multiprocessing
import json

from sklearn.model_selection import cross_validate, cross_val_score, StratifiedGroupKFold, GroupShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from collections import Counter
from utils import save_eval_results

# Worker count passed as n_jobs to estimators and cross-validation helpers.
# One core is left free, but the value is clamped to at least 1 because
# cpu_count() - 1 evaluates to 0 on a single-core host, which is not a valid
# n_jobs value.
J_JOBS = max(1, multiprocessing.cpu_count() - 1)
# Seed shared by every shuffle, splitter, sampler and estimator in this notebook.
SEED = 42
# Database path handed to save_eval_results for the evaluation rows.
DB_PATH = "data/tfm.db"
# Built with os.path.join instead of a literal containing a backslash: the old
# literal relied on an invalid "\s" escape sequence and only resolved to a
# directory on Windows.
SUBTASK1_PATH = os.path.join('new_data', 'subtask1')

In [16]:
# Preprocessing switches: together they select which cleaned CSV variant is loaded.
language, model_type = 'spa', 'ml'
stemming, lemmatization = False, True
remove_duplicates = False
cased = True  # Keep cased for DL models

# Suffix encoding every switch above; reused in dataset, study and model file names.
data_config = f"lang_{language}_model_{model_type}_stem_{stemming}_lem_{lemmatization}_dup_{remove_duplicates}_cased_{cased}"
file_name = 'subtask1_balanced_aug_v2'

# Dataset path: <SUBTASK1_PATH>/<file_name>_<data_config>.csv
db_file_name = os.path.join(SUBTASK1_PATH, file_name) + '_' + data_config + '.csv'

# Fail fast when the dataset is missing: every later cell needs `full_data`, so
# merely printing a message would only postpone the failure to a NameError.
if not os.path.exists(db_file_name):
    raise FileNotFoundError(f"Dataset file not found: {db_file_name}")

full_data = pd.read_csv(db_file_name, encoding='utf-8') # , index_col='id'
full_data = full_data.sample(frac=1, random_state=SEED).reset_index(drop=True) # Shuffle the data
# The classical ML models work on lower-cased text regardless of the `cased` flag.
full_data['lyrics_clean'] = full_data['lyrics_clean'].str.lower()
print('File found: ')
# DataFrame.info() prints by itself and returns None; wrapping it in print()
# used to emit a stray "None" line.
full_data.info()

# Rows not flagged as augmented: only these are used to build the train/test split.
original_data = full_data[full_data['is_augmented'] != True].copy()
# Both frames are reshuffled with the same seed as before, so the row order (and
# therefore every downstream split) is unchanged.
original_data = original_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
full_data = full_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

original_data.info()

File found: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5136 entries, 0 to 5135
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5136 non-null   object
 1   lyrics             5136 non-null   object
 2   label              5136 non-null   object
 3   augmentation_type  5136 non-null   object
 4   is_augmented       5136 non-null   bool  
 5   lyrics_clean       5136 non-null   object
dtypes: bool(1), object(5)
memory usage: 205.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2104 entries, 0 to 2103
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 2104 non-null   object
 1   lyrics             2104 non-null   object
 2   label              2104 non-null   object
 3   augmentation_type  2104 non-null   object
 4   is_augmented       2104 non-null   bool  
 5   lyrics_cl

In [19]:
full_training = False

# Working frame restricted to the text, the grouping id and the target label.
data = original_data.loc[:, ['lyrics_clean', 'id', 'label']].copy()
groups = data['id']

if full_training:
    # No hold-out set: every non-augmented row goes into training.
    print('Full training!')
    train = original_data.loc[:, ['lyrics_clean', 'id', 'label']].copy()
else:
    # Single 80/20 split that keeps all rows sharing an id on the same side.
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    train_idx, test_idx = next(
        gss.split(data['lyrics_clean'], data['label'], groups)
    )
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]




In [20]:
# Expand the training ids to every row of full_data carrying one of those ids
# (presumably the augmented variants of the same lyrics - confirm).
# The ids are de-duplicated first: an inner join emits one row per matching
# (left, right) pair, so repeated ids on the left multiply the result. That is
# what used to happen when this cell was run a second time, because `train`
# then already held several rows per id.
train_ids = train[['id']].drop_duplicates()
train = pd.merge(train_ids, full_data, on = "id", how = "inner").copy()
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [21]:
# Text, target and grouping id of the training rows.
X_train, y_train, groups_train = (
    train[column] for column in ('lyrics_clean', 'label', 'id')
)

# The hold-out counterparts only exist when a test split was made.
if not full_training:
    X_test, y_test, groups_test = (
        test[column] for column in ('lyrics_clean', 'label', 'id')
    )

In [22]:
# Integer-encode the targets; the encoder is fitted on the training labels only.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
if not full_training:
    # Group ids present on both sides of the split would mean leakage.
    leaked_ids = set(groups_train) & set(groups_test)
    print("Leaked IDs:", leaked_ids)  # Should be: set()
    y_test_enc = le.transform(y_test)

# Two-way lookup between the class names and their encoded index.
LABELS = le.classes_
NUM_LABELS = len(LABELS)
label2idx = dict(zip(LABELS, range(NUM_LABELS)))
idx2label = dict(enumerate(LABELS))

# Metrics collected by cross_validate; each one is reported under its own name.
scoring = {
    metric: metric
    for metric in ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')
}


Leaked IDs: set()


### Random Forest

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> Random Forest.
rf_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_jobs=J_JOBS)),
])
# Splitter used by the Random Forest objective defined in this cell.
group_kfold = StratifiedGroupKFold(
    n_splits=5,
    shuffle=True,
    random_state=SEED,
)

def objective(trial):
    """Optuna objective for the Random Forest pipeline.

    Samples vectorizer, TF-IDF and forest hyperparameters from ``trial``,
    applies them to the shared ``rf_pipeline`` and returns the mean macro-F1
    over the grouped, stratified folds of the training data.
    """
    # The suggest_* calls run in the order the dict entries are written.
    params = {
        'vect__max_df': trial.suggest_float('vect_max_df', 0.5, 1.0),
        'vect__min_df': trial.suggest_int('vect_min_df', 1, 5),
        # Stored as a "min,max" string in the study and parsed into a tuple here.
        'vect__ngram_range': tuple(
            int(bound)
            for bound in trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"]).split(',')
        ),
        'tfidf__use_idf': trial.suggest_categorical('tfidf_use_idf', [True, False]),
        'clf__n_estimators': trial.suggest_int('rf_n_estimators', 100, 530),
        'clf__max_depth': trial.suggest_categorical('rf_max_depth', [None, *range(5, 70, 5)]),
        'clf__min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 15),
        'clf__min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 10),
        'clf__max_features': trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None]),
        'clf__criterion': trial.suggest_categorical('rf_criterion', ['gini', 'entropy', 'log_loss']),
        # Fixed (not tuned) settings.
        'clf__class_weight': 'balanced',
        'clf__random_state': SEED,
    }

    # Macro-F1 on each fold; rows sharing an id stay in the same fold.
    fold_scores = cross_val_score(
        rf_pipeline.set_params(**params),
        X_train,
        y_train_enc,
        cv=group_kfold.split(X_train, y_train_enc, groups=groups_train),
        scoring='f1_macro',
        n_jobs=J_JOBS,
    )
    return np.mean(fold_scores)

# Create (or resume) the Random Forest study and run the search.
budget = 300
# The optional suffix must be parenthesized: a conditional expression binds
# more loosely than "+", so without the parentheses the whole name collapsed to
# '' whenever full_training was False (and the storage file became ".db").
rf_study_name = 'rf_optimization_{}_{}'.format(file_name, data_config) + ('_full_training' if full_training else '')
storage_name = f"sqlite:///objects/task_1/studies/{rf_study_name}.db"

rf_study = optuna.create_study(
    study_name=rf_study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,  # resume the stored study instead of failing if it already exists
    sampler=optuna.samplers.TPESampler(seed=SEED)
)

# NOTE: with load_if_exists=True, re-running this cell adds `budget` more trials
# to the stored study rather than starting over.
rf_study.optimize(objective, n_trials=budget)

def expand_rf_params(optuna_params, seed):
    """Translate the flat Optuna parameter names into pipeline ``set_params`` keys.

    Args:
        optuna_params: Mapping with the ``vect_*``, ``tfidf_*`` and ``rf_*``
            entries recorded by the Random Forest study.
        seed: Value stored under ``clf__random_state``.

    Returns:
        Dict keyed by ``<step>__<param>``, with the n-gram range parsed from
        its ``"min,max"`` string into a tuple of ints and
        ``clf__class_weight`` fixed to ``'balanced'``.
    """
    expanded = {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(
            int(bound) for bound in optuna_params['vect_ngram_range'].split(',')
        ),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
    }
    # Forest settings map one-to-one: rf_<name> -> clf__<name>.
    for name in (
        'n_estimators',
        'max_depth',
        'min_samples_split',
        'min_samples_leaf',
        'max_features',
        'criterion',
    ):
        expanded[f'clf__{name}'] = optuna_params[f'rf_{name}']
    expanded['clf__class_weight'] = 'balanced'
    expanded['clf__random_state'] = seed
    return expanded

# Convert the best trial's flat parameters into pipeline keys and apply them.
# NOTE(review): scikit-learn's set_params returns the estimator itself, so
# rf_model is presumably the same object as rf_pipeline - confirm before
# reusing rf_pipeline elsewhere.
best_params_expanded = expand_rf_params(rf_study.best_params, SEED)
rf_model = rf_pipeline.set_params(**best_params_expanded)


[I 2025-05-11 23:24:48,463] A new study created in RDB with name: rf_optimization_subtask1_balanced_aug_v2_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_True_NEW_full_training
[I 2025-05-11 23:28:26,500] Trial 0 finished with value: 0.735424118705003 and parameters: {'vect_max_df': 0.6982109220775343, 'vect_min_df': 3, 'vect_ngram_range': '1,2', 'tfidf_use_idf': False, 'rf_n_estimators': 240, 'rf_max_depth': 40, 'rf_min_samples_split': 13, 'rf_min_samples_leaf': 6, 'rf_max_features': None, 'rf_criterion': 'log_loss'}. Best is trial 0 with value: 0.735424118705003.
[I 2025-05-11 23:28:33,145] Trial 1 finished with value: 0.7348078416384867 and parameters: {'vect_max_df': 0.6756221701465204, 'vect_min_df': 1, 'vect_ngram_range': '1,2', 'tfidf_use_idf': True, 'rf_n_estimators': 287, 'rf_max_depth': 5, 'rf_min_samples_split': 11, 'rf_min_samples_leaf': 7, 'rf_max_features': 'sqrt', 'rf_criterion': 'log_loss'}. Best is trial 0 with value: 0.735424118705003.
[I 2025-05-11 23:31:43,08

In [None]:
if not full_training:
    # Re-score the tuned pipeline with the grouped, stratified 5-fold scheme
    # used during the search, this time collecting every metric in `scoring`.
    group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    cv_results = cross_validate(rf_model, X_train, y_train_enc, cv=cv, scoring=scoring, n_jobs=J_JOBS)

    # Refit the best model on the full training data
    rf_model.fit(X_train, y_train_enc)

    # Calculate mean metrics
    acc = np.mean(cv_results['test_accuracy'])
    pre = np.mean(cv_results['test_precision_macro'])
    rec = np.mean(cv_results['test_recall_macro'])
    f1_ = np.mean(cv_results['test_f1_macro'])

    # Best trial details
    rf_best_trial = rf_study.best_trial
    print("Best trial Cross-Validation results on the training set for Random Forest:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {pre:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1:        {f1_:.4f}")

    print("Best hyperparameters:")
    for key, value in rf_best_trial.params.items():
        print(f"  {key}: {value}")

    # Save the best RF model
    model_filename = 'objects/task_1/models/best_rf_{}_{}.pkl'.format(file_name, data_config)
    joblib.dump(rf_model, model_filename)
    print(f"Random Forest model saved as {model_filename}.")

    # Record the cross-validation ("inner") metrics through save_eval_results.
    task_val   = "task_1"
    model_name = "Random Forest"
    eval_type  = "inner"  

    save_eval_results(DB_PATH, db_file_name, task_val, model_name, eval_type, acc, pre, rec, f1_, rf_study_name, json.dumps(rf_study.best_params), language)

else: 
    # Refit the best model on the full competition training data.
    # Features and targets must come from the same frame: y_train_enc follows
    # the row order of `train` (merged and reshuffled), not of `full_data`, so
    # pairing it with full_data['lyrics_clean'] attached labels to the wrong
    # lyrics. The targets are therefore encoded directly from full_data.
    rf_model.fit(full_data['lyrics_clean'], le.transform(full_data['label']))
    
    # Best trial details
    rf_best_trial = rf_study.best_trial

    print("Best hyperparameters:")
    for key, value in rf_best_trial.params.items():
        print(f"  {key}: {value}")

    # Save the best RF model. This branch only runs in full-training mode, so
    # the suffix is always appended (the former inline conditional was redundant
    # and had the same precedence pitfall as the study name).
    full_data_config = data_config + '_full_training'
    model_filename = 'objects/task_1/models/best_rf_{}_{}.pkl'.format(file_name, full_data_config)
    joblib.dump(rf_model, model_filename)
    print(f"Random Forest model saved as {model_filename}.")
    

Best hyperparameters:
  vect_max_df: 0.6825285544981433
  vect_min_df: 2
  vect_ngram_range: 1,1
  tfidf_use_idf: True
  rf_n_estimators: 371
  rf_max_depth: 50
  rf_min_samples_split: 2
  rf_min_samples_leaf: 6
  rf_max_features: sqrt
  rf_criterion: entropy
Random Forest model saved as objects/task_1/models/best_rf_subtask1_balanced_aug_v2_lang_spa_model_ml_stem_False_lem_True_dup_False_cased_True_NEW_full_training.pkl.


### SVM

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> support vector classifier.
svm_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

def svm_objective(trial):
    """Optuna objective for the SVM pipeline.

    Samples vectorizer, TF-IDF and SVC hyperparameters from ``trial``, applies
    them to the shared ``svm_pipeline`` and returns the mean macro-F1 over
    grouped, stratified 5-fold cross-validation on the training data.
    """
    group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
    cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
    max_df = trial.suggest_float('vect_max_df', 0.5, 1.0)
    min_df = trial.suggest_int('vect_min_df', 1, 5)
    
    # Stored as a "min,max" string in the study and parsed into a tuple here.
    ngram_range_str = trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"])
    ngram_range = tuple(map(int, ngram_range_str.split(',')))
    
    use_idf = trial.suggest_categorical('tfidf_use_idf', [True, False])
    norm = trial.suggest_categorical('tfidf_norm', ['l1', 'l2', None]) 
    
    C = trial.suggest_float('svm_C', 1e-3, 10, log=True)
    kernel = trial.suggest_categorical('svm_kernel', ['linear', 'rbf', 'sigmoid', 'poly'])
    shrinking = trial.suggest_categorical('svm_shrinking', [True, False]) 
    
    params = {
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__ngram_range': ngram_range,
        'tfidf__use_idf': use_idf,
        'tfidf__norm': norm, 
        'clf__C': C,
        'clf__kernel': kernel,
        'clf__shrinking': shrinking, 
        # 'clf__class_weight': 'balanced',
        'clf__random_state': SEED  
    }
    
    # For non-linear kernels explore gamma
    if kernel != 'linear':
        gamma = trial.suggest_float('svm_gamma', 1e-4, 1, log=True)
        params['clf__gamma'] = gamma
    # degree and coef0 are only sampled for the polynomial kernel
    if kernel == 'poly':
        params['clf__degree'] = trial.suggest_int('svm_degree', 2, 5)
        params['clf__coef0'] = trial.suggest_float('svm_coef0', 0.0, 1.0)
    
    # Update pipeline with current hyperparameters
    svm_pipe = svm_pipeline.set_params(**params)
    
    # Evaluate with F1-score using cross-validation. The encoded targets are
    # used (previously the raw string labels) so scoring uses the same target
    # array as the fold split above and as every other objective in this notebook.
    scores = cross_val_score(svm_pipe, X_train, y_train_enc, cv=cv, scoring='f1_macro', n_jobs=J_JOBS)
    return scores.mean()

# Study for the SVM search; it is persisted in its own SQLite file named after
# the study, and an existing study of that name is resumed.
svm_study_name = 'svm_optimization_{}_{}'.format(file_name, data_config)
storage_name = f"sqlite:///objects/task_1/studies/{svm_study_name}.db"

svm_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    storage=storage_name,
    study_name=svm_study_name,
    load_if_exists=True,
)

# Run the search for `budget` trials.
budget = 150
svm_study.optimize(svm_objective, n_trials=budget)

# Flat parameter dict of the best trial (Optuna names, not pipeline keys).
best_params = svm_study.best_params

def expand_svm_params(optuna_params, seed):
    """Translate the flat Optuna parameter names into pipeline ``set_params`` keys.

    Args:
        optuna_params: Mapping with the ``vect_*``, ``tfidf_*`` and ``svm_*``
            entries recorded by the SVM study.
        seed: Value stored under ``clf__random_state``.

    Returns:
        Dict keyed by ``<step>__<param>``. ``clf__gamma`` is included for every
        kernel except ``'linear'``; ``clf__degree`` and ``clf__coef0`` only for
        ``'poly'``.
    """
    expanded = {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(
            int(bound) for bound in optuna_params['vect_ngram_range'].split(',')
        ),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
        'tfidf__norm': optuna_params['tfidf_norm'],
        'clf__C': optuna_params['svm_C'],
        'clf__kernel': optuna_params['svm_kernel'],
        'clf__shrinking': optuna_params['svm_shrinking'],
        'clf__random_state': seed,
    }

    # Kernel-dependent extras, copied in the order gamma, degree, coef0.
    kernel = expanded['clf__kernel']
    kernel_specific = []
    if kernel != 'linear':
        kernel_specific.append('gamma')
    if kernel == 'poly':
        kernel_specific.extend(['degree', 'coef0'])
    for name in kernel_specific:
        expanded[f'clf__{name}'] = optuna_params[f'svm_{name}']

    return expanded

# Convert the best trial's flat parameters into pipeline keys and apply them.
# NOTE(review): scikit-learn's set_params returns the estimator itself, so
# svm_model is presumably the same object as svm_pipeline - confirm.
best_params_expanded = expand_svm_params(svm_study.best_params, SEED)
svm_model = svm_pipeline.set_params(**best_params_expanded)

[I 2025-03-31 14:16:37,842] Using an existing study with name 'svm_optimization_subtask1_balanced_aug_lang_spa_model_ml_stem_False_lem_True_dup_True_cased_False_v2' instead of creating a new one.
[I 2025-03-31 14:16:44,362] Trial 70 finished with value: 0.7391784308148306 and parameters: {'vect_max_df': 0.8779230310052889, 'vect_min_df': 3, 'vect_ngram_range': '1,2', 'tfidf_use_idf': True, 'tfidf_norm': 'l2', 'svm_C': 3.1855880538611396, 'svm_kernel': 'sigmoid', 'svm_shrinking': True, 'svm_gamma': 0.031051539889644413}. Best is trial 52 with value: 0.7751173376162185.
[I 2025-03-31 14:16:48,871] Trial 71 finished with value: 0.7390781326415319 and parameters: {'vect_max_df': 0.8102638758125771, 'vect_min_df': 2, 'vect_ngram_range': '1,1', 'tfidf_use_idf': True, 'tfidf_norm': None, 'svm_C': 6.755238196248923, 'svm_kernel': 'linear', 'svm_shrinking': True}. Best is trial 52 with value: 0.7751173376162185.
[I 2025-03-31 14:16:56,284] Trial 72 finished with value: 0.772671909356629 and par

In [None]:
# Re-score the tuned SVM pipeline with the grouped, stratified 5-fold scheme
# used during the search, collecting every metric in `scoring`.
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
cv_results = cross_validate(svm_model, X_train, y_train_enc, cv=cv, scoring=scoring, n_jobs=J_JOBS)
# Refit the best model on the full training data.
# The encoded targets are used (previously the raw string labels): the outer
# evaluation compares svm_model.predict(...) with y_test_enc, which only works
# when the model was trained on the same integer encoding.
svm_model.fit(X_train, y_train_enc)

# Calculate mean metrics
acc = np.mean(cv_results['test_accuracy'])
pre = np.mean(cv_results['test_precision_macro'])
rec = np.mean(cv_results['test_recall_macro'])
f1_ = np.mean(cv_results['test_f1_macro'])

svm_best_trial = svm_study.best_trial
print("Best trial Cross-Validation results on the training set for SVM:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {pre:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1:        {f1_:.4f}")


print("Best hyperparameters:")
for key, value in svm_best_trial.params.items():
    print("  {}: {}".format(key, value))

# Save the best SVM model
model_filename = 'objects/task_1/models/best_svm_{}_{}.pkl'.format(file_name, data_config)
joblib.dump(svm_model, model_filename)
print("SVM model saved as {}.".format(model_filename))

# Record the cross-validation ("inner") metrics through save_eval_results.
task_val = "task_1"
model_name   = "SVM"
eval_type   = "inner"  

save_eval_results(DB_PATH, db_file_name,task_val, model_name, eval_type,  acc, pre, rec, f1_, svm_study_name, json.dumps(svm_study.best_params), language)

Best trial Cross-Validation results on the training set for SVM:
  Accuracy:  0.7837
  Precision: 0.7906
  Recall:    0.7838
  F1:        0.7813
Best hyperparameters:
  vect_max_df: 0.619430164505521
  vect_min_df: 1
  vect_ngram_range: 1,1
  tfidf_use_idf: True
  tfidf_norm: l2
  svm_C: 0.5479424829877481
  svm_kernel: linear
  svm_shrinking: False
SVM model saved as objects/task_1/models/best_svm_subtask1_balanced_aug_lang_spa_model_ml_stem_False_lem_True_dup_True_cased_False.pkl.

Inserted CV results into 'eval' table in tfm.db.


### XGBoost

In [None]:
# Ratio between the most and the least frequent training label, passed to
# XGBoost as scale_pos_weight.
counter = Counter(y_train)
minority, majority = min(counter.values()), max(counter.values())
scale_pos_weight = majority / minority

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> XGBoost classifier.
# `use_label_encoder` is no longer passed: the targets are already integer
# encoded with LabelEncoder, and the installed XGBoost ignores the argument
# with a "Parameters: { "use_label_encoder" } are not used." warning on every fit.
xgb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier(eval_metric='logloss'))
])

def xgb_objective(trial):
    """Optuna objective for the XGBoost pipeline.

    Samples vectorizer, TF-IDF and booster hyperparameters from ``trial``,
    applies them to the shared ``xgb_pipeline`` and returns the mean macro-F1
    over grouped, stratified 5-fold cross-validation on the training data.
    """
    splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

    # The suggest_* calls run in the order the dict entries are written.
    params = {
        'vect__max_df': trial.suggest_float('vect_max_df', 0.5, 1.0),
        'vect__min_df': trial.suggest_int('vect_min_df', 1, 5),
        # Stored as a "min,max" string in the study and parsed into a tuple here.
        'vect__ngram_range': tuple(
            int(bound)
            for bound in trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"]).split(',')
        ),
        'tfidf__use_idf': trial.suggest_categorical('tfidf_use_idf', [True, False]),
        'clf__n_estimators': trial.suggest_int('xgb_n_estimators', 50, 500),
        'clf__max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
        'clf__learning_rate': trial.suggest_float('xgb_learning_rate', 0.001, 0.4, log=True),
        'clf__subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'clf__colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'clf__reg_alpha': trial.suggest_float('xgb_reg_alpha', 1e-8, 1.0, log=True),
        'clf__reg_lambda': trial.suggest_float('xgb_reg_lambda', 1e-8, 1.0, log=True),
        # Fixed (not tuned) settings.
        'clf__scale_pos_weight': scale_pos_weight,
        'clf__random_state': SEED,
    }

    fold_scores = cross_val_score(
        xgb_pipeline.set_params(**params),
        X_train,
        y_train_enc,
        cv=splitter.split(X_train, y_train_enc, groups=groups_train),
        scoring='f1_macro',
        n_jobs=J_JOBS,
    )
    return fold_scores.mean()

# Study for the XGBoost search; it is persisted in its own SQLite file named
# after the study, and an existing study of that name is resumed.
xgb_study_name = 'xgb_optimization_{}_{}'.format(file_name, data_config)
storage_name = f"sqlite:///objects/task_1/studies/{xgb_study_name}.db"
xgb_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    storage=storage_name,
    study_name=xgb_study_name,
    load_if_exists=True,
)

# Run the search for `budget` trials.
budget = 300
xgb_study.optimize(xgb_objective, n_trials=budget)

def expand_xgb_params(optuna_params, seed):
    """Translate the flat Optuna parameter names into pipeline ``set_params`` keys.

    Args:
        optuna_params: Mapping with the ``vect_*``, ``tfidf_*`` and ``xgb_*``
            entries recorded by the XGBoost study.
        seed: Value stored under ``clf__random_state``.

    Returns:
        Dict keyed by ``<step>__<param>``; ``clf__scale_pos_weight`` is taken
        from the module-level ``scale_pos_weight``.
    """
    expanded = {
        'vect__max_df': optuna_params['vect_max_df'],
        'vect__min_df': optuna_params['vect_min_df'],
        'vect__ngram_range': tuple(
            int(bound) for bound in optuna_params['vect_ngram_range'].split(',')
        ),
        'tfidf__use_idf': optuna_params['tfidf_use_idf'],
    }
    # Booster settings map one-to-one: xgb_<name> -> clf__<name>.
    for name in (
        'n_estimators',
        'max_depth',
        'learning_rate',
        'subsample',
        'colsample_bytree',
        'reg_alpha',
        'reg_lambda',
    ):
        expanded[f'clf__{name}'] = optuna_params[f'xgb_{name}']
    expanded['clf__scale_pos_weight'] = scale_pos_weight
    expanded['clf__random_state'] = seed
    return expanded

# Convert the best trial's flat parameters into pipeline keys and apply them.
# NOTE(review): scikit-learn's set_params returns the estimator itself, so
# xgb_model is presumably the same object as xgb_pipeline - confirm.
best_params_expanded = expand_xgb_params(xgb_study.best_params, SEED)
xgb_model = xgb_pipeline.set_params(**best_params_expanded)

[I 2025-03-26 23:38:52,334] A new study created in RDB with name: xgb_optimization_lang_spa_model_ml_stem_False_lem_True_dup_False
[I 2025-03-26 23:39:03,221] Trial 0 finished with value: 0.7403614806585939 and parameters: {'vect_max_df': 0.6982109220775343, 'vect_min_df': 3, 'vect_ngram_range': '1,2', 'tfidf_use_idf': False, 'xgb_n_estimators': 196, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.19295792934091066, 'xgb_subsample': 0.8082390952536007, 'xgb_colsample_bytree': 0.9585333734781921, 'xgb_reg_alpha': 2.6476698729459474e-06, 'xgb_reg_lambda': 9.867638589330041e-08}. Best is trial 0 with value: 0.7403614806585939.
[I 2025-03-26 23:39:24,013] Trial 1 finished with value: 0.7370265238743763 and parameters: {'vect_max_df': 0.8948137608297146, 'vect_min_df': 2, 'vect_ngram_range': '1,1', 'tfidf_use_idf': False, 'xgb_n_estimators': 314, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.045438755457014, 'xgb_subsample': 0.8284408073203631, 'xgb_colsample_bytree': 0.5607244381981831, 'xgb_reg_

In [None]:
# Re-score the tuned XGBoost pipeline with the grouped, stratified 5-fold
# scheme used during the search, collecting every metric in `scoring`.
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
cv_results = cross_validate(
    xgb_model,
    X_train,
    y_train_enc,
    cv=cv,
    scoring=scoring,
    n_jobs=J_JOBS,
)

# Final fit on the whole training split.
xgb_model.fit(X_train, y_train_enc)

# Mean of each metric across the folds.
acc, pre, rec, f1_ = (
    np.mean(cv_results[fold_key])
    for fold_key in ('test_accuracy', 'test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

xgb_best_trial = xgb_study.best_trial
print("Best trial Cross-Validation results on the training set for XGBoost:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {pre:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1:        {f1_:.4f}")

print("Best hyperparameters:")
for key, value in xgb_best_trial.params.items():
    print(f"  {key}: {value}")

# Persist the fitted pipeline.
model_filename = 'objects/task_1/models/best_xgb_{}_{}.pkl'.format(file_name, data_config)
joblib.dump(xgb_model, model_filename)
print(f"XGBoost model saved as {model_filename}.")

# Record the cross-validation ("inner") metrics through save_eval_results.
task_val, model_name, eval_type = "task_1", "XGBoost", "inner"

save_eval_results(
    DB_PATH, db_file_name, task_val, model_name, eval_type,
    acc, pre, rec, f1_,
    xgb_study_name, json.dumps(xgb_study.best_params), language,
)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best trial Cross-Validation results on the training set for XGBoost:
  Accuracy:  0.8116
  Precision: 0.7993
  Recall:    0.7347
  F1:        0.7537
Best hyperparameters:
  vect_max_df: 0.7199787513631403
  vect_min_df: 5
  vect_ngram_range: 1,2
  tfidf_use_idf: True
  xgb_n_estimators: 307
  xgb_max_depth: 5
  xgb_learning_rate: 0.1197967376271997
  xgb_subsample: 0.9050577967673583
  xgb_colsample_bytree: 0.8503754664707338
  xgb_reg_alpha: 0.00015171620911956087
  xgb_reg_lambda: 1.0737553932642062e-06
XGBoost model saved as objects/task_1/models/best_xgb_lang_spa_model_ml_stem_False_lem_True_dup_False.pkl.

Inserted CV results into 'eval' table in tfm.db.


### Logistic Regression

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> logistic regression.
lr_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

def lr_objective(trial):
    """Optuna objective for the Logistic Regression pipeline.

    Samples vectorizer, TF-IDF and classifier hyperparameters from ``trial``,
    applies them to the shared ``lr_pipeline`` and returns the mean macro-F1
    over grouped, stratified 5-fold cross-validation on the training data.

    Raises:
        optuna.TrialPruned: when an ``l1``/``elasticnet`` penalty is sampled
            together with a solver other than ``saga``.
    """
    splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

    max_df = trial.suggest_float('vect_max_df', 0.5, 1.0)
    min_df = trial.suggest_int('vect_min_df', 1, 5)
    # Stored as a "min,max" string in the study and parsed into a tuple here.
    ngram_range = tuple(
        int(bound)
        for bound in trial.suggest_categorical('vect_ngram_range', ["1,1", "1,2"]).split(',')
    )
    use_idf = trial.suggest_categorical('tfidf_use_idf', [True, False])

    C = trial.suggest_float('lr_C', 1e-3, 10, log=True)
    penalty = trial.suggest_categorical('lr_penalty', ['l2', 'l1', 'elasticnet', 'none'])
    solver = trial.suggest_categorical('lr_solver', ['lbfgs', 'sag', 'saga'])

    # Skip combinations the solver cannot handle instead of letting the fit fail.
    if solver != 'saga' and penalty in ('l1', 'elasticnet'):
        raise optuna.TrialPruned("Solver must be 'saga' for l1 or elasticnet penalty.")

    params = {
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__ngram_range': ngram_range,
        'tfidf__use_idf': use_idf,
        'clf__C': C,
        # The study stores the string 'none'; the estimator receives None.
        'clf__penalty': None if penalty == 'none' else penalty,
        'clf__solver': solver,
        'clf__class_weight': 'balanced',
        'clf__random_state': SEED,
        'clf__max_iter': 1000,
    }

    # The mixing ratio is only sampled for the elastic-net penalty.
    if penalty == 'elasticnet':
        params['clf__l1_ratio'] = trial.suggest_float('lr_l1_ratio', 0.0, 1.0)

    fold_scores = cross_val_score(
        lr_pipeline.set_params(**params),
        X_train,
        y_train_enc,
        cv=splitter.split(X_train, y_train_enc, groups=groups_train),
        scoring='f1_macro',
        n_jobs=J_JOBS,
    )
    return np.mean(fold_scores)

# Study for the Logistic Regression search; it is persisted in its own SQLite
# file named after the study, and an existing study of that name is resumed.
lr_study_name = 'lr_optimization_{}_{}'.format(file_name, data_config)
storage_name = f"sqlite:///objects/task_1/studies/{lr_study_name}.db"
lr_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    storage=storage_name,
    study_name=lr_study_name,
    load_if_exists=True,
)

# Run the search for `budget` trials.
budget = 100
lr_study.optimize(lr_objective, n_trials=budget)

def expand_lr_params(optuna_params, seed):
    """Translate the flat Optuna parameter names into pipeline ``set_params`` keys.

    Args:
        optuna_params: Mapping with the ``vect_*``, ``tfidf_*`` and ``lr_*``
            entries recorded by the Logistic Regression study.
        seed: Value stored under ``clf__random_state``.

    Returns:
        Dict keyed by ``<step>__<param>``. The stored penalty ``'none'``
        becomes ``None``; ``clf__l1_ratio`` is only present for the
        ``'elasticnet'`` penalty.
    """
    ngram_bounds = optuna_params['vect_ngram_range'].split(',')
    expanded = dict([
        ('vect__max_df', optuna_params['vect_max_df']),
        ('vect__min_df', optuna_params['vect_min_df']),
        ('vect__ngram_range', tuple(int(bound) for bound in ngram_bounds)),
        ('tfidf__use_idf', optuna_params['tfidf_use_idf']),
        ('clf__C', optuna_params['lr_C']),
    ])

    stored_penalty = optuna_params['lr_penalty']
    if stored_penalty == 'none':
        expanded['clf__penalty'] = None
    else:
        expanded['clf__penalty'] = stored_penalty

    expanded.update({
        'clf__solver': optuna_params['lr_solver'],
        'clf__class_weight': 'balanced',
        'clf__random_state': seed,
        'clf__max_iter': 1000,
    })

    if stored_penalty == 'elasticnet':
        expanded['clf__l1_ratio'] = optuna_params['lr_l1_ratio']
    return expanded

# Convert the best trial's flat parameters into pipeline keys and apply them.
# NOTE(review): scikit-learn's set_params returns the estimator itself, so
# lr_model is presumably the same object as lr_pipeline - confirm.
best_params_expanded = expand_lr_params(lr_study.best_params, SEED)
lr_model = lr_pipeline.set_params(**best_params_expanded)

[I 2025-03-27 00:05:36,664] A new study created in RDB with name: lr_optimization_lang_spa_model_ml_stem_False_lem_True_dup_False
[I 2025-03-27 00:05:38,843] Trial 0 finished with value: 0.6990686696564321 and parameters: {'vect_max_df': 0.6982109220775343, 'vect_min_df': 3, 'vect_ngram_range': '1,2', 'tfidf_use_idf': False, 'lr_C': 0.019988774029355096, 'lr_penalty': 'none', 'lr_solver': 'saga'}. Best is trial 0 with value: 0.6990686696564321.
[I 2025-03-27 00:05:38,909] Trial 1 pruned. Solver must be 'saga' for l1 or elasticnet penalty.
[I 2025-03-27 00:05:42,756] Trial 2 finished with value: 0.7183755836405268 and parameters: {'vect_max_df': 0.8763273029713101, 'vect_min_df': 4, 'vect_ngram_range': '1,1', 'tfidf_use_idf': True, 'lr_C': 8.844814857004613, 'lr_penalty': 'l1', 'lr_solver': 'saga'}. Best is trial 2 with value: 0.7183755836405268.
[I 2025-03-27 00:05:45,091] Trial 3 finished with value: 0.7374380244031846 and parameters: {'vect_max_df': 0.8338890653642945, 'vect_min_df':

In [None]:
# Re-score the tuned Logistic Regression pipeline with the grouped, stratified
# 5-fold scheme used during the search, collecting every metric in `scoring`.
group_kfold = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
cv = group_kfold.split(X_train, y_train_enc, groups=groups_train)
cv_results = cross_validate(
    lr_model,
    X_train,
    y_train_enc,
    cv=cv,
    scoring=scoring,
    n_jobs=J_JOBS,
)

# Final fit on the whole training split.
lr_model.fit(X_train, y_train_enc)

# Mean of each metric across the folds.
acc, pre, rec, f1_ = (
    np.mean(cv_results[fold_key])
    for fold_key in ('test_accuracy', 'test_precision_macro', 'test_recall_macro', 'test_f1_macro')
)

lr_best_trial = lr_study.best_trial
print("Best trial Cross-Validation results on the training set for Logistic Regression:")
print(f"  Accuracy:  {acc:.4f}")
print(f"  Precision: {pre:.4f}")
print(f"  Recall:    {rec:.4f}")
print(f"  F1:        {f1_:.4f}")

print("Best hyperparameters:")
for key, value in lr_best_trial.params.items():
    print(f"  {key}: {value}")

# Persist the fitted pipeline.
model_filename = 'objects/task_1/models/best_lr_{}_{}.pkl'.format(file_name, data_config)
joblib.dump(lr_model, model_filename)
print(f"Logistic Regression model saved as {model_filename}.")

# Record the cross-validation ("inner") metrics through save_eval_results.
task_val, model_name, eval_type = "task_1", "Logistic Regression", "inner"

save_eval_results(
    DB_PATH, db_file_name, task_val, model_name, eval_type,
    acc, pre, rec, f1_,
    lr_study_name, json.dumps(lr_study.best_params), language,
)



Best trial Cross-Validation results on the training set for Logistic Regression:
  Accuracy:  0.7950
  Precision: 0.7626
  Recall:    0.7363
  F1:        0.7459
Best hyperparameters:
  vect_max_df: 0.6982522095406092
  vect_min_df: 1
  vect_ngram_range: 1,2
  tfidf_use_idf: True
  lr_C: 0.11833003575303867
  lr_penalty: none
  lr_solver: lbfgs
Logistic Regression model saved as objects/task_1/models/best_lr_lang_spa_model_ml_stem_False_lem_True_dup_False.pkl.

Inserted CV results into 'eval' table in tfm.db.


## Evaluation

### Random Forest

In [None]:
# Hold-out ("outer") evaluation of the fitted Random Forest pipeline.
y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test_enc, y_pred_rf)
# Precision, recall and F1 are all macro-averaged over the classes.
pre_rf, rec_rf, f1_rf = (
    macro_metric(y_test_enc, y_pred_rf, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

print("\nOuter evaluation results for Random Forest:")
print(f"  Accuracy:  {acc_rf:.4f}")
print(f"  Precision: {pre_rf:.4f}")
print(f"  Recall:    {rec_rf:.4f}")
print(f"  F1:        {f1_rf:.4f}")

# Record the hold-out metrics through save_eval_results.
save_eval_results(
    DB_PATH,
    db_file_name,
    task_val="task_1",
    model_name="Random Forest",
    eval_type="outer",
    acc=acc_rf,
    pre=pre_rf,
    rec=rec_rf,
    f1_=f1_rf,
    study_name=rf_study_name,
    params=json.dumps(rf_study.best_params),
    lang=language,
)


Outer evaluation results for Random Forest:
  Accuracy:  0.7682
  Precision: 0.7843
  Recall:    0.7752
  F1:        0.7673

Inserted CV results into 'eval' table in tfm.db.


### SVM

In [None]:
# Hold-out ("outer") evaluation of the fitted SVM pipeline.
y_pred_svm = svm_model.predict(X_test)
acc_svm = accuracy_score(y_test_enc, y_pred_svm)
# Precision, recall and F1 are all macro-averaged over the classes.
pre_svm, rec_svm, f1_svm = (
    macro_metric(y_test_enc, y_pred_svm, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

print("\nOuter evaluation results for SVM:")
print(f"  Accuracy:  {acc_svm:.4f}")
print(f"  Precision: {pre_svm:.4f}")
print(f"  Recall:    {rec_svm:.4f}")
print(f"  F1:        {f1_svm:.4f}")

# Record the hold-out metrics through save_eval_results.
save_eval_results(
    DB_PATH,
    db_file_name,
    task_val="task_1",
    model_name="SVM",
    eval_type="outer",
    acc=acc_svm,
    pre=pre_svm,
    rec=rec_svm,
    f1_=f1_svm,
    study_name=svm_study_name,
    params=json.dumps(svm_study.best_params),
    lang=language,
)

### XGBoost

In [None]:
# Hold-out ("outer") evaluation of the fitted XGBoost pipeline.
y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test_enc, y_pred_xgb)
# Precision, recall and F1 are all macro-averaged over the classes.
pre_xgb, rec_xgb, f1_xgb = (
    macro_metric(y_test_enc, y_pred_xgb, average='macro')
    for macro_metric in (precision_score, recall_score, f1_score)
)

print("\nOuter evaluation results for XGBoost:")
print(f"  Accuracy:  {acc_xgb:.4f}")
print(f"  Precision: {pre_xgb:.4f}")
print(f"  Recall:    {rec_xgb:.4f}")
print(f"  F1:        {f1_xgb:.4f}")

# Record the hold-out metrics through save_eval_results.
save_eval_results(
    DB_PATH,
    db_file_name,
    task_val="task_1",
    model_name="XGBoost",
    eval_type="outer",
    acc=acc_xgb,
    pre=pre_xgb,
    rec=rec_xgb,
    f1_=f1_xgb,
    study_name=xgb_study_name,
    params=json.dumps(xgb_study.best_params),
    lang=language,
)

### Logistic Regression

In [None]:
# Outer evaluation for Logistic Regression: score the fitted pipeline on the
# hold-out split, with precision, recall and F1 macro-averaged over the classes.
y_pred_lr = lr_model.predict(X_test)
acc_lr = accuracy_score(y_test_enc, y_pred_lr)
pre_lr = precision_score(y_test_enc, y_pred_lr, average='macro')
rec_lr = recall_score(y_test_enc, y_pred_lr, average='macro')
f1_lr = f1_score(y_test_enc, y_pred_lr, average='macro')

print("Outer evaluation results for Logistic Regression:")
print(f"  Accuracy:  {acc_lr:.4f}")
print(f"  Precision: {pre_lr:.4f}")
print(f"  Recall:    {rec_lr:.4f}")
print(f"  F1:        {f1_lr:.4f}")


# Record the hold-out metrics. `lang` is now passed like in the other three
# outer evaluations; it was missing here, so this row was saved without the
# dataset language (or the call failed, if the helper requires the argument).
save_eval_results(DB_PATH, db_file_name, 
                  task_val="task_1", 
                  model_name="Logistic Regression", 
                  eval_type="outer", 
                  acc=acc_lr, pre=pre_lr, rec=rec_lr, f1_=f1_lr, 
                  study_name=lr_study_name, 
                  params=json.dumps(lr_study.best_params),
                  lang=language)


Outer evaluation results for Logistic Regression:
  Accuracy:  0.7743
  Precision: 0.7332
  Recall:    0.7235
  F1:        0.7279

Inserted CV results into 'eval' table in tfm.db.

Outer evaluation results for Random Forest:
  Accuracy:  0.7981
  Precision: 0.7625
  Recall:    0.7516
  F1:        0.7565

Inserted CV results into 'eval' table in tfm.db.

Outer evaluation results for SVM:
  Accuracy:  0.7862
  Precision: 0.7478
  Recall:    0.7386
  F1:        0.7428

Inserted CV results into 'eval' table in tfm.db.

Outer evaluation results for XGBoost:
  Accuracy:  0.7838
  Precision: 0.7494
  Recall:    0.7149
  F1:        0.7271

Inserted CV results into 'eval' table in tfm.db.
