# Phishing URL Linear Model Experiments

This notebook explores various linear models using the Kaggle phishing URL dataset.

In [1]:
use_drive = False

# uncomment lines below if running on colab
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# use_drive = True
# drive_root = '/content/drive/MyDrive/fraud-grp-proj/'
# print(os.path.exists(drive_root)) # check path exists

In [2]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Optuna
import optuna

# Set seed for reproducibility
np.random.seed(42)

# Display settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [3]:
# Load train and test datasets
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

train_w_features_df = pd.read_csv('dataset/df_train_feature_engineered.csv')
test_w_features_df = pd.read_csv('dataset/df_test_feature_engineered.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

print(f"Train with features shape: {train_w_features_df.shape}")
print(f"Test with features shape: {test_w_features_df.shape}")

Train shape: (9143, 2)
Test shape: (2286, 2)
Train with features shape: (9143, 78)
Test with features shape: (2286, 78)


In [4]:
train_w_features_df.columns

Index(['url', 'target', 'is_http', 'has_subdomain', 'has_tld', 'num_subdomain',
       'is_domain_ip', 'num_hyphens_domain', 'is_punycode', 'has_path',
       'path_depth', 'has_filename', 'has_file_extension', 'has_query',
       'length_url', 'length_hostname', 'length_tld', 'length_sld',
       'length_subdomains', 'length_path', 'length_query', 'num_dots',
       'num_hyphens', 'num_at', 'num_question_marks', 'num_and', 'num_equal',
       'num_percent', 'tld_in_path', 'tld_in_subdomain',
       'subdomain_longer_sld', 'ratio_digits_url', 'ratio_digits_hostname',
       'ratio_letter_url', 'ratio_path_url', 'ratio_hostname_url',
       'length_words_url', 'avg_word_hostname', 'avg_word_path',
       'num_unique_chars_hostname', 'has_shortened_hostname',
       'entropy_hostname', 'has_www_subdomain', 'has_com_tld',
       'is_http_and_many_subdomains', 'ip_and_short_tld',
       'http_and_missing_domain_info', 'subdomain_depth_x_http', 'ip_x_http',
       'domain_complexity_score',

Following the EDA, we use the transformed features and drop the original ones since linear models require normalized and scaled inputs.

In [5]:
# Drop original versions of log transformed features
train_w_features_df.drop(columns=['length_url', 'length_path',  'ratio_hostname_url', 'length_words_url', 'avg_word_hostname', 'num_unique_chars_hostname'], inplace=True)

# Drop original versions of squared transformed features
train_w_features_df.drop(columns=['ratio_letter_url', 'entropy_hostname'], inplace=True)

# Drop original versions of is_zero transformed features
train_w_features_df.drop(columns=['num_hyphens_domain', 'length_subdomains', 'num_hyphens',  'num_at', 'num_question_marks', 'num_and', 'num_equal', 'num_percent', 'ratio_digits_url', 'ratio_digits_hostname', 'avg_word_path', 'length_query'], inplace=True)

# Drop original versions of bucketed transformed features
train_w_features_df.drop(columns=['num_subdomain', 'length_tld', 'path_depth'], inplace=True)

# Check final columns
train_w_features_df.columns

Index(['url', 'target', 'is_http', 'has_subdomain', 'has_tld', 'is_domain_ip',
       'is_punycode', 'has_path', 'has_filename', 'has_file_extension',
       'has_query', 'length_hostname', 'length_sld', 'num_dots', 'tld_in_path',
       'tld_in_subdomain', 'subdomain_longer_sld', 'ratio_path_url',
       'has_shortened_hostname', 'has_www_subdomain', 'has_com_tld',
       'is_http_and_many_subdomains', 'ip_and_short_tld',
       'http_and_missing_domain_info', 'subdomain_depth_x_http', 'ip_x_http',
       'domain_complexity_score', 'suspicion_score', 'contains_brand_misspell',
       'is_homoglyph_attack', 'homoglyph_type', 'risk_score',
       'is_zero_num_hyphens_domain', 'is_zero_length_subdomains',
       'is_zero_num_hyphens', 'is_zero_num_at', 'is_zero_num_question_marks',
       'is_zero_num_and', 'is_zero_num_equal', 'is_zero_num_percent',
       'is_zero_ratio_digits_url', 'is_zero_ratio_digits_hostname',
       'is_zero_avg_word_path', 'is_zero_length_query',
       'num_sub

## Training Models

Now lets move on to training the models. We use the saver class to help us standardize the storing of metrics and models for evaluation later on.

In [6]:
# Import ModelSaver
import sys
import os
sys.path.append(os.path.abspath('.'))
from save_model import ModelSaver

# Configuration
N_FOLDS = 5
RANDOM_STATE = 42

# Check device (not strictly needed for sklearn but good for consistency)
print(f"Running on: {sys.platform}")

Running on: darwin


In [7]:
# --- Data Preparation ---

# 1. Prepare Numeric Features
# Select numeric and boolean columns and exclude target
numeric_cols = train_w_features_df.select_dtypes(include=[np.number, bool]).columns.tolist()
if 'target' in numeric_cols:
    numeric_cols.remove('target')

print(f"Selected {len(numeric_cols)} numeric/boolean features.")

# Ensure boolean columns are converted to integers (0/1) for the model
X_numeric = train_w_features_df[numeric_cols].astype(float).values
y = train_w_features_df['target'].values

# Prepare Test Data for Numeric
X_numeric_test = test_w_features_df[numeric_cols].astype(float).values

# 2. Prepare Text Features (URLs)
X_text = train_df['url'].values
X_text_test = test_df['url'].values

# Check shapes
print(f"Numeric Train Shape: {X_numeric.shape}")
print(f"Numeric Test Shape: {X_numeric_test.shape}")
print(f"Text Train Shape: {X_text.shape}")
print(f"Text Test Shape: {X_text_test.shape}")
print(f"Target Shape: {y.shape}")

Selected 49 numeric/boolean features.
Numeric Train Shape: (9143, 49)
Numeric Test Shape: (2286, 49)
Text Train Shape: (9143,)
Text Test Shape: (2286,)
Target Shape: (9143,)


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def calculate_metrics(y_true, y_pred_proba, threshold=0.5):
    """Calculate standard metrics for binary classification."""
    y_pred = (y_pred_proba >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_pred_proba),
        'TP': int(tp), 'FP': int(fp), 'TN': int(tn), 'FN': int(fn)
    }

def run_cv_experiment(X, y, X_test, pipeline_creator, experiment_name, model_name, vectorizer_name, params, feature_names_func=None):
    """
    Run a cross-validation experiment and save results using ModelSaver.
    
    Args:
        X: Training features
        y: Training targets
        X_test: Test features
        pipeline_creator: Function that returns a fresh sklearn Pipeline
        experiment_name: Name of the experiment for saving
        model_name: Name of the model type
        vectorizer_name: Name of the vectorizer/feature set
        params: Dictionary containing 'model_params' and 'vectorizer_params'
        feature_names_func: Optional function to extract feature names from fitted pipeline
    """
    print(f"\n=== Running Experiment: {experiment_name} ===")
    
    saver = ModelSaver(base_path="experiments")
    saver.start_experiment(
        experiment_name=experiment_name,
        model_type=model_name,
        vectorizer=vectorizer_name,
        vectorizer_params=params.get('vectorizer_params', {}),
        model_params=params.get('model_params', {}),
        n_folds=N_FOLDS
    )
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        print(f"Fold {fold}/{N_FOLDS}")
        
        # Split data
                # Split data
        if hasattr(X, "iloc"): # Check if DataFrame
             X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        else:
             X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        # Create and fit pipeline
        pipeline = pipeline_creator()
        pipeline.fit(X_train_fold, y_train_fold)
        
        # Validation metrics
        val_probs = pipeline.predict_proba(X_val_fold)[:, 1]
        val_metrics = calculate_metrics(y_val_fold, val_probs)
        val_metrics['fold'] = fold
        
        print(f"  Val AUC: {val_metrics['roc_auc']:.4f}")
        
        # Test predictions (for ensemble later)
        test_probs = pipeline.predict_proba(X_test)[:, 1]
        
        # Get feature names if possible
        feature_names = None
        if feature_names_func:
            try:
                feature_names = feature_names_func(pipeline)
            except Exception as e:
                print(f"  Could not extract feature names: {e}")
            
        saver.add_fold(
            fold_model=pipeline,
            fold_metric=val_metrics,
            test_predictions=test_probs,
            feature_names=feature_names
        )
        
    saver.finalize_experiment()
    print(f"Experiment saved to {saver._exp_dir}")

### 1. Logistic Regression (Engineered Numeric Features)

We first test a simple Logistic Regression model using only the manually engineered numeric features.


In [9]:
def create_numeric_lr_pipeline():
    return Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, solver='lbfgs'))
    ])

def get_numeric_feature_names(pipeline):
    return numeric_cols

numeric_params = {
    'model_params': {'max_iter': 1000, 'solver': 'lbfgs'},
    'vectorizer_params': {'type': 'StandardScaler'}
}

run_cv_experiment(
    X=X_numeric, 
    y=y, 
    X_test=X_numeric_test,
    pipeline_creator=create_numeric_lr_pipeline,
    experiment_name="exp_1_numeric_lr",
    model_name="LogisticRegression",
    vectorizer_name="NumericFeatures",
    params=numeric_params,
    feature_names_func=get_numeric_feature_names
)


=== Running Experiment: exp_1_numeric_lr ===
Experiment 'exp_1_numeric_lr' initialized at: experiments/exp_1_numeric_lr
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9450
  Fold 1/5 saved | ROC AUC: 0.9450
Fold 2/5
  Val AUC: 0.9369
  Fold 2/5 saved | ROC AUC: 0.9369
Fold 3/5
  Val AUC: 0.9431
  Fold 3/5 saved | ROC AUC: 0.9431
Fold 4/5
  Val AUC: 0.9459
  Fold 4/5 saved | ROC AUC: 0.9459
Fold 5/5
  Val AUC: 0.9404
  Fold 5/5 saved | ROC AUC: 0.9404

Finalizing experiment...
  Predictions saved to experiments/exp_1_numeric_lr/exp_1_numeric_lr_prediction.csv

✓ Experiment 'exp_1_numeric_lr' finalized!
  Location: experiments/exp_1_numeric_lr
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9459)
  Average ROC AUC: 0.9423 ± 0.0033
Experiment saved to experiments/exp_1_numeric_lr


### 2. Logistic Regression (TF-IDF Features)

Next, we test Logistic Regression using TF-IDF features extracted directly from the URL strings. We use character n-grams to capture patterns in the URL structure.


In [10]:
def create_tfidf_lr_pipeline():
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, analyzer='char', ngram_range=(3, 5))),
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
    ])

def get_tfidf_feature_names(pipeline):
    return pipeline.named_steps['tfidf'].get_feature_names_out().tolist()

tfidf_params = {
    'model_params': {'max_iter': 1000},
    'vectorizer_params': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}
}

run_cv_experiment(
    X=X_text, 
    y=y, 
    X_test=X_text_test,
    pipeline_creator=create_tfidf_lr_pipeline,
    experiment_name="exp_1_tfidf_lr",
    model_name="LogisticRegression",
    vectorizer_name="TfidfVectorizer",
    params=tfidf_params,
    feature_names_func=get_tfidf_feature_names
)


=== Running Experiment: exp_1_tfidf_lr ===
Experiment 'exp_1_tfidf_lr' initialized at: experiments/exp_1_tfidf_lr
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9713
  Fold 1/5 saved | ROC AUC: 0.9713
Fold 2/5
  Val AUC: 0.9672
  Fold 2/5 saved | ROC AUC: 0.9672
Fold 3/5
  Val AUC: 0.9626
  Fold 3/5 saved | ROC AUC: 0.9626
Fold 4/5
  Val AUC: 0.9688
  Fold 4/5 saved | ROC AUC: 0.9688
Fold 5/5
  Val AUC: 0.9618
  Fold 5/5 saved | ROC AUC: 0.9618

Finalizing experiment...
  Predictions saved to experiments/exp_1_tfidf_lr/exp_1_tfidf_lr_prediction.csv

✓ Experiment 'exp_1_tfidf_lr' finalized!
  Location: experiments/exp_1_tfidf_lr
  Folds completed: 5
  Best fold: 1 (ROC AUC: 0.9713)
  Average ROC AUC: 0.9664 ± 0.0036
Experiment saved to experiments/exp_1_tfidf_lr


### 3. Logistic Regression (Combined Features)

Since we see that tf-idf features perform better, lets try combining both feature sets.

In [11]:
# Create combined DataFrame with both text and numeric features
X_combined_df = train_w_features_df[numeric_cols].copy()
X_combined_df['url'] = train_df['url']

X_combined_test_df = test_w_features_df[numeric_cols].copy()
X_combined_test_df['url'] = test_df['url']

# Define the preprocessor
# Note: TfidfVectorizer expects a 1D array, so we specify the column name 'url'
# but we might need a custom transformer or ensure ColumnTransformer passes it correctly.
# ColumnTransformer passes the column as a Series (which is array-like) to TfidfVectorizer.
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(max_features=5000, analyzer='char', ngram_range=(3, 5)), 'url'),
        ('scaler', StandardScaler(), numeric_cols)
    ],
    remainder='drop' # Drop any other columns if present
)

def create_combined_lr_pipeline():
    return Pipeline([
        ('preprocessor', preprocessor),
        ('clf', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))
    ])

def get_combined_feature_names(pipeline):
    # Extract feature names from the preprocessor
    tfidf_features = pipeline.named_steps['preprocessor'].named_transformers_['tfidf'].get_feature_names_out().tolist()
    # Numeric features are passed through, so their names are preserved
    return tfidf_features + numeric_cols

combined_params = {
    'model_params': {'max_iter': 1000},
    'vectorizer_params': {'tfidf': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}, 'scaler': 'StandardScaler'}
}

run_cv_experiment(
    X=X_combined_df, 
    y=y, 
    X_test=X_combined_test_df,
    pipeline_creator=create_combined_lr_pipeline,
    experiment_name="exp_1_combined_lr",
    model_name="LogisticRegression",
    vectorizer_name="CombinedFeatures",
    params=combined_params,
    feature_names_func=get_combined_feature_names
)


=== Running Experiment: exp_1_combined_lr ===
Experiment 'exp_1_combined_lr' initialized at: experiments/exp_1_combined_lr
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9805
  Fold 1/5 saved | ROC AUC: 0.9805
Fold 2/5
  Val AUC: 0.9786
  Fold 2/5 saved | ROC AUC: 0.9786
Fold 3/5
  Val AUC: 0.9781
  Fold 3/5 saved | ROC AUC: 0.9781
Fold 4/5
  Val AUC: 0.9835
  Fold 4/5 saved | ROC AUC: 0.9835
Fold 5/5
  Val AUC: 0.9772
  Fold 5/5 saved | ROC AUC: 0.9772

Finalizing experiment...
  Predictions saved to experiments/exp_1_combined_lr/exp_1_combined_lr_prediction.csv

✓ Experiment 'exp_1_combined_lr' finalized!
  Location: experiments/exp_1_combined_lr
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9835)
  Average ROC AUC: 0.9796 ± 0.0022
Experiment saved to experiments/exp_1_combined_lr


### 4. SVM (Combined Features)

Our combined features seem to perform better overall, telling us that both feature sets contribute useful information. Lets try using SVM to see if accuracy improves further.

In [12]:
def create_combined_svm_pipeline():
    return Pipeline([
        ('preprocessor', preprocessor),
        ('clf', SVC(kernel='linear', C=1.0, random_state=RANDOM_STATE, probability=True))
    ])

combined_svm_params = {
    'model_params': {'kernel': 'linear', 'C': 1.0, 'probability': True},
    'vectorizer_params': {'tfidf': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}, 'scaler': 'StandardScaler'}
}

run_cv_experiment(
    X=X_combined_df, 
    y=y, 
    X_test=X_combined_test_df,
    pipeline_creator=create_combined_svm_pipeline,
    experiment_name="exp_1_combined_svm",
    model_name="SVM",
    vectorizer_name="CombinedFeatures",
    params=combined_svm_params,
    feature_names_func=get_combined_feature_names
)


=== Running Experiment: exp_1_combined_svm ===
Experiment 'exp_1_combined_svm' initialized at: experiments/exp_1_combined_svm
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9842
  Fold 1/5 saved | ROC AUC: 0.9842
Fold 2/5
  Val AUC: 0.9831
  Fold 2/5 saved | ROC AUC: 0.9831
Fold 3/5
  Val AUC: 0.9830
  Fold 3/5 saved | ROC AUC: 0.9830
Fold 4/5
  Val AUC: 0.9865
  Fold 4/5 saved | ROC AUC: 0.9865
Fold 5/5
  Val AUC: 0.9821
  Fold 5/5 saved | ROC AUC: 0.9821

Finalizing experiment...
  Predictions saved to experiments/exp_1_combined_svm/exp_1_combined_svm_prediction.csv

✓ Experiment 'exp_1_combined_svm' finalized!
  Location: experiments/exp_1_combined_svm
  Folds completed: 5
  Best fold: 4 (ROC AUC: 0.9865)
  Average ROC AUC: 0.9838 ± 0.0015
Experiment saved to experiments/exp_1_combined_svm


### 5. Optuna Hyperparameter Tuning

We will use Optuna to tune the hyperparameters of our best model (SVM) to see if we can improve performance further.

In [16]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.exceptions import TrialPruned

In [14]:
def objective(trial):
    C = trial.suggest_float("C", 1e-4, 1e3, log=True)
    loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])
    dual = True if loss == "hinge" else trial.suggest_categorical("dual", [True, False])
    tol = trial.suggest_float("tol", 1e-5, 1e-2, log=True)
    max_iter = trial.suggest_int("max_iter", 2000, 20000)

    calibration_method = trial.suggest_categorical("calibration_method", ["sigmoid", "isotonic"])
    max_features = trial.suggest_categorical("max_features", [1000, 5000, 10000])

    ngram_min = trial.suggest_int("ngram_min", 1, 3)
    ngram_max = trial.suggest_int("ngram_max", ngram_min, 5)
    ngram_range = (ngram_min, ngram_max)
    
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    fold_aucs = []

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_combined_df, y)):
        X_train_fold, X_val_fold = X_combined_df.iloc[train_idx], X_combined_df.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        preprocessor = ColumnTransformer(
            transformers=[
                ('tfidf', TfidfVectorizer(max_features=max_features, analyzer='char', ngram_range=ngram_range), 'url'),
                ('scaler', StandardScaler(), numeric_cols)
            ],
            remainder='drop'
        )

        base_estimator = LinearSVC(
            C=C,
            class_weight='balanced',
            loss=loss,
            dual=dual,
            tol=tol,
            max_iter=max_iter,
            random_state=RANDOM_STATE
        )

        calibrated_clf = CalibratedClassifierCV(
            estimator=base_estimator,
            cv=3,
            method=calibration_method,
            n_jobs=-1
        )

        model = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', calibrated_clf)
        ])

        try:
            model.fit(X_train_fold, y_train_fold)
            val_probs = model.predict_proba(X_val_fold)[:, 1]
            fold_auc = roc_auc_score(y_val_fold, val_probs)
        except ValueError as exc:
            raise TrialPruned() from exc

        fold_aucs.append(fold_auc)
        trial.report(fold_auc, step=fold_idx)

        if trial.should_prune():
            raise TrialPruned()

    return float(np.mean(fold_aucs))

In [17]:
sampler = TPESampler(seed=42)
pruner = MedianPruner(n_startup_trials=10, n_warmup_steps=5)
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=60, show_progress_bar=True)

print('Number of finished trials:', len(study.trials))
print('Best AUC:', study.best_value)
print('Best params:', study.best_params)

[I 2025-11-21 13:44:35,172] A new study created in memory with name: no-name-9ecf2d18-2450-45ad-90df-91297b2e4164
Best trial: 0. Best value: 0.962372:   2%|▏         | 1/60 [00:10<10:14, 10.41s/it]

[I 2025-11-21 13:44:45,595] Trial 0 finished with value: 0.9623722305500246 and parameters: {'C': 0.041858227295469716, 'loss': 'hinge', 'tol': 0.0006251373574521745, 'max_iter': 4808, 'calibration_method': 'sigmoid', 'max_features': 1000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 0 with value: 0.9623722305500246.


Best trial: 1. Best value: 0.971419:   3%|▎         | 2/60 [00:22<10:45, 11.12s/it]

[I 2025-11-21 13:44:57,219] Trial 1 finished with value: 0.9714188296211186 and parameters: {'C': 67.1581131106993, 'loss': 'hinge', 'tol': 3.5498788321965036e-05, 'max_iter': 7476, 'calibration_method': 'sigmoid', 'max_features': 5000, 'ngram_min': 1, 'ngram_max': 2}. Best is trial 1 with value: 0.9714188296211186.


Best trial: 2. Best value: 0.974132:   5%|▌         | 3/60 [00:29<08:51,  9.32s/it]

[I 2025-11-21 13:45:04,387] Trial 2 finished with value: 0.9741324233601253 and parameters: {'C': 0.15577217702693025, 'loss': 'hinge', 'tol': 0.0003489018845491386, 'max_iter': 12664, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 3, 'ngram_max': 5}. Best is trial 2 with value: 0.9741324233601253.


Best trial: 2. Best value: 0.974132:   7%|▋         | 4/60 [00:31<05:58,  6.40s/it]

[I 2025-11-21 13:45:06,324] Trial 3 finished with value: 0.9628135340389076 and parameters: {'C': 0.013561145768453494, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0003058656666978527, 'max_iter': 2619, 'calibration_method': 'sigmoid', 'max_features': 1000, 'ngram_min': 2, 'ngram_max': 2}. Best is trial 2 with value: 0.9741324233601253.


Best trial: 4. Best value: 0.983491:   8%|▊         | 5/60 [00:38<06:12,  6.78s/it]

[I 2025-11-21 13:45:13,773] Trial 4 finished with value: 0.9834907885837041 and parameters: {'C': 612.4806805925967, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00582938454299474, 'max_iter': 3592, 'calibration_method': 'sigmoid', 'max_features': 5000, 'ngram_min': 3, 'ngram_max': 4}. Best is trial 4 with value: 0.9834907885837041.


Best trial: 4. Best value: 0.983491:  10%|█         | 6/60 [00:43<05:38,  6.27s/it]

[I 2025-11-21 13:45:19,047] Trial 5 finished with value: 0.9398445029191616 and parameters: {'C': 0.009258519973443782, 'loss': 'hinge', 'tol': 0.002550298070162891, 'max_iter': 3341, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 3, 'ngram_max': 5}. Best is trial 4 with value: 0.9834907885837041.


Best trial: 6. Best value: 0.983807:  12%|█▏        | 7/60 [00:48<05:07,  5.80s/it]

[I 2025-11-21 13:45:23,875] Trial 6 finished with value: 0.9838071134197488 and parameters: {'C': 25.054885755573522, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0007411299781083245, 'max_iter': 7956, 'calibration_method': 'isotonic', 'max_features': 5000, 'ngram_min': 3, 'ngram_max': 4}. Best is trial 6 with value: 0.9838071134197488.


Best trial: 6. Best value: 0.983807:  13%|█▎        | 8/60 [00:51<04:19,  4.99s/it]

[I 2025-11-21 13:45:27,124] Trial 7 finished with value: 0.9226557839118581 and parameters: {'C': 0.0006873211713642718, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.00030296104428212476, 'max_iter': 11409, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 3}. Best is trial 6 with value: 0.9838071134197488.


Best trial: 8. Best value: 0.984847:  15%|█▌        | 9/60 [01:03<05:56,  7.00s/it]

[I 2025-11-21 13:45:38,543] Trial 8 finished with value: 0.9848471192628596 and parameters: {'C': 225.40591970426053, 'loss': 'squared_hinge', 'dual': True, 'tol': 1.7019223026554023e-05, 'max_iter': 7215, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 3, 'ngram_max': 3}. Best is trial 8 with value: 0.9848471192628596.


Best trial: 8. Best value: 0.984847:  17%|█▋        | 10/60 [01:10<05:56,  7.13s/it]

[I 2025-11-21 13:45:45,971] Trial 9 finished with value: 0.9670516859946948 and parameters: {'C': 176.97543538533165, 'loss': 'squared_hinge', 'dual': True, 'tol': 2.1387290754148914e-05, 'max_iter': 6103, 'calibration_method': 'isotonic', 'max_features': 1000, 'ngram_min': 2, 'ngram_max': 2}. Best is trial 8 with value: 0.9848471192628596.


Best trial: 10. Best value: 0.985846:  18%|█▊        | 11/60 [01:38<10:58, 13.44s/it]

[I 2025-11-21 13:46:13,714] Trial 10 finished with value: 0.9858455856948127 and parameters: {'C': 3.7773311371434097, 'loss': 'squared_hinge', 'dual': True, 'tol': 5.608877985158643e-05, 'max_iter': 17869, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 10 with value: 0.9858455856948127.


Best trial: 11. Best value: 0.985916:  20%|██        | 12/60 [02:04<13:47, 17.24s/it]

[I 2025-11-21 13:46:39,665] Trial 11 finished with value: 0.9859155865925253 and parameters: {'C': 3.2061892701275747, 'loss': 'squared_hinge', 'dual': True, 'tol': 6.057866047070487e-05, 'max_iter': 18030, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 11 with value: 0.9859155865925253.


Best trial: 11. Best value: 0.985916:  22%|██▏       | 13/60 [02:30<15:39, 19.99s/it]

[I 2025-11-21 13:47:05,971] Trial 12 finished with value: 0.985787838532229 and parameters: {'C': 5.152676708214511, 'loss': 'squared_hinge', 'dual': True, 'tol': 9.167893082962411e-05, 'max_iter': 18934, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 11 with value: 0.9859155865925253.


Best trial: 13. Best value: 0.985918:  23%|██▎       | 14/60 [02:57<16:58, 22.13s/it]

[I 2025-11-21 13:47:33,053] Trial 13 finished with value: 0.9859177501553074 and parameters: {'C': 3.2625843757071396, 'loss': 'squared_hinge', 'dual': True, 'tol': 9.343238305157978e-05, 'max_iter': 19469, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 13 with value: 0.9859177501553074.


Best trial: 14. Best value: 0.986695:  25%|██▌       | 15/60 [03:22<17:09, 22.88s/it]

[I 2025-11-21 13:47:57,657] Trial 14 finished with value: 0.9866954836012851 and parameters: {'C': 1.4730539661191957, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00011924851435769837, 'max_iter': 15395, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  27%|██▋       | 16/60 [03:28<12:57, 17.67s/it]

[I 2025-11-21 13:48:03,220] Trial 15 finished with value: 0.9840245332948607 and parameters: {'C': 0.24983828920585266, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.000132413759795664, 'max_iter': 15254, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  28%|██▊       | 17/60 [03:52<14:04, 19.65s/it]

[I 2025-11-21 13:48:27,486] Trial 16 finished with value: 0.9642110167601677 and parameters: {'C': 1.211768042809346, 'loss': 'squared_hinge', 'dual': True, 'tol': 1.0880706798698207e-05, 'max_iter': 14987, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 1}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  30%|███       | 18/60 [03:56<10:27, 14.95s/it]

[I 2025-11-21 13:48:31,483] Trial 17 finished with value: 0.9179341677741549 and parameters: {'C': 0.0004842325080605757, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00017222546879656664, 'max_iter': 16075, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  32%|███▏      | 19/60 [04:19<11:50, 17.33s/it]

[I 2025-11-21 13:48:54,366] Trial 18 finished with value: 0.9833689419816922 and parameters: {'C': 26.01641196134867, 'loss': 'hinge', 'tol': 0.002049165100252008, 'max_iter': 13281, 'calibration_method': 'isotonic', 'max_features': 5000, 'ngram_min': 2, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  33%|███▎      | 20/60 [04:20<08:25, 12.63s/it]

[I 2025-11-21 13:48:56,036] Trial 19 finished with value: 0.9640918419711137 and parameters: {'C': 0.5574885393164987, 'loss': 'squared_hinge', 'dual': False, 'tol': 4.018748820180411e-05, 'max_iter': 19843, 'calibration_method': 'isotonic', 'max_features': 1000, 'ngram_min': 1, 'ngram_max': 1}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  35%|███▌      | 21/60 [04:27<07:00, 10.78s/it]

[I 2025-11-21 13:49:02,499] Trial 20 finished with value: 0.97735263472574 and parameters: {'C': 0.0670138108481491, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00013895088453642616, 'max_iter': 16731, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  37%|███▋      | 22/60 [04:53<09:42, 15.32s/it]

[I 2025-11-21 13:49:28,412] Trial 21 finished with value: 0.9859281393906917 and parameters: {'C': 3.090632945563457, 'loss': 'squared_hinge', 'dual': True, 'tol': 7.04653506056624e-05, 'max_iter': 17735, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  38%|███▊      | 23/60 [05:21<11:45, 19.06s/it]

[I 2025-11-21 13:49:56,186] Trial 22 finished with value: 0.9857245657679259 and parameters: {'C': 6.67383323070204, 'loss': 'squared_hinge', 'dual': True, 'tol': 8.766258624965795e-05, 'max_iter': 19811, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  40%|████      | 24/60 [05:38<11:08, 18.57s/it]

[I 2025-11-21 13:50:13,608] Trial 23 finished with value: 0.9855897345186276 and parameters: {'C': 0.9066380070156199, 'loss': 'squared_hinge', 'dual': True, 'tol': 3.0340987779059696e-05, 'max_iter': 14087, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 3}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  42%|████▏     | 25/60 [06:22<15:18, 26.23s/it]

[I 2025-11-21 13:50:57,714] Trial 24 finished with value: 0.9864362506317805 and parameters: {'C': 18.097465478765297, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00019497902594329324, 'max_iter': 17475, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  43%|████▎     | 26/60 [07:04<17:29, 30.87s/it]

[I 2025-11-21 13:51:39,406] Trial 25 finished with value: 0.9862689018371235 and parameters: {'C': 15.49227217352539, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0006503799306652009, 'max_iter': 16396, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  45%|████▌     | 27/60 [07:35<17:00, 30.94s/it]

[I 2025-11-21 13:52:10,507] Trial 26 finished with value: 0.9859499155231953 and parameters: {'C': 33.23709801443365, 'loss': 'hinge', 'tol': 0.0009206629096774215, 'max_iter': 11471, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  47%|████▋     | 28/60 [08:03<15:59, 29.98s/it]

[I 2025-11-21 13:52:38,252] Trial 27 finished with value: 0.9862583813855421 and parameters: {'C': 14.464640334788516, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.001123062745161503, 'max_iter': 9632, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  48%|████▊     | 29/60 [08:12<12:14, 23.68s/it]

[I 2025-11-21 13:52:47,232] Trial 28 finished with value: 0.9834630298822061 and parameters: {'C': 82.31331243526326, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0004043456480494999, 'max_iter': 16634, 'calibration_method': 'isotonic', 'max_features': 5000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  50%|█████     | 30/60 [08:35<11:50, 23.68s/it]

[I 2025-11-21 13:53:10,903] Trial 29 finished with value: 0.9705116984276584 and parameters: {'C': 578.8953707224678, 'loss': 'hinge', 'tol': 0.00022842425673853342, 'max_iter': 14400, 'calibration_method': 'sigmoid', 'max_features': 1000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  52%|█████▏    | 31/60 [08:51<10:21, 21.45s/it]

[I 2025-11-21 13:53:27,144] Trial 30 finished with value: 0.97552597495924 and parameters: {'C': 9.486924603729571, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0006203588670536787, 'max_iter': 12637, 'calibration_method': 'isotonic', 'max_features': 1000, 'ngram_min': 3, 'ngram_max': 4}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  53%|█████▎    | 32/60 [09:21<11:07, 23.83s/it]

[I 2025-11-21 13:53:56,522] Trial 31 finished with value: 0.9863013274134028 and parameters: {'C': 12.630295024147996, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0013355949888340947, 'max_iter': 9394, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 14. Best value: 0.986695:  55%|█████▌    | 33/60 [09:47<11:05, 24.64s/it]

[I 2025-11-21 13:54:23,051] Trial 32 finished with value: 0.9860673562437599 and parameters: {'C': 55.69002111470547, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00268364839439926, 'max_iter': 9495, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 14 with value: 0.9866954836012851.


Best trial: 33. Best value: 0.986833:  57%|█████▋    | 34/60 [10:09<10:15, 23.68s/it]

[I 2025-11-21 13:54:44,483] Trial 33 finished with value: 0.9868330917149937 and parameters: {'C': 1.2212085716423664, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0014476236536116484, 'max_iter': 10036, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  58%|█████▊    | 35/60 [10:24<08:47, 21.12s/it]

[I 2025-11-21 13:54:59,639] Trial 34 finished with value: 0.9857956627068155 and parameters: {'C': 1.2336783232022004, 'loss': 'hinge', 'tol': 0.0014969182143268714, 'max_iter': 9459, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 4}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  60%|██████    | 36/60 [10:33<07:02, 17.62s/it]

[I 2025-11-21 13:55:09,091] Trial 35 finished with value: 0.9824775212786753 and parameters: {'C': 0.151603718484982, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0035763026520834475, 'max_iter': 10098, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  62%|██████▏   | 37/60 [10:38<05:16, 13.76s/it]

[I 2025-11-21 13:55:13,851] Trial 36 finished with value: 0.9691512473244259 and parameters: {'C': 0.02236368173261331, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.006282340164159145, 'max_iter': 8171, 'calibration_method': 'sigmoid', 'max_features': 5000, 'ngram_min': 1, 'ngram_max': 4}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  63%|██████▎   | 38/60 [10:45<04:17, 11.71s/it]

[I 2025-11-21 13:55:20,765] Trial 37 finished with value: 0.9822983133132478 and parameters: {'C': 0.47619400267080764, 'loss': 'hinge', 'tol': 0.0004592239726508282, 'max_iter': 5884, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 3, 'ngram_max': 4}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  65%|██████▌   | 39/60 [10:52<03:33, 10.15s/it]

[I 2025-11-21 13:55:27,284] Trial 38 finished with value: 0.9524286579628596 and parameters: {'C': 0.005769633814788766, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.008821901196082149, 'max_iter': 12399, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  67%|██████▋   | 40/60 [10:56<02:48,  8.44s/it]

[I 2025-11-21 13:55:31,732] Trial 39 finished with value: 0.9790515585985503 and parameters: {'C': 0.09112295206943266, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0017519334106024032, 'max_iter': 11185, 'calibration_method': 'sigmoid', 'max_features': 5000, 'ngram_min': 3, 'ngram_max': 4}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  68%|██████▊   | 41/60 [11:02<02:24,  7.60s/it]

[I 2025-11-21 13:55:37,370] Trial 40 finished with value: 0.9057478805182722 and parameters: {'C': 0.00016358479354739344, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.00022848972412995656, 'max_iter': 8824, 'calibration_method': 'isotonic', 'max_features': 1000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 33. Best value: 0.986833:  70%|███████   | 42/60 [11:40<05:05, 16.96s/it]

[I 2025-11-21 13:56:16,172] Trial 41 finished with value: 0.9862528727909072 and parameters: {'C': 16.15715430417773, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.000557650766223741, 'max_iter': 15852, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 33 with value: 0.9868330917149937.


Best trial: 42. Best value: 0.986836:  72%|███████▏  | 43/60 [12:06<05:31, 19.51s/it]

[I 2025-11-21 13:56:41,621] Trial 42 finished with value: 0.9868360018887806 and parameters: {'C': 1.8027673163476472, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0008877274496626124, 'max_iter': 13648, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 42 with value: 0.9868360018887806.


Best trial: 43. Best value: 0.986864:  73%|███████▎  | 44/60 [12:28<05:25, 20.35s/it]

[I 2025-11-21 13:57:03,944] Trial 43 finished with value: 0.9868636807878058 and parameters: {'C': 2.085703671056578, 'loss': 'squared_hinge', 'dual': True, 'tol': 0.0014127248170917443, 'max_iter': 11798, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 43 with value: 0.9868636807878058.


Best trial: 43. Best value: 0.986864:  75%|███████▌  | 45/60 [12:36<04:07, 16.51s/it]

[I 2025-11-21 13:57:11,490] Trial 44 finished with value: 0.9867717055596448 and parameters: {'C': 2.091542085988045, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.004074679838147088, 'max_iter': 13652, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 2, 'ngram_max': 5}. Best is trial 43 with value: 0.9868636807878058.


Best trial: 43. Best value: 0.986864:  77%|███████▋  | 46/60 [12:44<03:16, 14.02s/it]

[I 2025-11-21 13:57:19,687] Trial 45 finished with value: 0.986776382246205 and parameters: {'C': 1.8557872508515758, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.004111247296789184, 'max_iter': 13942, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 43 with value: 0.9868636807878058.


Best trial: 43. Best value: 0.986864:  78%|███████▊  | 47/60 [12:52<02:37, 12.14s/it]

[I 2025-11-21 13:57:27,444] Trial 46 finished with value: 0.9846034841445699 and parameters: {'C': 0.29353284230952337, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0038823478370385842, 'max_iter': 13806, 'calibration_method': 'isotonic', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 43 with value: 0.9868636807878058.


Best trial: 47. Best value: 0.986995:  80%|████████  | 48/60 [13:00<02:10, 10.88s/it]

[I 2025-11-21 13:57:35,385] Trial 47 finished with value: 0.9869945650852877 and parameters: {'C': 1.629767939304725, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0037180076148703744, 'max_iter': 12155, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 47 with value: 0.9869945650852877.


Best trial: 47. Best value: 0.986995:  82%|████████▏ | 49/60 [13:11<02:02, 11.09s/it]

[I 2025-11-21 13:57:46,983] Trial 48 finished with value: 0.9840520990313628 and parameters: {'C': 0.6480691689694363, 'loss': 'hinge', 'tol': 0.0026226625331725534, 'max_iter': 12080, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 47 with value: 0.9869945650852877.


Best trial: 47. Best value: 0.986995:  83%|████████▎ | 50/60 [13:19<01:39,  9.98s/it]

[I 2025-11-21 13:57:54,359] Trial 49 finished with value: 0.982993868789088 and parameters: {'C': 0.16899291526873106, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.005656175301718103, 'max_iter': 10201, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 47 with value: 0.9869945650852877.


Best trial: 50. Best value: 0.987061:  85%|████████▌ | 51/60 [13:27<01:26,  9.63s/it]

[I 2025-11-21 13:58:03,168] Trial 50 finished with value: 0.9870613173682339 and parameters: {'C': 2.266814688078125, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.002146602798983493, 'max_iter': 10783, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  87%|████████▋ | 52/60 [13:36<01:15,  9.42s/it]

[I 2025-11-21 13:58:12,095] Trial 51 finished with value: 0.9868938193037053 and parameters: {'C': 5.718285806450285, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0008865954402498931, 'max_iter': 10792, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  88%|████████▊ | 53/60 [13:46<01:05,  9.39s/it]

[I 2025-11-21 13:58:21,426] Trial 52 finished with value: 0.9868993139002015 and parameters: {'C': 5.779226347044324, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0009313326554012446, 'max_iter': 11821, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  90%|█████████ | 54/60 [13:54<00:54,  9.11s/it]

[I 2025-11-21 13:58:29,870] Trial 53 finished with value: 0.9868830522494001 and parameters: {'C': 5.729577890073078, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0008841563259272664, 'max_iter': 11833, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  92%|█████████▏| 55/60 [13:57<00:36,  7.28s/it]

[I 2025-11-21 13:58:32,896] Trial 54 finished with value: 0.9776921284436761 and parameters: {'C': 3.7759153121866227, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0020407808401978556, 'max_iter': 10786, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 2}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  93%|█████████▎| 56/60 [14:05<00:30,  7.56s/it]

[I 2025-11-21 13:58:41,098] Trial 55 finished with value: 0.9868318549074871 and parameters: {'C': 6.434553164911832, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0008405832794756759, 'max_iter': 11864, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  95%|█████████▌| 57/60 [14:13<00:23,  7.68s/it]

[I 2025-11-21 13:58:49,074] Trial 56 finished with value: 0.9846788336395003 and parameters: {'C': 6.394219462901264, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0011023437825333335, 'max_iter': 10705, 'calibration_method': 'sigmoid', 'max_features': 5000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  97%|█████████▋| 58/60 [14:17<00:12,  6.48s/it]

[I 2025-11-21 13:58:52,756] Trial 57 finished with value: 0.9725034608762619 and parameters: {'C': 79.81100879691037, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0021145969183372785, 'max_iter': 12887, 'calibration_method': 'sigmoid', 'max_features': 1000, 'ngram_min': 1, 'ngram_max': 2}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061:  98%|█████████▊| 59/60 [14:26<00:07,  7.13s/it]

[I 2025-11-21 13:59:01,403] Trial 58 finished with value: 0.9865633844746828 and parameters: {'C': 39.98643102586809, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.00307124181016593, 'max_iter': 11572, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.


Best trial: 50. Best value: 0.987061: 100%|██████████| 60/60 [14:34<00:00, 14.58s/it]

[I 2025-11-21 13:59:09,873] Trial 59 finished with value: 0.9864456711174728 and parameters: {'C': 209.31059102822923, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.0011506256628183071, 'max_iter': 7202, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}. Best is trial 50 with value: 0.9870613173682339.
Number of finished trials: 60
Best AUC: 0.9870613173682339
Best params: {'C': 2.266814688078125, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.002146602798983493, 'max_iter': 10783, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}





In [24]:
best_params = {'C': 2.266814688078125, 'loss': 'squared_hinge', 'dual': False, 'tol': 0.002146602798983493, 'max_iter': 10783, 'calibration_method': 'sigmoid', 'max_features': 10000, 'ngram_min': 1, 'ngram_max': 5}

In [None]:
best_params = study.best_params.copy()
best_params['random_state'] = RANDOM_STATE
best_params['dual'] = best_params.get('dual', True if best_params['loss'] == 'hinge' else True)
best_params['max_iter'] = int(best_params['max_iter'])
calibration_method = best_params.pop('calibration_method') # 'sigmoid'

def create_tuned_linear_svc_pipeline():
    preprocessor = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(
                max_features=best_params.pop('max_features'), # 10000
                analyzer='char',
                ngram_range=(best_params.pop('ngram_min'), best_params.pop('ngram_max')) # (1,5)
            ), 'url'),
            ('scaler', StandardScaler(), numeric_cols)
        ],
        remainder='drop'
    )
    base_estimator = LinearSVC(
        C=best_params['C'],
        class_weight='balanced',
        loss=best_params['loss'],
        dual=best_params['dual'],
        tol=best_params['tol'],
        max_iter=best_params['max_iter'],
        random_state=best_params['random_state']
    )
    calibrated_clf = CalibratedClassifierCV(
        estimator=base_estimator,
        cv=3,
        method=calibration_method,
        n_jobs=-1
    )
    return Pipeline([
        ('preprocessor', preprocessor),
        ('clf', calibrated_clf)
    ])

tuned_linear_svc_params = {
    'model_params': {**best_params, 'calibration_method': calibration_method, 'calibration_cv': 3},
    'vectorizer_params': {'tfidf': {'max_features': 5000, 'analyzer': 'char', 'ngram_range': (3, 5)}, 'scaler': 'StandardScaler'}
}

print("Running final experiment with tuned LinearSVC parameters...")
run_cv_experiment(
    X=X_combined_df, 
    y=y, 
    X_test=X_combined_test_df,
    pipeline_creator=create_tuned_linear_svc_pipeline,
    experiment_name="exp_1_combined_linear_svc_optuna",
    model_name="LinearSVC",
    vectorizer_name="CombinedFeatures",
    params=tuned_linear_svc_params,
    feature_names_func=get_combined_feature_names
)

Running final experiment with tuned LinearSVC parameters...

=== Running Experiment: exp_1_combined_linear_svc_optuna ===
Experiment 'exp_1_combined_linear_svc_optuna' initialized at: experiments/exp_1_combined_linear_svc_optuna
Mode: Incremental saving (5 folds)
Fold 1/5
  Val AUC: 0.9886
  Fold 1/5 saved | ROC AUC: 0.9886
Fold 2/5
  Val AUC: 0.9886
  Fold 1/5 saved | ROC AUC: 0.9886
Fold 2/5
  Val AUC: 0.9850
  Fold 2/5 saved | ROC AUC: 0.9850
Fold 3/5
  Val AUC: 0.9850
  Fold 2/5 saved | ROC AUC: 0.9850
Fold 3/5
  Val AUC: 0.9866
  Fold 3/5 saved | ROC AUC: 0.9866
Fold 4/5
  Val AUC: 0.9866
  Fold 3/5 saved | ROC AUC: 0.9866
Fold 4/5
  Val AUC: 0.9904
  Val AUC: 0.9904
  Fold 4/5 saved | ROC AUC: 0.9904
Fold 5/5
  Fold 4/5 saved | ROC AUC: 0.9904
Fold 5/5
  Val AUC: 0.9848
  Val AUC: 0.9848
  Fold 5/5 saved | ROC AUC: 0.9848

Finalizing experiment...
  Predictions saved to experiments/exp_1_combined_linear_svc_optuna/exp_1_combined_linear_svc_optuna_prediction.csv

✓ Experiment 'exp