# Loading processed datasets

In [1]:
import pandas as pd
import numpy as np

import sys
import os
import gc
import logging

sys.path.append(os.path.abspath('..'))

import utils

## Logistic Regression Approach

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

In [3]:
NUM_DATASETS = 3   # how many datasets (dataset_001, dataset_002, ...) are processed
N_ITERATIONS = 15  # parameter settings sampled by RandomizedSearchCV per dataset

# --- Setup logging ---
# The root logger is configured to emit every INFO message both to stdout and
# to 'process_log.txt' (the file is truncated on each run because of mode='w').
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# The root logger outlives this cell. Without this cleanup every re-run (or any
# other cell performing the same setup) stacks another pair of handlers on it,
# which duplicates each message and leaves several open handles on one file.
# Only the handlers this setup installs are removed; foreign handlers stay.
_log_path = os.path.abspath('process_log.txt')
for _stale in list(logger.handlers):
    _is_log_file = (
        isinstance(_stale, logging.FileHandler)
        and _stale.baseFilename == _log_path
    )
    _is_stdout = (
        type(_stale) is logging.StreamHandler
        and _stale.stream is sys.stdout
    )
    if _is_log_file or _is_stdout:
        logger.removeHandler(_stale)
        _stale.close()  # releases the file handle; stdout itself is not closed

# Console handler (stdout)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

# File handler (write logs to file)
file_handler = logging.FileHandler('process_log.txt', mode='w')
file_handler.setLevel(logging.INFO)

# Log format with timestamps
formatter = logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Add handlers to logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# --- Dataset IDs ---
# Datasets are numbered from 1 and addressed by a zero-padded, three-digit id.
all_indices = [*range(1, NUM_DATASETS + 1)]
config_ids = [f'dataset_{str(i).zfill(3)}' for i in all_indices]
logger.info(f"Starting processing for all {len(all_indices)} datasets...")

# --- Hyperparameter search space ---
# Candidate values sampled by the randomized search (27 combinations in total).
param_grid = {
    'C': [0.5, 1, 5],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'max_iter': [600, 700, 800],
}

# Seeded, shuffled 5-fold stratified splitter and the F1 scorer used for tuning.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
f1_scorer = make_scorer(f1_score)

# --- Results tracking ---
# `results` maps each dataset id to its scores; `best_overall` remembers the
# dataset with the highest test F1 seen so far (-1 means "nothing recorded").
results = {}
best_overall = dict.fromkeys(
    ('dataset_id', 'dataset_num', 'dataset_name', 'test_f1', 'best_params')
)
best_overall['test_f1'] = -1

# --- Process datasets one-by-one ---
# For each dataset id: load the pre-split data, tune LogisticRegression with a
# randomized search scored by cross-validated F1, evaluate the refitted best
# model on the test split, and record the outcome in `results`.
for idx, cid in enumerate(config_ids, 1):
    logger.info(f"\n=== Loading Dataset {idx}/{NUM_DATASETS}: {cid} ===")
    X_train, X_test, y_train, y_test, vectorizer, config = utils.load_processed(config_id=cid)

    dataset_num = all_indices[idx - 1]
    # Fall back to the id when the config carries no human-readable name.
    dataset_name = config.get('dataset_name', cid)
    logger.info(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")

    # Model & Search
    # Fixed seed: the 'liblinear' and 'saga' solvers in the search space shuffle
    # the data, so an unseeded estimator yields different scores on every run
    # even though the CV splitter and the search itself are already seeded.
    lr = LogisticRegression(random_state=42)
    random_search = RandomizedSearchCV(
        estimator=lr,
        param_distributions=param_grid,
        n_iter=N_ITERATIONS,
        scoring=f1_scorer,
        cv=cv,
        n_jobs=8,
        random_state=42,
        refit=True  # best_estimator_ is retrained on the full training split
    )

    logger.info(f"Starting RandomizedSearchCV ({N_ITERATIONS} iterations)...")
    random_search.fit(X_train, y_train)
    logger.info("RandomizedSearchCV completed.")
    logger.info(f"Best parameters found: {random_search.best_params_}")
    logger.info(f"Best CV F1 score: {random_search.best_score_:.4f}")

    # Test set evaluation
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    logger.info(f"Test F1 Score: {test_f1:.4f}")

    # Save results
    results[cid] = {
        'best_params': random_search.best_params_,
        'cv_best_f1': random_search.best_score_,
        'test_f1': test_f1
    }

    # NOTE(review): the "best" dataset is picked by test-set F1, so the reported
    # best score is optimistically biased; consider ranking on 'cv_best_f1'.
    if test_f1 > best_overall['test_f1']:
        best_overall.update({
            'dataset_id': cid,
            'dataset_num': dataset_num,
            'dataset_name': dataset_name,
            'test_f1': test_f1,
            'best_params': random_search.best_params_
        })

    logger.info(f"Finished Dataset {dataset_num}.\n{'-'*50}")

    # --- Resource refresher ---
    # Drop the references to this dataset and its models before loading the
    # next one, then force a collection to keep peak memory down.
    del X_train, X_test, y_train, y_test, vectorizer, config, lr, random_search, best_model, y_pred
    gc.collect()

# --- Summary ---
# One line per processed dataset, followed by the dataset with the best test F1.
logger.info("\n=== Summary of all datasets ===")
for cid in results:
    res = results[cid]
    logger.info(f"{cid}: CV F1={res['cv_best_f1']:.4f}, Test F1={res['test_f1']:.4f}, Best params={res['best_params']}")

_best_lines = (
    "\n=== Best Dataset Overall ===",
    f"Dataset ID: {best_overall['dataset_id']} (Dataset number: {best_overall['dataset_num']})",
    f"Dataset Name: {best_overall['dataset_name']}",
    f"Best Test F1 Score: {best_overall['test_f1']:.4f}",
    f"Best Hyperparameters: {best_overall['best_params']}",
)
for _line in _best_lines:
    logger.info(_line)

2025-08-15 10:22:06 - Starting processing for all 3 datasets...
2025-08-15 10:22:06 - 
=== Loading Dataset 1/3: dataset_001 ===
2025-08-15 10:22:07 - Train set shape: (1341820, 10000), Test set shape: (236792, 10000)
2025-08-15 10:22:07 - Starting RandomizedSearchCV (15 iterations)...
2025-08-15 10:25:11 - RandomizedSearchCV completed.
2025-08-15 10:25:11 - Best parameters found: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 700, 'C': 1}
2025-08-15 10:25:11 - Best CV F1 score: 0.8072
2025-08-15 10:25:12 - Test F1 Score: 0.8090
2025-08-15 10:25:12 - Finished Dataset 1.
--------------------------------------------------
2025-08-15 10:25:12 - 
=== Loading Dataset 2/3: dataset_002 ===
2025-08-15 10:25:14 - Train set shape: (1341820, 30000), Test set shape: (236792, 30000)
2025-08-15 10:25:14 - Starting RandomizedSearchCV (15 iterations)...
2025-08-15 10:28:36 - RandomizedSearchCV completed.
2025-08-15 10:28:36 - Best parameters found: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 

## XGBoost - Sequential trees

In [None]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier

# --- Config ---
NUM_DATASETS = 3   # how many datasets (dataset_001, dataset_002, ...) are processed
N_ITERATIONS = 15  # parameter settings sampled by RandomizedSearchCV per dataset

# --- Setup logging ---
# The root logger is configured to emit every INFO message both to stdout and
# to 'process_log.txt'.
# NOTE(review): mode='w' truncates 'process_log.txt', which the logistic
# regression cell also writes to - its log is lost once this cell runs.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# The root logger is shared by the whole kernel session. If an earlier cell (or
# an earlier run of this one) already attached its stdout/file handlers, adding
# another pair would print every message twice and keep several open handles
# on the same file, so those stale handlers are detached and closed first.
# Handlers installed by anything else are left untouched.
_log_path = os.path.abspath('process_log.txt')
for _stale in list(logger.handlers):
    _is_log_file = (
        isinstance(_stale, logging.FileHandler)
        and _stale.baseFilename == _log_path
    )
    _is_stdout = (
        type(_stale) is logging.StreamHandler
        and _stale.stream is sys.stdout
    )
    if _is_log_file or _is_stdout:
        logger.removeHandler(_stale)
        _stale.close()  # releases the file handle; stdout itself is not closed

# Console handler (stdout)
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

# File handler (write logs to file)
file_handler = logging.FileHandler('process_log.txt', mode='w')
file_handler.setLevel(logging.INFO)

# Log format with timestamps
formatter = logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Add handlers to logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# --- Dataset IDs ---
# Datasets are numbered from 1 and addressed by a zero-padded, three-digit id.
all_indices = [*range(1, NUM_DATASETS + 1)]
config_ids = [f'dataset_{str(i).zfill(3)}' for i in all_indices]
logger.info(f"Starting processing for all {len(all_indices)} datasets...")

# --- Hyperparameter search space for XGBoost ---
# Discrete candidate values; the randomized search samples combinations of them.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0.5, 1, 2],
}

# Seeded, shuffled 5-fold stratified splitter and the F1 scorer used for tuning.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
f1_scorer = make_scorer(f1_score)

# --- Results tracking ---
# `results` maps each dataset id to its scores; `best_overall` remembers the
# dataset with the highest test F1 seen so far (-1 means "nothing recorded").
results = {}
best_overall = dict.fromkeys(
    ('dataset_id', 'dataset_num', 'dataset_name', 'test_f1', 'best_params')
)
best_overall['test_f1'] = -1

# --- Process datasets one-by-one ---
# For each dataset id: load the pre-split data, tune an XGBClassifier with a
# randomized search scored by cross-validated F1, evaluate the refitted best
# model on the test split, and record the outcome in `results`.
for idx, cid in enumerate(config_ids, 1):
    logger.info(f"\n=== Loading Dataset {idx}/{NUM_DATASETS}: {cid} ===")
    X_train, X_test, y_train, y_test, vectorizer, config = utils.load_processed(config_id=cid)

    dataset_num = all_indices[idx - 1]
    # Fall back to the id when the config carries no human-readable name.
    dataset_name = config.get('dataset_name', cid)
    logger.info(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")

    # Model & Search
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases -
    # confirm whether the installed version still needs it.
    xgb = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=8,
        random_state=42
    )
    # Parallelism lives in XGBoost only. Running 8 search workers that each
    # train with 8 threads oversubscribes the CPU (up to 64 competing threads)
    # and ships a copy of the training matrix to every worker process; a
    # sequential search keeps the thread count at 8 and the data in one process.
    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_grid,
        n_iter=N_ITERATIONS,
        scoring=f1_scorer,
        cv=cv,
        n_jobs=1,
        random_state=42,
        refit=True  # best_estimator_ is retrained on the full training split
    )

    logger.info(f"Starting RandomizedSearchCV ({N_ITERATIONS} iterations)...")
    random_search.fit(X_train, y_train)
    logger.info("RandomizedSearchCV completed.")
    logger.info(f"Best parameters found: {random_search.best_params_}")
    logger.info(f"Best CV F1 score: {random_search.best_score_:.4f}")

    # Test set evaluation
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    logger.info(f"Test F1 Score: {test_f1:.4f}")

    # Save results
    results[cid] = {
        'best_params': random_search.best_params_,
        'cv_best_f1': random_search.best_score_,
        'test_f1': test_f1
    }

    # NOTE(review): the "best" dataset is picked by test-set F1, so the reported
    # best score is optimistically biased; consider ranking on 'cv_best_f1'.
    if test_f1 > best_overall['test_f1']:
        best_overall.update({
            'dataset_id': cid,
            'dataset_num': dataset_num,
            'dataset_name': dataset_name,
            'test_f1': test_f1,
            'best_params': random_search.best_params_
        })

    logger.info(f"Finished Dataset {dataset_num}.\n{'-'*50}")

    # --- Resource refresher ---
    # Drop the references to this dataset and its models before loading the
    # next one, then force a collection to keep peak memory down.
    del X_train, X_test, y_train, y_test, vectorizer, config, xgb, random_search, best_model, y_pred
    gc.collect()

# --- Summary ---
# One line per processed dataset, followed by the dataset with the best test F1.
logger.info("\n=== Summary of all datasets ===")
for cid in results:
    res = results[cid]
    logger.info(f"{cid}: CV F1={res['cv_best_f1']:.4f}, Test F1={res['test_f1']:.4f}, Best params={res['best_params']}")

_best_lines = (
    "\n=== Best Dataset Overall ===",
    f"Dataset ID: {best_overall['dataset_id']} (Dataset number: {best_overall['dataset_num']})",
    f"Dataset Name: {best_overall['dataset_name']}",
    f"Best Test F1 Score: {best_overall['test_f1']:.4f}",
    f"Best Hyperparameters: {best_overall['best_params']}",
)
for _line in _best_lines:
    logger.info(_line)


2025-09-30 07:03:28 - Starting processing for all 3 datasets...
2025-09-30 07:03:28 - 
=== Loading Dataset 1/3: dataset_001 ===
2025-09-30 07:03:29 - Train set shape: (1341820, 10000), Test set shape: (236792, 10000)
2025-09-30 07:03:29 - Starting RandomizedSearchCV (15 iterations)...
