In [1]:
import pandas as pd
import warnings
import logging
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, StratifiedGroupKFold
import random
import os
import numpy as np
from datetime import datetime


def seed_everything(seed=42):
    """Apply one seed to Python's ``random``, ``PYTHONHASHSEED`` and NumPy.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed`` and
            stored (as a string) in the ``PYTHONHASHSEED`` environment
            variable.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this
    # assignment matters for child processes, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Notebook-wide setup: silence every warning category so the captured
# output only contains the log lines emitted below.
warnings.filterwarnings(action='ignore')

# Timestamped INFO-level logging shared by all helpers in this notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Named base learners fed to the stacking and voting ensembles; the first
# entry is also what the 'simple' mode returns on its own.
# Disabled experiment: five XGBoost-only variants (xgb_1 .. xgb_5) that
# differ only in n_estimators (100, 150, 200, 500, 1000).
base_models = [
    (
        'xgb',
        XGBClassifier(
            objective='binary:logistic',
            n_estimators=100,
            n_jobs=-1,
            random_state=42,
        ),
    ),
    (
        'cb',
        CatBoostClassifier(
            iterations=1000,
            task_type='GPU',
            devices='0',
            random_state=42,
            silent=True,
        ),
    ),
    (
        'lgb',
        LGBMClassifier(
            objective='binary',
            n_estimators=1000,
            n_jobs=-1,
            random_state=42,
            verbose=-1,
        ),
    ),
]

# Final estimator used by the stacking ensemble.
meta_model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
)


def prepare_data(data, use_msisdn):
    """Split a labelled frame into features, target and grouping key.

    Args:
        data: DataFrame holding at least the ``is_sa`` and ``msisdn`` columns.
        use_msisdn: If truthy, ``msisdn`` stays in the feature matrix;
            otherwise it is dropped together with ``is_sa``.

    Returns:
        Tuple ``(X, y, groups)``: the feature frame, the ``is_sa`` column
        and the ``msisdn`` column.
    """
    logger.info('Preparing data...')
    excluded = ['is_sa'] if use_msisdn else ['is_sa', 'msisdn']
    features = data.drop(columns=excluded)
    target, group_ids = data['is_sa'], data['msisdn']
    logger.info('Data preparation completed.')
    return features, target, group_ids


def build_stacking_classifier():
    """Create a ``StackingClassifier`` over the module-level estimators.

    Returns:
        A stacking ensemble using ``base_models`` as first-level estimators
        and ``meta_model`` as the final estimator, with ``n_jobs=-1``.
    """
    logger.info('Building stacking classifier...')
    ensemble = StackingClassifier(
        final_estimator=meta_model,
        estimators=base_models,
        n_jobs=-1,
    )
    logger.info('Stacking classifier built.')
    return ensemble


def build_voting_classifier():
    """Create a soft-voting ensemble over the module-level ``base_models``.

    Returns:
        A ``VotingClassifier`` configured with ``voting='soft'`` and
        ``n_jobs=-1``.
    """
    logger.info('Building voting classifier...')
    ensemble = VotingClassifier(
        voting='soft',
        estimators=base_models,
        n_jobs=-1,
    )
    logger.info('Voting classifier built.')
    return ensemble


def build_simple_classifier():
    """Return the estimator of the first ``base_models`` entry.

    The object is handed back as-is (not copied), so it is the same
    instance that the module-level list holds.
    """
    logger.info('Building simple classifier...')
    first_entry = base_models[0]
    estimator = first_entry[1]
    logger.info(f'Using model: {estimator}')
    return estimator


def build_model(mode='stacking'):
    """Build the classifier selected by ``mode``.

    Args:
        mode: One of ``'stacking'``, ``'simple'`` or ``'voting'``.

    Returns:
        Whatever the matching ``build_*`` helper returns.

    Raises:
        ValueError: If ``mode`` is none of the supported names.
    """
    logger.info(f'Building model with mode: {mode}')
    builders = (
        ('stacking', build_stacking_classifier),
        ('simple', build_simple_classifier),
        ('voting', build_voting_classifier),
    )
    for known_mode, builder in builders:
        if mode == known_mode:
            return builder()
    raise ValueError(f'Unknown mode: {mode}')


def cross_validate_model(model, X, y, groups, cv_strategy):
    """Score ``model`` with grouped cross-validation using the F1 metric.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        groups: Group labels forwarded to the splitter.
        cv_strategy: Cross-validation splitter (or anything ``cv`` accepts).

    Returns:
        The per-fold scores returned by ``cross_val_score``; their mean is
        logged.
    """
    logger.info('Starting cross-validation...')
    fold_scores = cross_val_score(
        model, X, y, groups=groups, scoring='f1', cv=cv_strategy)
    mean_f1 = fold_scores.mean()
    logger.info(f'Cross-validation completed. F1 score: {mean_f1}')
    return fold_scores


def predict(model, validation_data, output_path, use_msisdn=False):
    """Predict ``is_sa`` for ``validation_data`` and write a CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        validation_data: DataFrame that includes a ``msisdn`` column.
        output_path: Destination of the two-column CSV
            (``msisdn``, ``is_sa``), written without the index.
        use_msisdn: If falsy, ``msisdn`` is removed from the frame before
            it is passed to the model.
    """
    logger.info('Predicting and saving results...')
    subscriber_ids = validation_data['msisdn']
    if use_msisdn:
        features = validation_data
    else:
        features = validation_data.drop(columns=['msisdn'])
    predictions = model.predict(features)
    submission = pd.DataFrame({'msisdn': subscriber_ids, 'is_sa': predictions})
    submission.to_csv(output_path, index=False)
    logger.info(f'Results saved to {output_path}')


def pseudo_labeling(train_data, validation_data, threshold=0.9, mode='voting', use_msisdn=False):
    """Augment the training set with confidently predicted unlabeled rows.

    A model built via ``build_model(mode)`` is fitted on ``train_data``;
    rows of ``validation_data`` whose highest predicted class probability
    is strictly greater than ``threshold`` are labelled with that class and
    appended to the training data.

    Args:
        train_data: Labelled DataFrame (``is_sa`` and ``msisdn`` present).
        validation_data: Unlabelled DataFrame with a ``msisdn`` column.
        threshold: Minimum (exclusive) top-class probability for a row to
            be pseudo-labelled.
        mode: Model kind forwarded to ``build_model``.
        use_msisdn: Whether ``msisdn`` is used as a feature.

    Returns:
        A new DataFrame: ``train_data`` followed by the pseudo-labelled
        rows, with a fresh index. Neither input frame is modified.
    """
    logger.info('Starting pseudo-labeling...')
    X_train, y_train, _ = prepare_data(train_data, use_msisdn)
    if use_msisdn:
        X_unlabeled = validation_data.copy()
    else:
        X_unlabeled = validation_data.drop(columns=['msisdn']).copy()

    model = build_model(mode)
    model.fit(X_train, y_train)

    probabilities = model.predict_proba(X_unlabeled)
    confident_mask = probabilities.max(axis=1) > threshold

    pseudo_labeled_data = validation_data[confident_mask].copy()
    # argmax is a column index into predict_proba, not a label: translate it
    # through classes_ so labels other than 0..k-1 are written correctly.
    # Models without classes_ keep the previous index-based behaviour.
    winning_columns = probabilities[confident_mask].argmax(axis=1)
    class_labels = getattr(model, 'classes_', None)
    if class_labels is None:
        pseudo_labeled_data['is_sa'] = winning_columns
    else:
        pseudo_labeled_data['is_sa'] = np.asarray(class_labels)[winning_columns]
    logger.info(
        f'Pseudo-labeled {len(pseudo_labeled_data)} of {len(validation_data)} unlabeled rows.')

    augmented_train_data = pd.concat(
        [train_data, pseudo_labeled_data], ignore_index=True)
    logger.info('Pseudo-labeling completed.')
    return augmented_train_data


def adversarial_validation(train_data, validation_data, use_msisdn=False):
    """Filter features using a train-vs-validation classifier.

    An XGBoost model is trained to tell training rows from validation rows;
    features whose importance is strictly above the median importance are
    kept, the rest are dropped from both frames.

    ``msisdn`` is always retained in the returned frames (exactly once),
    because ``prepare_data``, ``predict`` and ``pseudo_labeling`` read it
    regardless of ``use_msisdn``.

    Args:
        train_data: Labelled DataFrame with ``is_sa`` and ``msisdn``.
        validation_data: Unlabelled DataFrame with ``msisdn``.
        use_msisdn: Whether ``msisdn`` takes part as a model feature.

    Returns:
        Tuple ``(train_data, validation_data)`` restricted to the kept
        features plus ``is_sa`` (train only) and ``msisdn``. The input
        frames themselves are not modified.
    """
    logger.info('Starting adversarial validation...')
    # Tag the origin on copies so the caller's frames do not gain a column.
    combined_data = pd.concat(
        [train_data.assign(is_train=1), validation_data.assign(is_train=0)],
        axis=0)
    non_features = ['is_sa', 'is_train']
    if not use_msisdn:
        non_features.append('msisdn')
    X = combined_data.drop(columns=non_features)
    y = combined_data['is_train']

    adv_model = XGBClassifier(
        objective='binary:logistic', n_estimators=100, n_jobs=-1, random_state=42)
    adv_model.fit(X, y)

    feature_importances = adv_model.feature_importances_
    feature_names = X.columns
    importance_cutoff = np.median(feature_importances)
    # NOTE(review): this keeps the features that best separate train from
    # validation and drops the rest. Adversarial validation is usually used
    # the other way round (drop the most discriminative features) -- confirm
    # the intended direction before relying on this filter.
    important_features = [
        feature
        for feature, importance in zip(feature_names, feature_importances)
        if importance > importance_cutoff
    ]
    logger.info(
        f'Removed features: {set(feature_names) - set(important_features)}')
    logger.info('Adversarial validation completed.')

    # msisdn is appended explicitly below; exclude it here so it can never
    # appear twice when it was itself ranked as important.
    kept_features = [
        feature for feature in important_features if feature != 'msisdn']
    train_data = train_data[kept_features + ['is_sa', 'msisdn']]
    validation_data = validation_data[kept_features + ['msisdn']]

    n_model_features = len(kept_features) + (1 if use_msisdn else 0)
    logger.info(
        f'Training on {train_data.shape[0]} samples with {n_model_features} features')
    return train_data, validation_data


def main(dataset_version, output_path, use_pseudo_labeling=False, threshold=0.95, mode='voting', n_fold=5, need_prediction=False, use_adversarial_validation=False, use_msisdn=False, data_dir='/home/hwxu/Projects/Competition/Telecom/Input/processed'):
    """Run the full pipeline: load, optional preprocessing, CV, optional predict.

    Args:
        dataset_version: Suffix of the input files
            (``train{dataset_version}.csv`` / ``val{dataset_version}.csv``).
        output_path: Where predictions are written when ``need_prediction``
            is truthy.
        use_pseudo_labeling: Augment the training data via
            ``pseudo_labeling`` before cross-validation.
        threshold: Confidence threshold forwarded to ``pseudo_labeling``.
        mode: Model kind forwarded to ``build_model``.
        n_fold: Number of ``StratifiedGroupKFold`` splits.
        need_prediction: Fit on all training data afterwards and write
            predictions for the validation file.
        use_adversarial_validation: Filter features via
            ``adversarial_validation`` first.
        use_msisdn: Whether ``msisdn`` is used as a model feature.
        data_dir: Directory containing the processed CSV files. Defaults to
            the location that was previously hard-coded.

    Returns:
        The cross-validation scores returned by ``cross_validate_model``.
    """
    logger.info('Main function started.')

    train_data = pd.read_csv(
        os.path.join(data_dir, f'train{dataset_version}.csv'))
    validation_data = pd.read_csv(
        os.path.join(data_dir, f'val{dataset_version}.csv'))
    logger.info(
        f"Training on {train_data.shape[0]} samples with {validation_data.shape[1]} features.")

    if use_adversarial_validation:
        train_data, validation_data = adversarial_validation(
            train_data, validation_data, use_msisdn)

    if use_pseudo_labeling:
        train_data = pseudo_labeling(
            train_data, validation_data, threshold, mode, use_msisdn)

    X, y, groups = prepare_data(train_data, use_msisdn)
    cv_strategy = StratifiedGroupKFold(n_splits=n_fold)

    classifier = build_model(mode)

    cv_scores = cross_validate_model(classifier, X, y, groups, cv_strategy)

    if need_prediction:
        # cross-validation does not leave `classifier` fitted for prediction;
        # train on the full training set before predicting.
        classifier.fit(X, y)
        predict(classifier, validation_data, output_path, use_msisdn)
    logger.info('Main function completed.')
    return cv_scores

In [2]:
# Run configuration for this experiment.
dataset_version = 50
# Compact timestamp (no spaces or colons) so the submission filename is safe
# to use in shells and on filesystems that reject ':' in names.
output_path = f'/home/hwxu/Projects/Competition/Telecom/Output/submissions/pred-{dataset_version}_{datetime.now():%Y%m%d_%H%M%S}.csv'

# 10-fold grouped CV of the soft-voting ensemble; no submission is written
# because need_prediction is False.
main(
    dataset_version,
    output_path,
    need_prediction=False,
    use_pseudo_labeling=False,
    use_adversarial_validation=False,
    use_msisdn=True,
    threshold=0.95,
    mode='voting',
    n_fold=10,
)

2024-07-27 04:21:56,089 - INFO - Main function started.
2024-07-27 04:21:56,233 - INFO - Training on 3836 samples with 51 features.
2024-07-27 04:21:56,240 - INFO - Preparing data...
2024-07-27 04:21:56,246 - INFO - Data preparation completed.
2024-07-27 04:21:56,248 - INFO - Building model with mode: voting
2024-07-27 04:21:56,249 - INFO - Building voting classifier...
2024-07-27 04:21:56,249 - INFO - Voting classifier built.
2024-07-27 04:21:56,250 - INFO - Starting cross-validation...
