In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedGroupKFold

base_models = [
    ('xgboost', XGBClassifier(learning_rate=0.1, max_depth=9,
     min_child_weight=9, n_estimators=100, n_jobs=-1, subsample=0.6)),
]
meta_model = XGBClassifier(
    n_estimators=125,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1
)


def prepare_data(data):
    X = data.drop(columns=['is_sa'])
    y = data['is_sa']
    groups = data['msisdn']
    return X, y, groups


def build_stacking_classifier():
    final_estimator = LogisticRegression(max_iter=1000, n_jobs=-1)
    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=final_estimator,
        n_jobs=-1,
    )
    return stacking_clf


def build_voting_classifier():
    voting_clf = VotingClassifier(
        estimators=base_models,
        voting='soft',
        n_jobs=-1,
    )
    return voting_clf


def build_simple_classifier():
    return base_models[0][1]


def build_model(mode='stacking'):
    if mode == 'stacking':
        return build_stacking_classifier()
    elif mode == 'simple':
        return build_simple_classifier()
    elif mode == 'voting':
        return build_voting_classifier()
    else:
        raise ValueError(f'Unknown mode: {mode}')


def cross_validate_model(model, X, y, groups, cv_strategy):
    cv_scores = cross_val_score(model, X, y, cv=cv_strategy, groups=groups)
    print(f'Cross-validation F1 score: {cv_scores.mean()}')
    return cv_scores


def predict_and_save(model, validation_data, output_path, threshold=0.5):
    y_proba = model.predict_proba(validation_data)
    y_pred = (y_proba[:, 1] > threshold).astype(int)
    validation_data['is_sa'] = y_pred
    validation_data[['msisdn', 'is_sa']].to_csv(output_path, index=False)


def pseudo_labeling(train_data, validation_data, threshold=0.9, mode='voting'):
    X_train, y_train, groups_train = prepare_data(train_data)
    X_unlabeled = validation_data.copy()

    model = build_model(mode)
    model.fit(X_train, y_train)

    y_unlabeled_pred = model.predict_proba(X_unlabeled)
    high_confidence_indices = (y_unlabeled_pred.max(axis=1) > threshold)

    pseudo_labeled_data = validation_data[high_confidence_indices].copy()
    pseudo_labeled_data['is_sa'] = y_unlabeled_pred[high_confidence_indices].argmax(
        axis=1)

    augmented_train_data = pd.concat(
        [train_data, pseudo_labeled_data], ignore_index=True)

    return augmented_train_data


def main(train_data, validation_data, output_path, use_pseudo_labeling=False, threshold=0.95, mode='voting', n_fold=5, need_prediction=False, pred_threshold=0.5):
    if use_pseudo_labeling:
        augmented_train_data = pseudo_labeling(
            train_data, validation_data, threshold, mode)
    else:
        augmented_train_data = train_data

    X, y, groups = prepare_data(augmented_train_data)
    cv_strategy = StratifiedGroupKFold(n_splits=n_fold)

    classifier = build_model(mode)

    cross_validate_model(classifier, X, y, groups, cv_strategy)

    if need_prediction:
        classifier.fit(X, y)
        predict_and_save(classifier, validation_data,
                         output_path, threshold=pred_threshold)

In [None]:
from datetime import datetime
output_path = f'/home/hwxu/Projects/Competition/Telecom/Output/submissions/new_pred_{datetime.now()}.csv'
train_data = pd.read_csv(
    '/home/hwxu/Projects/Competition/Telecom/Input/processed/train35.csv')
validation_data = pd.read_csv(
    '/home/hwxu/Projects/Competition/Telecom/Input/processed/val35.csv')
print(
    f"Training on {train_data.shape[0]} samples with {validation_data.shape[1]} features.")

main(
    train_data,
    validation_data,
    output_path,
    use_pseudo_labeling=False,
    threshold=0.95,
    mode='simple',
    n_fold=5,
    need_prediction=True,
    pred_threshold=0.5,
)
# 0.9170992231638418

In [10]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.feature_selection import RFE
import xgboost as xgb
from sklearn.model_selection import StratifiedGroupKFold
import pandas as pd
import numpy as np

train_data = pd.read_csv(
    '/home/hwxu/Projects/Competition/Telecom/Input/processed/train30.csv')
validation_data = pd.read_csv(
    '/home/hwxu/Projects/Competition/Telecom/Input/processed/val30.csv')
X, y = train_data.drop(columns=['is_sa']), train_data['is_sa']
print(f"Training on {X.shape[0]} samples with {validation_data.shape[1]} features.")

xgb_model = xgb.XGBClassifier(random_state=42)

search_space = {
    'tree_method': ['hist'],  
    'lambda': list(np.logspace(-3, 1, 100)),  
    'alpha': list(np.logspace(-3, 1, 100)), 
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],  
    'subsample': [0.4, 0.5, 0.6, 0.7, 0.8, 1.0], 
    'learning_rate': [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02], 
    'n_estimators': [100, 200, 500, 1000], 
    'max_depth': [5, 7, 9, 11, 13, 15, 17],  
    'min_child_weight': list(np.arange(1, 301, 1)),  
    'random_state': [42],
    'device': ['gpu'],
    'n_jobs': [-1],
}

grid = HalvingGridSearchCV(
    estimator=xgb_model, 
    cv=StratifiedGroupKFold(n_splits=5), 
    param_grid=search_space,
    scoring='f1', 
    verbose=0, 
    n_jobs=-1, 
    refit=True)

grid.fit(X, y, groups=X['msisdn'])

print("Best Score:" + str(grid.best_score_))
print("Best Parameters: " + str(grid.best_params_))

best_parameters = grid.best_params_