In [20]:
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedGroupKFold
from sklearn.metrics import f1_score

# Seed shared by every stochastic base learner so that re-running the notebook
# reproduces the same cross-validation score and the same submission file.
RANDOM_SEED = 42

# (name, estimator) pairs shared by the stacking and the voting ensembles.
base_models = [
    ('xgboost', XGBClassifier(n_jobs=-1, random_state=RANDOM_SEED)),
    ('rf', RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=RANDOM_SEED)),
    ('nn', MLPClassifier(
        hidden_layer_sizes=64, max_iter=1000, random_state=RANDOM_SEED)),
    ('catboost', CatBoostClassifier(
        verbose=0, thread_count=-1, random_seed=RANDOM_SEED)),
    ('et', ExtraTreesClassifier(
        n_estimators=100, n_jobs=-1, random_state=RANDOM_SEED)),
    # Disabled candidates, kept for reference:
    # ('hgb', HistGradientBoostingClassifier()),
    # ('ada', AdaBoostClassifier(algorithm='SAMME')),
    ('gb', GradientBoostingClassifier(random_state=RANDOM_SEED)),
]


def prepare_data(data):
    """Split a labelled frame into features, target and grouping key.

    Args:
        data: DataFrame that contains at least the 'is_sa' and 'msisdn'
            columns.

    Returns:
        A ``(X, y, groups)`` tuple where ``X`` is ``data`` without the
        'is_sa' column, ``y`` is the 'is_sa' column and ``groups`` is the
        'msisdn' column.

    Raises:
        KeyError: if 'is_sa' or 'msisdn' is missing from ``data``.
    """
    # NOTE(review): 'msisdn' is only removed from nothing here, so it is still
    # part of the feature matrix as well as being the group key. It looks like
    # an identifier -- confirm that feeding it to the models is intended.
    features = data.drop('is_sa', axis=1)
    target, group_ids = data['is_sa'], data['msisdn']
    return features, target, group_ids


def build_stacking_classifier():
    """Create an unfitted stacking ensemble over the shared ``base_models``.

    The base learners' outputs are combined by a default-configured
    ``LogisticRegression`` meta-learner; the ensemble itself is built with
    ``n_jobs=-1``.

    Returns:
        A new, unfitted ``StackingClassifier``.
    """
    return StackingClassifier(
        estimators=base_models,
        final_estimator=LogisticRegression(),
        n_jobs=-1,
    )


def build_voting_classifier():
    """Create an unfitted soft-voting ensemble over the shared ``base_models``.

    Returns:
        A new, unfitted ``VotingClassifier`` configured with ``voting='soft'``
        and ``n_jobs=-1``.
    """
    return VotingClassifier(
        estimators=base_models,
        voting='soft',
        n_jobs=-1,
    )


# Backward-compatible alias: the builder was originally defined under this
# misspelled name and existing callers still refer to it.
bulid_voting_classifier = build_voting_classifier


def build_simple_classifier():
    """Create the single-model baseline.

    Returns:
        A new, unfitted ``RandomForestClassifier`` built with ``n_jobs=-1``
        and otherwise default settings.
    """
    baseline_forest = RandomForestClassifier(n_jobs=-1)
    return baseline_forest


def build_model(mode='stacking'):
    """Build an unfitted classifier selected by name.

    Args:
        mode: One of 'stacking', 'simple' or 'voting'.

    Returns:
        The estimator produced by the builder registered for ``mode``.

    Raises:
        ValueError: if ``mode`` is not one of the supported names.
    """
    # Checked in order; each entry pairs a mode name with its builder.
    factories = (
        ('stacking', build_stacking_classifier),
        ('simple', build_simple_classifier),
        ('voting', bulid_voting_classifier),
    )
    for key, factory in factories:
        if mode == key:
            return factory()
    raise ValueError(f'Unknown mode: {mode}')


def cross_validate_model(model, X, y, groups, cv_strategy):
    """Score ``model`` with cross-validation and print the mean F1.

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.
        groups: Group key for each row, forwarded to the splitter.
        cv_strategy: Cross-validation splitter passed as ``cv``.

    Returns:
        The array of per-fold scores returned by ``cross_val_score`` with
        ``scoring='f1'``.
    """
    fold_scores = cross_val_score(
        model,
        X,
        y,
        groups=groups,
        scoring='f1',
        cv=cv_strategy,
    )
    print(f'Cross-validation F1 score: {fold_scores.mean()}')
    return fold_scores


def predict_and_save(model, validation_data, output_path):
    """Predict labels for ``validation_data`` and write them to a CSV file.

    The caller's frame is left untouched: predictions are attached to a
    separate two-column frame. Adding 'is_sa' to ``validation_data`` itself
    would turn the label into an extra feature column the next time the same
    frame is passed to a model.

    Args:
        model: Fitted estimator exposing ``predict``.
        validation_data: DataFrame of features; must contain 'msisdn'.
        output_path: Destination passed to ``DataFrame.to_csv``.

    Returns:
        None. The file has the columns 'msisdn' and 'is_sa' and no index.
    """
    y_pred = model.predict(validation_data)
    # `[['msisdn']]` yields a new frame and `assign` returns another copy, so
    # the input DataFrame never gains an 'is_sa' column.
    submission = validation_data[['msisdn']].assign(is_sa=y_pred)
    submission.to_csv(output_path, index=False)


def pseudo_labeling(train_data, validation_data, threshold=0.9, mode='voting'):
    """Augment the training set with confidently predicted unlabeled rows.

    A model of kind ``mode`` is fitted on ``train_data``. Rows of
    ``validation_data`` whose highest predicted class probability is strictly
    greater than ``threshold`` receive that class as their 'is_sa' label and
    are appended to the training data.

    Args:
        train_data: Labelled DataFrame accepted by ``prepare_data``.
        validation_data: Unlabeled DataFrame with the feature columns. It is
            not modified.
        threshold: Minimum (exclusive) top-class probability for a row to be
            pseudo-labelled.
        mode: Model kind forwarded to ``build_model``.

    Returns:
        A new DataFrame: ``train_data`` followed by the pseudo-labelled rows,
        with a fresh integer index.
    """
    X_train, y_train, _ = prepare_data(train_data)
    # Drop a stale label column (e.g. left over from an earlier prediction
    # pass) so it is never handed to the model as a feature.
    X_unlabeled = validation_data.drop(columns=['is_sa'], errors='ignore')

    model = build_model(mode)
    model.fit(X_train, y_train)

    probabilities = model.predict_proba(X_unlabeled)
    confident_mask = probabilities.max(axis=1) > threshold

    pseudo_labeled_data = X_unlabeled[confident_mask].copy()
    # argmax yields column positions of predict_proba; translate them into the
    # actual class labels via classes_, which is identical only when the
    # labels happen to be 0..k-1.
    winning_columns = probabilities[confident_mask].argmax(axis=1)
    pseudo_labeled_data['is_sa'] = model.classes_[winning_columns]

    return pd.concat([train_data, pseudo_labeled_data], ignore_index=True)


def main(train_data, validation_data, output_path, use_pseudo_labeling=False, threshold=0.95, mode='voting', n_fold=5):
    """Run the pipeline: optional pseudo-labeling, CV report, fit, predict.

    Args:
        train_data: Labelled training DataFrame.
        validation_data: DataFrame to predict on.
        output_path: Where the prediction CSV is written.
        use_pseudo_labeling: When true, the training data is first augmented
            via ``pseudo_labeling``.
        threshold: Confidence threshold forwarded to ``pseudo_labeling``;
            unused when ``use_pseudo_labeling`` is false.
        mode: Model kind forwarded to ``build_model``.
        n_fold: Number of splits for ``StratifiedGroupKFold``.

    Returns:
        None. The mean cross-validation F1 is printed and predictions are
        saved to ``output_path``.
    """
    training_frame = (
        pseudo_labeling(train_data, validation_data, threshold, mode)
        if use_pseudo_labeling
        else train_data
    )

    features, labels, group_ids = prepare_data(training_frame)
    splitter = StratifiedGroupKFold(n_splits=n_fold)
    estimator = build_model(mode)

    # Report the grouped, stratified CV score before the final fit.
    cross_validate_model(estimator, features, labels, group_ids, splitter)

    # Final model is fitted on all available training rows.
    estimator.fit(features, labels)
    predict_and_save(estimator, validation_data, output_path)

In [21]:
from datetime import datetime

# Single place to change when the project lives elsewhere.
PROJECT_DIR = '/home/hwxu/Projects/Competition/Telecom'

# A strftime stamp keeps the filename free of the space and colons that
# str(datetime.now()) would put into it.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_path = f'{PROJECT_DIR}/Output/submissions/new_pred_{run_stamp}.csv'

train_data = pd.read_csv(f'{PROJECT_DIR}/Input/processed/train.csv')
validation_data = pd.read_csv(f'{PROJECT_DIR}/Input/processed/val.csv')

main(
    train_data,
    validation_data,
    output_path,
    use_pseudo_labeling=False,
    threshold=0.95,  # only used when use_pseudo_labeling=True
    mode='stacking',
    n_fold=10,
)

Cross-validation F1 score: 0.7868947730822671
