In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import random
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

# The data folder sits next to the notebook's working directory, one level up.
parent_dir = os.path.abspath(
    os.path.join(os.getcwd(), "..")
)

# Labelled users: <parent_dir>/data/data_final/labeled_data.csv
labeled_path = os.path.join(
    parent_dir,
    "data",
    "data_final",
    "labeled_data.csv",
)
labeled_data = pd.read_csv(labeled_path)

# The unlabelled table is not loaded in this notebook; uncomment to enable it.
# unlabeled_path = os.path.join(parent_dir, "data", "data_final","unlabeled_data.csv")
# unlabeled_data = pd.read_csv(unlabeled_path)

In [2]:
def preprocess_user_data(df, is_train=True, scaler=None, label_encoders=None):
    """Turn a raw user table into a standardized feature frame.

    Steps, in order:
      1. Drop ``msno`` and ``registration_init_time`` when present.
      2. When both ``last_login_date_previous`` and ``last_login_date_current``
         are present, derive ``days_until_month_end`` and
         ``last_login_interval`` from them (with fixed fallback values for the
         ``1970-01-01`` placeholder date) and drop the two date columns.
      3. Label-encode ``gender``, ``city`` and ``registered_via`` when present.
      4. Split off ``is_churn`` (training) or discard it if present (inference).
      5. Standardize every remaining column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input; it is copied and never modified.
    is_train : bool
        ``True`` fits new encoders and a new scaler; ``False`` reuses the ones
        passed in.
    scaler : object with ``transform``, optional
        Fitted scaler; required when ``is_train`` is ``False``.
    label_encoders : dict, optional
        Maps column name to a fitted encoder; required when ``is_train`` is
        ``False`` and a categorical column is present.

    Returns
    -------
    ``(X_scaled, y, scaler, fitted_label_encoders)`` when ``is_train`` is
    ``True``, otherwise ``X_scaled`` alone. ``X_scaled`` is a DataFrame that
    keeps the input index and the feature column names.

    Raises
    ------
    ValueError
        If ``is_train`` is ``False`` and the scaler (or a needed encoder) is
        missing.
    """
    if not is_train and scaler is None:
        raise ValueError("A fitted scaler is required when is_train=False.")

    df = df.copy()

    # Neither the user ID nor the registration timestamp is used as a feature.
    df = df.drop(columns=['msno', 'registration_init_time'], errors='ignore')

    # Unparseable dates become NaT instead of raising.
    date_cols = ['last_login_date_previous', 'last_login_date_current']
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    if all(col in df.columns for col in date_cols):
        # 1970-01-01 is treated as the placeholder for "no login recorded".
        default_date = pd.to_datetime("1970-01-01")
        prev_login = df['last_login_date_previous']
        curr_login = df['last_login_date_current']

        # Regular case: both dates are real.
        df['days_until_month_end'] = ((curr_login + pd.offsets.MonthEnd(0)) - curr_login).dt.days
        df['last_login_interval'] = (curr_login - prev_login).dt.days

        curr_is_default = curr_login == default_date
        prev_is_default = prev_login == default_date
        both_default = curr_is_default & prev_is_default
        only_curr_default = curr_is_default & ~prev_is_default
        only_prev_default = prev_is_default & ~curr_is_default

        # Case 1: only the current login is the placeholder.
        # Interval = 45 + days left in the previous login's month.
        prev_known = prev_login[only_curr_default]
        df.loc[only_curr_default, 'days_until_month_end'] = 45
        df.loc[only_curr_default, 'last_login_interval'] = 45 + (
            prev_known.dt.days_in_month - prev_known.dt.day
        )

        # Case 2: only the previous login is the placeholder.
        # Interval = 45 + length of the current login's month.
        # NOTE(review): this also replaces the days_until_month_end that was
        # just computed from a valid current login with 45 -- confirm intent.
        curr_known = curr_login[only_prev_default]
        df.loc[only_prev_default, 'days_until_month_end'] = 45
        df.loc[only_prev_default, 'last_login_interval'] = 45 + curr_known.dt.days_in_month

        # Case 3: both logins are the placeholder.
        df.loc[both_default, 'days_until_month_end'] = 90
        df.loc[both_default, 'last_login_interval'] = 90

        # The raw dates are fully replaced by the derived features.
        df = df.drop(columns=date_cols + ['registration_init_time'], errors='ignore')

    # Label encode categorical features (values are compared as strings).
    cat_cols = ['gender', 'city', 'registered_via']
    fitted_label_encoders = {}

    for col in cat_cols:
        if col not in df.columns:
            continue
        df[col] = df[col].astype(str)
        if is_train:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            fitted_label_encoders[col] = le
        else:
            if label_encoders is None or col not in label_encoders:
                raise ValueError(f"A fitted label encoder for '{col}' is required when is_train=False.")
            df[col] = label_encoders[col].transform(df[col])

    # Separate the target; inference data may legitimately have no label column.
    if is_train:
        y = df['is_churn'].astype(int)
        df = df.drop(columns=['is_churn'])
    else:
        y = None
        df = df.drop(columns=['is_churn'], errors='ignore')

    # Standardize, keeping a DataFrame so callers can still use .iloc / .values.
    X = df
    if is_train:
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(X)
    else:
        scaled_values = scaler.transform(X)
    X_scaled = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

    if is_train:
        return X_scaled, y, scaler, fitted_label_encoders
    return X_scaled


In [3]:
# Fit the preprocessing on the labelled table; keep the fitted scaler and
# encoders so the same transform can be replayed on other data.
X_labled, y, scaler, fitted_label_encoders = preprocess_user_data(
    labeled_data,
    is_train=True,
    scaler=None,
    label_encoders=None,
)

In [4]:
#%% Example Split
# First cut: 20% of the labelled rows become the test set (stratified on the label).
X_train, X_test, y_train, y_test = train_test_split(
    X_labled,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Second cut: 20% of the remaining rows become the evaluation set; the names
# X_train / y_train are rebound to the part that is left for training.
X_train, X_evaluate, y_train, y_evaluate = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=42,
)

In [5]:
# SMOTE + Downsampling

def resample_smote_then_downsample(X, y, downsample_ratio=1.0, random_state=42):
    """Oversample with SMOTE, then randomly downsample class 0.

    Rows labelled ``0`` are treated as the majority class and rows labelled
    ``1`` as the minority class.

    Parameters
    ----------
    X, y : features and labels accepted by ``SMOTE.fit_resample``.
    downsample_ratio : float
        Number of class-0 rows to keep, as a multiple of the class-1 row count
        after SMOTE. Values that would ask for more class-0 rows than exist
        are capped at "keep all of them" instead of raising.
    random_state : int
        Seed shared by SMOTE, the downsampling draw and the final shuffle.

    Returns
    -------
    (DataFrame, Series)
        Shuffled features and the matching ``label`` column.

    Raises
    ------
    ValueError
        If ``downsample_ratio`` is negative.
    """
    if downsample_ratio < 0:
        raise ValueError("downsample_ratio must be non-negative.")

    X_smote, y_smote = SMOTE(random_state=random_state).fit_resample(X, y)

    # Copy so that adding the label column never touches SMOTE's own output.
    balanced = pd.DataFrame(X_smote).copy()
    balanced['label'] = np.asarray(y_smote)

    majority = balanced[balanced['label'] == 0]
    minority = balanced[balanced['label'] == 1]

    # sample() without replacement cannot return more rows than it is given.
    n_majority = min(len(majority), int(len(minority) * downsample_ratio))
    majority_down = majority.sample(n=n_majority, random_state=random_state)

    # frac=1 shuffles so the two classes are interleaved.
    resampled = pd.concat([majority_down, minority], axis=0).sample(frac=1, random_state=random_state)
    return resampled.drop(columns='label'), resampled['label']


In [6]:
#%% MLP Classifier
class MLPClassifier(nn.Module):
    """Feed-forward binary classifier that outputs a probability per row.

    Each entry of ``hidden_dims`` adds a ``Linear -> ReLU -> Dropout`` stage;
    the head is ``Linear(last_dim, 1) -> Sigmoid``.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dims : iterable of int
        Width of each hidden layer, in order. May be empty.
    dropout_rate : float
        Dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims, dropout_rate=0.3):
        super().__init__()
        layers = []
        last_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(last_dim, h_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])
            last_dim = h_dim
        layers.extend([nn.Linear(last_dim, 1), nn.Sigmoid()])
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one probability per input row, shaped ``(batch,)``."""
        # Squeeze only the trailing feature axis: a bare squeeze() would also
        # drop the batch axis for a single-row batch and yield a 0-d tensor.
        return self.model(x).squeeze(-1)

In [7]:
# Training Function

def train_model(model, train_loader, val_loader, threshold=0.5,
                epochs=30, lr=0.001, patience=5, device='cpu', return_logs=False):
    """Train a probability-output binary classifier with early stopping.

    Uses Adam and ``BCELoss``. After every epoch the mean validation loss and
    the validation F1 (predictions are ``output >= threshold``) are logged and
    printed. Training stops once the validation loss has failed to improve
    for ``patience`` consecutive epochs, and the weights from the epoch with
    the lowest validation loss are restored before returning.

    Parameters
    ----------
    model : torch.nn.Module
        Must output probabilities in [0, 1], one per row.
    train_loader, val_loader : iterables of ``(X_batch, y_batch)``
    threshold : float
        Decision threshold used for the logged F1 only.
    epochs, lr, patience : training budget, Adam learning rate, and early
        stopping patience.
    device : str or torch.device
    return_logs : bool
        When ``True`` return ``(model, logs)``; otherwise just ``model``.

    Returns
    -------
    The trained model, optionally with
    ``logs = {'train_loss': [...], 'val_loss': [...], 'val_f1': [...]}``.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    best_val_loss = float('inf')
    best_model_state = None
    no_improve_epochs = 0
    logs = {'train_loss': [], 'val_loss': [], 'val_f1': []}

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device).float()
            # Flatten both sides so a single-row batch (or a (B, 1) output)
            # still matches the 1-D target that BCELoss requires.
            y_batch = y_batch.to(device).float().reshape(-1)

            optimizer.zero_grad()
            outputs = model(X_batch).reshape(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        y_true, y_pred = [], []
        with torch.no_grad():
            for X_val, y_val in val_loader:
                X_val = X_val.to(device).float()
                y_val = y_val.to(device).float().reshape(-1)
                outputs = model(X_val).reshape(-1)
                loss = criterion(outputs, y_val)
                val_losses.append(loss.item())
                preds = (outputs >= threshold).int().cpu().numpy()
                y_pred.extend(preds)
                y_true.extend(y_val.cpu().numpy())

        val_loss = np.mean(val_losses)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        logs['train_loss'].append(np.mean(train_losses))
        logs['val_loss'].append(val_loss)
        logs['val_f1'].append(f1)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {logs['train_loss'][-1]:.4f} | Val Loss: {val_loss:.4f} | F1: {f1:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back the live parameter tensors; clone them,
            # otherwise later optimizer steps overwrite the "best" snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    # No snapshot exists if no epoch ran or the validation loss never improved
    # (e.g. it was NaN throughout); keep the current weights in that case.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return (model, logs) if return_logs else model

In [2]:
# Random Search with K-Fold

def run_random_search(X, y, param_grid, k=5, epochs=30, batch_size=256, patience=5, device='cpu',
                      save_path="best_mlp_model.pt"):
    """Score each hyper-parameter set with K-fold cross-validation.

    For every entry of ``param_grid`` and every fold, the training part is
    balanced with ``resample_smote_then_downsample``, an ``MLPClassifier`` is
    trained with ``train_model``, and accuracy / precision / recall / F1 and
    the confusion matrix are computed on the untouched validation part. The
    parameter set with the highest mean fold F1 wins.

    Side effects: prints per-fold metrics, shows one training-curve figure per
    fold, and writes a ``state_dict`` to ``save_path`` whenever a parameter
    set becomes the new best. The saved weights are those of the *last fold*
    trained for that parameter set, not a model refit on all of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
    y : pandas.Series
        Binary labels, positionally aligned with ``X``.
    param_grid : iterable of dict
        Each dict needs ``'hidden_dims'`` and ``'lr'``; ``'dropout'``
        (default 0.3) and ``'threshold'`` (default 0.5) are optional.
    k : int
        Number of folds.
    epochs, batch_size, patience, device : forwarded to training.
    save_path : str
        Where the best parameter set's weights are written.

    Returns
    -------
    (best_params, results)
        ``results`` is a list of ``(params, mean_f1)`` in grid order;
        ``best_params`` is ``None`` for an empty grid.
    """
    best_params = None
    best_avg_f1 = -1
    results = []
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

    for params in param_grid:
        hidden_dims = params['hidden_dims']
        lr = params['lr']
        # Optional keys fall back to the values that used to be hard-coded.
        dropout = params.get('dropout', 0.3)
        threshold = params.get('threshold', 0.5)
        fold_f1_scores = []

        print(f"\nTesting params: hidden_dims={hidden_dims}, lr={lr}, dropout={dropout}, threshold={threshold}")

        for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
            # Only the training part is resampled; validation keeps its rows as given.
            X_tr_bal, y_tr_bal = resample_smote_then_downsample(X_tr, y_tr)

            train_dataset = TensorDataset(torch.tensor(X_tr_bal.values, dtype=torch.float32),
                                          torch.tensor(y_tr_bal.values, dtype=torch.float32))
            val_dataset = TensorDataset(torch.tensor(X_val.values, dtype=torch.float32),
                                        torch.tensor(y_val.values, dtype=torch.float32))
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

            model = MLPClassifier(input_dim=X.shape[1], hidden_dims=hidden_dims, dropout_rate=dropout)
            model, logs = train_model(model, train_loader, val_loader,
                                      threshold=threshold,
                                      epochs=epochs, lr=lr, patience=patience,
                                      device=device, return_logs=True)

            # Re-score the validation fold with the weights train_model returned.
            model.eval()
            y_pred = []
            with torch.no_grad():
                for X_batch, _ in val_loader:
                    X_batch = X_batch.to(device).float()
                    # reshape(-1) keeps a single-row batch iterable.
                    outputs = model(X_batch).reshape(-1)
                    preds = (outputs >= threshold).int().cpu().numpy()
                    y_pred.extend(preds)

            y_val_np = y_val.values
            acc = accuracy_score(y_val_np, y_pred)
            prec = precision_score(y_val_np, y_pred, zero_division=0)
            rec = recall_score(y_val_np, y_pred, zero_division=0)
            f1 = f1_score(y_val_np, y_pred, zero_division=0)
            cm = confusion_matrix(y_val_np, y_pred)

            fold_f1_scores.append(f1)
            print(f"  Fold {fold+1} - Acc: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
            print("  Confusion Matrix:")
            print(cm)

            fig = plt.figure(figsize=(8, 4))
            plt.plot(logs['train_loss'], label='Train Loss')
            plt.plot(logs['val_loss'], label='Val Loss')
            plt.plot(logs['val_f1'], label='Val F1')
            plt.title(f"Fold {fold + 1} Training Curve")
            plt.xlabel("Epoch")
            plt.ylabel("Loss / F1")
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.show()
            # One figure per fold per parameter set adds up; release it.
            plt.close(fig)

        avg_f1 = np.mean(fold_f1_scores)
        results.append((params, avg_f1))
        if avg_f1 > best_avg_f1:
            best_avg_f1 = avg_f1
            best_params = params
            # Save the best model so far (weights of this parameter set's last fold).
            torch.save(model.state_dict(), save_path)

    print(f"\nBest Params: {best_params} | Best Avg F1: {best_avg_f1:.4f}")
    return best_params, results


In [12]:
# Evaluation split as float32 tensors (features first, then labels).
X_eval_tensor = torch.tensor(
    X_evaluate.values,
    dtype=torch.float32,
)
y_eval_tensor = torch.tensor(
    y_evaluate.values,
    dtype=torch.float32,
)

# Wrap the pair in a dataset and serve it in fixed order, 256 rows at a time;
# shuffling is pointless when only scoring.
eval_dataset = TensorDataset(X_eval_tensor, y_eval_tensor)
eval_loader = DataLoader(
    eval_dataset,
    shuffle=False,
    batch_size=256,
)


In [None]:

#%% Generate Random Param Grid
def generate_random_param_grid(n_samples=6, seed=None):
    """Draw ``n_samples`` random hyper-parameter sets for the MLP search.

    Each set is ``{'hidden_dims': [...], 'lr': float}``, drawn independently
    and with replacement, so duplicates are possible.

    Parameters
    ----------
    n_samples : int
        Number of parameter sets to draw.
    seed : int, optional
        When given, draws come from a private ``random.Random(seed)`` and are
        reproducible. When ``None``, the module-level ``random`` generator is
        used.

    Returns
    -------
    list of dict
    """
    hidden_dims_choices = [[128, 64], [512, 256], [256, 128, 64], [1024, 512, 256]]
    lr_choices = [0.0001, 0.0005, 0.001]
    rng = random if seed is None else random.Random(seed)

    return [
        {
            # Copy the list so editing one entry cannot alter another that
            # happened to draw the same architecture.
            'hidden_dims': list(rng.choice(hidden_dims_choices)),
            'lr': rng.choice(lr_choices),
        }
        for _ in range(n_samples)
    ]

#%% Run Random Search
# Seed the stdlib generator so the sampled grid is the same on every run.
random.seed(42)
param_grid = generate_random_param_grid(6)

# Balanced copy of the training split. It is NOT what the search runs on (see
# below); the names are kept in case other cells use them.
X_resampled, y_resampled = resample_smote_then_downsample(X_train, y_train)

# Search on the original training split. run_random_search already balances
# the training part of every fold; feeding it pre-resampled data would put
# SMOTE-synthesized rows into the validation folds (inflating F1) and would
# resample the training folds a second time.
best_params, results = run_random_search(
    X_train, y_train,
    param_grid=param_grid,
    k=5,
    epochs=30,
    batch_size=256,
    patience=5,
    device='cpu'
)



Testing params: hidden_dims=[512, 256], lr=0.0005, dropout=0.4, threshold=0.4


KeyboardInterrupt: 