In [17]:
import os
import numpy as np
import pandas as pd
from functools import reduce
from scipy import sparse

# sklearn 相关
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.base import BaseEstimator, ClassifierMixin

# PyTorch 相关
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader

# ==== 1. 读取并过滤数据 ====
data_dir     = "./data"
module       = "FFF"
presentation = "2013J"

# 1.1 读取并筛选 assessments
assessments    = pd.read_csv(os.path.join(data_dir, "assessments.csv"))
assessments    = assessments[
    (assessments.code_module == module) &
    (assessments.code_presentation == presentation)
]
# 1.2 vle 元数据不需要过滤
vle_meta       = pd.read_csv(os.path.join(data_dir, "vle.csv"))

# 1.3 studentVle
student_vle    = pd.read_csv(os.path.join(data_dir, "studentVle.csv"))
student_vle    = student_vle[
    (student_vle.code_module == module) &
    (student_vle.code_presentation == presentation)
]

# 1.4 studentAssessment （待与 assessments 合并后筛选）
student_assess = pd.read_csv(os.path.join(data_dir, "studentAssessment.csv"))

# 1.5 studentInfo
student_info   = pd.read_csv(os.path.join(data_dir, "studentInfo.csv"))
student_info   = student_info[
    (student_info.code_module == module) &
    (student_info.code_presentation == presentation)
]

# 1.6 registration
registration   = pd.read_csv(os.path.join(data_dir, "studentRegistration.csv"))
registration   = registration[
    (registration.code_module == module) &
    (registration.code_presentation == presentation)
]

# ==== 2. 特征工程 & 多表合并 ====
# 2.1 VLE 行为（前4周）
sv = student_vle.merge(
    vle_meta[['id_site','activity_type']],
    on='id_site', how='left'
)
sv = sv[sv['date'] <= 28]
vle_agg = sv.groupby(['id_student','activity_type'])['sum_click'] \
            .sum().unstack(fill_value=0).reset_index()

# 2.2 评估表现（前4周）
sa = student_assess.merge(
    assessments[['id_assessment','date','assessment_type']],
    on='id_assessment', how='inner'
)
sa = sa[sa['date'] <= 28]
sa_agg = sa.groupby(['id_student','assessment_type']) \
           .agg(score_mean=('score','mean'),
                attempts=('id_assessment','count')) \
           .unstack(fill_value=0)
sa_agg.columns = ['_'.join(col) for col in sa_agg.columns]
sa_agg = sa_agg.reset_index()

# 2.3 注册信息
reg = registration.copy()
reg['has_withdrawn'] = reg['date_unregistration'].notna().astype(int)
reg = reg[['id_student','has_withdrawn']]

# 2.4 人口学 & 标签
info = student_info.copy()
info['label'] = info['final_result'].isin(['Fail','Withdrawn']).astype(int)
info = info[[
    'id_student','gender','region','highest_education',
    'imd_band','age_band','num_of_prev_attempts',
    'studied_credits','disability','label'
]]

# 2.5 合并所有
dfs = [vle_agg, sa_agg, reg, info]
df = reduce(lambda L,R: pd.merge(L, R, on='id_student', how='outer'), dfs)

# 填充缺失
num_cols = df.select_dtypes(include=['int','float']).columns
cat_cols = ['gender','region','highest_education','imd_band',
            'age_band','disability']
df[num_cols] = df[num_cols].fillna(0)
df[cat_cols] = df[cat_cols].fillna('missing')

# ==== 3. X, y 准备 ====
feature_cols = [c for c in df.columns if c not in ['id_student','label']]
X = df[feature_cols]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [19]:
# ==== 4. 逻辑回归 Pipeline ====
num_feats = X_train.select_dtypes(include='number').columns.tolist()
cat_feats = [c for c in feature_cols if c in cat_cols]

pre = ColumnTransformer([
    ('num', StandardScaler(), num_feats),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_feats)
])
pipe_lr = Pipeline([
    ('pre', pre),
    ('clf', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42))
])

pipe_lr.fit(X_train, y_train)
y_pred_lr  = pipe_lr.predict(X_test)
y_proba_lr = pipe_lr.predict_proba(X_test)[:,1]

print("=== 逻辑回归 评估 (FFF-2013J) ===")
print(classification_report(y_test, y_pred_lr, digits=4))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_lr))

# 特征重要性（系数）
ohe = pipe_lr.named_steps['pre'].named_transformers_['cat']
ohe_names = ohe.get_feature_names_out(cat_feats)
all_feats = np.concatenate([num_feats, ohe_names])
coefs     = pipe_lr.named_steps['clf'].coef_.ravel()
imp_lr = pd.DataFrame({
    'feature': all_feats,
    'coef': coefs,
    'abs_coef': np.abs(coefs)
}).sort_values('abs_coef', ascending=False).reset_index(drop=True)

print("\n逻辑回归 Top 10 特征 (coef)：")
print(imp_lr.head(10)[['feature','coef']])

=== 逻辑回归 评估 (FFF-2013J) ===
              precision    recall  f1-score   support

           0     0.7628    0.9088    0.8294       329
           1     0.8976    0.7388    0.8105       356

    accuracy                         0.8204       685
   macro avg     0.8302    0.8238    0.8199       685
weighted avg     0.8328    0.8204    0.8196       685

ROC-AUC: 0.9014463303848912

逻辑回归 Top 10 特征 (coef)：
                                         feature      coef
0                                  has_withdrawn  3.099431
1                                 score_mean_TMA -1.282397
2  highest_education_Post Graduate Qualification  0.889866
3                                  age_band_55<=  0.693176
4                                 imd_band_10-20  0.614567
5        highest_education_A Level or Equivalent -0.590165
6              highest_education_No Formal quals -0.569961
7                               imd_band_missing -0.547830
8                                 imd_band_0-10%  0.499858
9  

In [20]:
# ==== 5. PyTorch MLP ====
def to_numpy(df_):
    arr = pre.transform(df_)
    if sparse.issparse(arr):
        arr = arr.toarray()
    return arr.astype(np.float32)

Xtr_np = to_numpy(X_train)
Xte_np = to_numpy(X_test)
ytr_np = y_train.values.astype(np.int64)
yte_np = y_test.values.astype(np.int64)

train_ds = TensorDataset(torch.from_numpy(Xtr_np), torch.from_numpy(ytr_np))
test_ds  = TensorDataset(torch.from_numpy(Xte_np), torch.from_numpy(yte_np))
train_ld = DataLoader(train_ds, batch_size=64, shuffle=True)
test_ld  = DataLoader(test_ds, batch_size=64, shuffle=False)

class MLP(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim,128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128,64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64,2)
        )
    def forward(self, x):
        return self.net(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(Xtr_np.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1,31):
    model.train()
    total_loss = 0
    for xb, yb in train_ld:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    print(f"Epoch {epoch}/30, Loss: {total_loss/len(train_ds):.4f}")

# 5.2 MLP 评估
model.eval()
preds, probs, trues = [], [], []
with torch.no_grad():
    for xb, yb in test_ld:
        xb = xb.to(device)
        out = model(xb)
        p = torch.softmax(out, dim=1)[:,1].cpu().numpy()
        y = out.argmax(dim=1).cpu().numpy()
        probs.extend(p); preds.extend(y); trues.extend(yb.numpy())

print("\n=== MLP 评估 (FFF-2013J) ===")
print(classification_report(trues, preds, digits=4))
print("ROC-AUC:", roc_auc_score(trues, probs))

# ==== 6. MLP 置换重要性 ====
class MLPWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, pre, device):
        self.model = model
        self.pre = pre
        self.device = device
    def fit(self, X, y=None):
        return self
    def predict(self, X):
        arr = self.pre.transform(X)
        if sparse.issparse(arr): arr = arr.toarray()
        t = torch.from_numpy(arr.astype(np.float32)).to(self.device)
        self.model.eval()
        with torch.no_grad():
            out = self.model(t)
            probs = torch.softmax(out, dim=1)[:,1].cpu().numpy()
        return (probs >= 0.5).astype(int)
    def predict_proba(self, X):
        arr = self.pre.transform(X)
        if sparse.issparse(arr): arr = arr.toarray()
        t = torch.from_numpy(arr.astype(np.float32)).to(self.device)
        self.model.eval()
        with torch.no_grad():
            out = self.model(t)
            probs = torch.softmax(out, dim=1).cpu().numpy()
        return probs

mlp_wrap = MLPWrapper(model, pre, device)
res = permutation_importance(
    estimator=mlp_wrap,
    X=X_test,
    y=y_test,
    scoring=lambda est, X, y: roc_auc_score(y, est.predict_proba(X)[:,1]),
    n_repeats=10,
    random_state=42,
    n_jobs=1
)

imp_mlp = pd.DataFrame({
    'feature': feature_cols,
    'importance': res.importances_mean
}).sort_values('importance', ascending=False).reset_index(drop=True)

print("\nMLP Top 10 特征 (permutation importance):")
print(imp_mlp.head(10))


Epoch 1/30, Loss: 0.5822
Epoch 2/30, Loss: 0.4157
Epoch 3/30, Loss: 0.3456
Epoch 4/30, Loss: 0.3243
Epoch 5/30, Loss: 0.3187
Epoch 6/30, Loss: 0.3098
Epoch 7/30, Loss: 0.3081
Epoch 8/30, Loss: 0.2923
Epoch 9/30, Loss: 0.2885
Epoch 10/30, Loss: 0.2798
Epoch 11/30, Loss: 0.2783
Epoch 12/30, Loss: 0.2739
Epoch 13/30, Loss: 0.2721
Epoch 14/30, Loss: 0.2581
Epoch 15/30, Loss: 0.2467
Epoch 16/30, Loss: 0.2513
Epoch 17/30, Loss: 0.2370
Epoch 18/30, Loss: 0.2288
Epoch 19/30, Loss: 0.2297
Epoch 20/30, Loss: 0.2212
Epoch 21/30, Loss: 0.2183
Epoch 22/30, Loss: 0.2157
Epoch 23/30, Loss: 0.2002
Epoch 24/30, Loss: 0.1930
Epoch 25/30, Loss: 0.1872
Epoch 26/30, Loss: 0.1843
Epoch 27/30, Loss: 0.1824
Epoch 28/30, Loss: 0.1792
Epoch 29/30, Loss: 0.1725
Epoch 30/30, Loss: 0.1707

=== MLP 评估 (FFF-2013J) ===
              precision    recall  f1-score   support

           0     0.7768    0.8359    0.8053       329
           1     0.8369    0.7781    0.8064       356

    accuracy                         