In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm


root = Path('/mnt/j/workspace/2025math/data')


def _concat_safe(lst):
    lst = [x for x in lst if isinstance(x, pd.DataFrame) and len(x)]
    return pd.concat(lst, ignore_index=True) if lst else pd.DataFrame()


def process_excel_files_to_csv():
    train_rows_non, train_rows_txbf = [], []
    valid_rows_non, valid_rows_txbf = [], []

    print('开始处理数据，根目录:', root)
    source_dirs = [d for d in root.iterdir() if d.is_dir()]
    print('找到', len(source_dirs), '个来源文件夹')

    all_files_to_process = []
    for source_dir in source_dirs:
        terminal_dirs = [d for d in source_dir.iterdir() if d.is_dir()]
        for terminal_dir in terminal_dirs:
            terminal_id = terminal_dir.name
            train_dir = terminal_dir / 'train'
            if train_dir.exists():
                for f in train_dir.glob('*.xlsx'):
                    all_files_to_process.append((f, 'train', terminal_id))
            valid_dir = terminal_dir / 'valid'
            if valid_dir.exists():
                for f in valid_dir.glob('*.xlsx'):
                    all_files_to_process.append((f, 'valid', terminal_id))

    print('总共需要处理', len(all_files_to_process), '个Excel文件')

    for excel_file, split_type, terminal_id in tqdm(all_files_to_process, desc='处理文件进度'):
        try:
            df = pd.read_excel(excel_file,engine='openpyxl')

            # 添加终端名称列
            df['terminal_id'] = terminal_id
            df['source_file'] = str(excel_file)

            # 波束赋型开启指示列
            df['beamforming_en'] = df.get('beamforming_en', 0)
            df['beamforming_en'] = pd.to_numeric(df['beamforming_en'], errors='coerce').fillna(0).astype(int)

            # 按条件拆分数据
            df_txbf = df[df['beamforming_en'] == 1].copy()
            df_non = df[df['beamforming_en'] == 0].copy()

            # 训练 / 验证分别追加
            if split_type == 'train':
                if len(df_txbf): train_rows_txbf.append(df_txbf)
                if len(df_non): train_rows_non.append(df_non)
            else:
                if len(df_txbf): valid_rows_txbf.append(df_txbf)
                if len(df_non): valid_rows_non.append(df_non)

        except Exception as e:
            print('错误处理文件', excel_file, ':', e)

    # 合并结果并保存为 CSV
    train_non = _concat_safe(train_rows_non)
    train_txbf = _concat_safe(train_rows_txbf)
    valid_non = _concat_safe(valid_rows_non)
    valid_txbf = _concat_safe(valid_rows_txbf)

    train_non_csv = root / 'train_all_non_txbf.csv'
    train_txbf_csv = root / 'train_all_txbf.csv'
    valid_non_csv = root / 'valid_all_non_txbf.csv'
    valid_txbf_csv = root / 'valid_all_txbf.csv'
    combined_csv = root / 'combined_all.csv'

    if len(train_non): train_non.to_csv(train_non_csv, index=False)
    if len(train_txbf): train_txbf.to_csv(train_txbf_csv, index=False)
    if len(valid_non): valid_non.to_csv(valid_non_csv, index=False)
    if len(valid_txbf): valid_txbf.to_csv(valid_txbf_csv, index=False)

    combined_all = _concat_safe([train_non, train_txbf, valid_non, valid_txbf])
    if len(combined_all): combined_all.to_csv(combined_csv, index=False)

    print('数据处理完成！')
    print('训练-非TxBF 行数:', len(train_non))
    print('训练- TxBF 行数:', len(train_txbf))
    print('验证-非TxBF 行数:', len(valid_non))
    print('验证- TxBF 行数:', len(valid_txbf))
    print('总数据行数:', len(combined_all))

if __name__ == "__main__":
    process_excel_files_to_csv()


开始处理数据，根目录: /mnt/j/workspace/2025math/data
找到 2 个来源文件夹
总共需要处理 19 个Excel文件


处理文件进度:  11%|█         | 2/19 [00:16<02:16,  8.02s/it]

错误处理文件 /mnt/j/workspace/2025math/data/notxbf_com_excels_f4/341c-f0d4-70be/valid/~$341c-f0d4-70be_0.xlsx : File is not a zip file


处理文件进度: 100%|██████████| 19/19 [01:40<00:00,  5.30s/it]


数据处理完成！
训练-非TxBF 行数: 20369
训练- TxBF 行数: 27549
验证-非TxBF 行数: 20369
验证- TxBF 行数: 27549
总数据行数: 95836


In [12]:
def quick_diagnose(sinr_list, y, alpha_probe=1.0, beta_probe=1.0, domain_hint="db"):
    import numpy as np
    from scipy.stats import spearmanr

    lens = np.array([np.isfinite(s).sum() for s in sinr_list])
    means = np.array([np.nanmean(s) for s in sinr_list])
    stds  = np.array([np.nanstd(s)  for s in sinr_list])

    print("[DGN] y stats: mean/std/min/max =", np.nanmean(y), np.nanstd(y), np.nanmin(y), np.nanmax(y))
    print("[DGN] SC len:  min/median/max    =", np.nanmin(lens), np.median(lens), np.nanmax(lens))
    print("[DGN] SC mean:  mean±std ≈       =", np.nanmean(means), np.nanstd(means))
    print("[DGN] SC std:   mean±std ≈       =", np.nanmean(stds), np.nanstd(stds))

    def eesm_xeff_list(slist, alpha, beta, db=True):
        out = []
        for s in slist:
            s = np.asarray(s, float)
            s = s[np.isfinite(s)]
            if s.size == 0: out.append(np.nan); continue
            if db:
                x = -s / max(beta,1e-12)
                m = np.max(x); lme = m + np.log(np.mean(np.exp(x - m)))
                out.append(-alpha*lme)
            else:
                s_lin = 10.0**(s/10.0)  # dB->linear
                x = -s_lin / max(beta,1e-12)
                m = np.max(x); lme = m + np.log(np.mean(np.exp(x - m)))
                out.append(-alpha*lme)
        return np.array(out, float)

    xeff_db  = eesm_xeff_list(sinr_list, alpha_probe, beta_probe, db=True)
    xeff_lin = eesm_xeff_list(sinr_list, alpha_probe, beta_probe, db=False)

    sp_db  = spearmanr(xeff_db,  y, nan_policy="omit").correlation
    sp_lin = spearmanr(xeff_lin, y, nan_policy="omit").correlation
    print(f"[DGN] Spearman(xeff_db, y)  = {sp_db:.4f}")
    print(f"[DGN] Spearman(xeff_lin, y) = {sp_lin:.4f}")
    print("[DGN] xeff_db  mean/std/min/max =", np.nanmean(xeff_db),  np.nanstd(xeff_db),  np.nanmin(xeff_db),  np.nanmax(xeff_db))
    print("[DGN] xeff_lin mean/std/min/max =", np.nanmean(xeff_lin), np.nanstd(xeff_lin), np.nanmin(xeff_lin), np.nanmax(xeff_lin))


In [4]:
!python .\models\eesm.py \
  --train '/mnt/j/workspace/2025math/datasets/train_all_non_txbf.csv' \
  --valid '/mnt/j/workspace/2025math/datasets/valid_all_non_txbf.csv' \
  --per_sc_col sinr_per_sc_non \
  --label_col mcs \
  --outdir .results/eesm_out

Grid α–β (Spearman): 100%|██████████████████| 1008/1008 [03:52<00:00,  4.34it/s]
Grid α–β (Spearman): 100%|████████████████| 11220/11220 [40:38<00:00,  4.60it/s]
[NM-rank   100] loss=0.240609 α=0.2405 β=3.195
[NM-rank   200] loss=0.24061 α=0.2405 β=3.195
[NM-rank   300] loss=0.240631 α=0.08945 β=3.094
[NM-rank   400] loss=0.24062 α=0.08945 β=3.094
[NM-rank   500] loss=0.240608 α=0.08945 β=3.094
[NM-rank   600] loss=0.240608 α=0.08945 β=3.094
[NM-rank   700] loss=0.240608 α=0.08945 β=3.094
[NM-rank   800] loss=0.240608 α=0.08945 β=3.094
[NM-rank   900] loss=0.240608 α=0.08945 β=3.094


In [13]:
import pandas as pd
from pathlib import Path



def parse_float_array_from_string(s: str):
    import numpy as np, re
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return np.full(0, np.nan)
    if isinstance(s, (list, np.ndarray)):
        return np.array(s, dtype=float)
    s = str(s).strip()
    if not s:
        return np.full(0, np.nan)
    try:
        parts = s.replace("[","").replace("]","").split(",")
        return np.array([float(p) for p in parts if p.strip() != ""], dtype=float)
    except Exception:
        ss = s.replace("[","").replace("]","")
        parts = re.split(r"[, \t]+", ss)
        vals = []
        for p in parts:
            try:
                vals.append(float(p))
            except Exception:
                continue
        return np.array(vals, dtype=float)

# -----------------------------
train_path = Path("/mnt/j/workspace/2025math/datasets/train_all_non_txbf.csv")
per_sc_col = "sinr_per_sc_non"   
label_col  = "mcs"               
# -----------------------------

df = pd.read_csv(train_path)

sinr_list = [parse_float_array_from_string(s) for s in df[per_sc_col]]
y = pd.to_numeric(df[label_col], errors="coerce").values

quick_diagnose(sinr_list, y)


[DGN] y stats: mean/std/min/max = 320.9596985615396 98.15744074583127 103.2 619.4
[DGN] SC len:  min/median/max    = 122 122.0 122
[DGN] SC mean:  mean±std ≈       = 93069.81729650633 725513.7899031631
[DGN] SC std:   mean±std ≈       = 2.964404670616789e-11 2.7615410022012386e-10




[DGN] Spearman(xeff_db, y)  = 0.7593
[DGN] Spearman(xeff_lin, y) = 0.6146
[DGN] xeff_db  mean/std/min/max = 93069.81729650633 725513.7899031631 0.238264857411604 13398617.5663131
[DGN] xeff_lin mean/std/min/max = 7.205044604308805e+298 inf 1.056395361828082 9.071596671309913e+299


  sqr = np.multiply(arr, arr, out=arr)
