In [3]:
# ==== Parameters: adjust as needed ====
# Input data. VALID_CSV is optional: set it to None, or leave it pointing at a
# path that does not exist, to run on the training set only.
TRAIN_CSV = "../UJIIndoorLoc/trainingData.csv"
VALID_CSV = "../UJIIndoorLoc/validationData.csv"

# Destination of the per-building summary table.
OUTPUT_CSV = "./performance/uji_avg_valid_rssi_per_building.csv"

# Column naming.
WAP_PREFIX = "WAP"             # prefix shared by the WAP columns
BUILDING_COL = "BUILDINGID"    # building identifier column
LON_COL = "LONGITUDE"
LAT_COL = "LATITUDE"

# Optional: set to True to keep each longitude/latitude pair only once.
DEDUP_BY_LONLAT = False

In [4]:
# ==== 下面一般不需要改 ====
import os
import pandas as pd
import numpy as np

def _load_csv(path):
    if path is None:
        return None
    if not os.path.exists(path):
        print(f"[Warn] 文件不存在：{path}（将跳过）")
        return None
    df = pd.read_csv(path)
    print(f"[Info] 加载 {path} -> 形状 {df.shape}")
    return df

def _find_wap_cols(df, prefix="WAP"):
    return [c for c in df.columns if isinstance(c, str) and c.upper().startswith(prefix.upper())]

# 1) Load the data sets. A path that is None or missing yields None and is skipped.
df_train = _load_csv(TRAIN_CSV)
df_valid = _load_csv(VALID_CSV)

dfs = [d for d in [df_train, df_valid] if d is not None]
if not dfs:
    raise FileNotFoundError("未找到任何数据文件，请检查 TRAIN_CSV / VALID_CSV 路径。")

# 2) Merge everything that was loaded into a single frame.
df_all = pd.concat(dfs, ignore_index=True) if len(dfs) > 1 else dfs[0]
print(f"[Info] 合并后总记录数: {len(df_all)}")

# 3) Identify the WAP columns.
wap_cols = _find_wap_cols(df_all, prefix=WAP_PREFIX)
if not wap_cols:
    raise ValueError("未识别到任何 WAP 列，请检查列名是否以 'WAP' 开头或调整 WAP_PREFIX。")
print(f"[Info] 识别到 WAP 列数量: {len(wap_cols)}（示例：{wap_cols[:5]} ...）")

# Fail fast: the grouping column is required, so check it before doing any work.
if BUILDING_COL not in df_all.columns:
    raise KeyError(f"未找到楼栋列: {BUILDING_COL}")

# 4) (Optional) keep only one record per longitude/latitude position.
if DEDUP_BY_LONLAT and LON_COL in df_all.columns and LAT_COL in df_all.columns:
    before = len(df_all)
    df_all = df_all.drop_duplicates(subset=[LON_COL, LAT_COL])
    print(f"[Info] 去重经纬度: {before} -> {len(df_all)} 记录")
elif DEDUP_BY_LONLAT:
    print(f"[Warn] 未找到 {LON_COL}/{LAT_COL} 列，跳过位置去重。")

# 5) Count the valid RSSI readings of every record.
#    A reading equal to INVALID_RSSI is not valid. NaN is excluded as well:
#    it appears when the concatenated files do not share the same WAP columns
#    (or a cell is empty), and `NaN != 100` would otherwise count it as valid.
INVALID_RSSI = 100
wap_values = df_all[wap_cols].to_numpy()
valid_mask = (wap_values != INVALID_RSSI) & ~pd.isna(wap_values)   # True = valid
# .assign() builds a new frame, so the loaded df_train / df_valid are never
# modified in place and no write happens on the result of drop_duplicates.
df_all = df_all.assign(valid_rssi_count=valid_mask.sum(axis=1))

# 6) Average the per-record counts for each building.
result = (df_all.groupby(BUILDING_COL)["valid_rssi_count"]
          .agg(avg_valid_rssi_per_record="mean", n_records="count")
          .reset_index()
          .sort_values(BUILDING_COL))
result["avg_valid_rssi_per_record"] = result["avg_valid_rssi_per_record"].round(2)

# 7) Print and save.
print("\n=== 平均每条记录的有效 RSSI 个数（按楼栋） ===")
print(result.to_string(index=False))

# Create the output directory first; to_csv does not create missing folders.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
result.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
print(f"\n[Info] 结果已保存至: {OUTPUT_CSV}")

# 8) (Optional) overall average across all records.
overall_mean = round(float(df_all["valid_rssi_count"].mean()), 2)
print(f"[Info] 总体平均每条记录的有效RSSI个数: {overall_mean}")


[Info] 加载 ../UJIIndoorLoc/trainingData.csv -> 形状 (19937, 529)
[Info] 加载 ../UJIIndoorLoc/validationData.csv -> 形状 (1111, 529)
[Info] 合并后总记录数: 21048
[Info] 识别到 WAP 列数量: 520（示例：['WAP001', 'WAP002', 'WAP003', 'WAP004', 'WAP005'] ...）

=== 平均每条记录的有效 RSSI 个数（按楼栋） ===
 BUILDINGID  avg_valid_rssi_per_record  n_records
          0                      15.95       5785
          1                      16.44       5503
          2                      19.91       9760

[Info] 结果已保存至: ./performance/uji_avg_valid_rssi_per_building.csv
[Info] 总体平均每条记录的有效RSSI个数: 17.91
