# 对数学文本更细致地划分

In [None]:
import ast
import csv
import json
import os
import re

import pandas as pd
from tqdm import tqdm

In [None]:
def clean_text(text):
    """Normalize raw text into a single, whitespace-collapsed line.

    The following are removed, in this order: U+FFFD replacement
    characters, ``http(s)://`` and ``www.`` URLs, spans enclosed in triple
    backticks, runs of ``#`` together with any whitespace that follows
    them, and Unicode bidirectional control marks (U+200E, U+200F,
    U+202A-U+202E). Whitespace, including newlines, is collapsed to single
    spaces afterwards.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string with no leading/trailing whitespace, or ``""``
        when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.replace('\uFFFD', '')
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'`{3}.*?`{3}', '', text, flags=re.DOTALL)
    text = re.sub(r'#+\s*', '', text)
    text = re.sub(r'[\u200e\u200f\u202a-\u202e]', '', text)
    # Collapse whitespace only after every removal: deleting a URL, fence or
    # control mark that sat between two spaces would otherwise leave a
    # double space in the output.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
def _parse_metadata_string(raw):
    """Parse a metadata string into a Python object, or return None on failure."""
    parsers = (
        # Strict JSON first, so valid JSON containing apostrophes is not
        # damaged by the quote swap below.
        json.loads,
        # Python-repr dicts: single quotes plus True/False/None.
        ast.literal_eval,
        # Legacy behaviour: single-quoted text that uses JSON literals.
        lambda s: json.loads(s.replace("'", "\"")),
    )
    for parse in parsers:
        try:
            return parse(raw)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            continue
    return None


def process_metadata(metadata_obj):
    """Report whether a record's metadata marks it as containing math.

    Args:
        metadata_obj: Either a dict, or a string holding a JSON object or a
            Python-literal dict. Any other type is treated as "no math".

    Returns:
        The value stored at ``metadata["extraction_info"]["found_math"]``,
        or ``False`` when the metadata cannot be parsed, is not a dict, or
        lacks that entry. This function never raises on malformed input.
    """
    if isinstance(metadata_obj, dict):
        metadata = metadata_obj
    elif isinstance(metadata_obj, str):
        metadata = _parse_metadata_string(metadata_obj)
    else:
        return False

    if not isinstance(metadata, dict):
        return False
    extraction_info = metadata.get("extraction_info", {})
    if not isinstance(extraction_info, dict):
        return False
    return extraction_info.get("found_math", False)


In [None]:
def _load_sample(path, required_columns, sample_size):
    """Read a parquet file, drop rows missing required columns, then sample."""
    df = pd.read_parquet(path).dropna(subset=required_columns)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so clamp to the number of usable rows.
    return df.sample(min(sample_size, len(df)), random_state=42)


def convert_multiple_to_fasttext(
    metadata_file,          # parquet path whose rows carry a "metadata" column
    other_files,            # list of parquet paths without metadata
    output_path,            # destination of the fastText training file
    metadata_sample=5000,   # rows to sample from the metadata file
    other_sample=5000       # rows to sample from each non-metadata file
):
    """Build a fastText supervised-training file from several parquet files.

    Rows of ``metadata_file`` are labelled ``__label__math`` or
    ``__label__non_math`` according to ``process_metadata``; every row of
    the files in ``other_files`` is labelled ``__label__non_math``. Text is
    passed through ``clean_text`` and rows whose cleaned text is empty are
    discarded. One ``"<label> <text>"`` line per row is written to
    ``output_path`` and a label-count summary is printed.

    Rows with missing values are dropped *before* sampling, and the sample
    size is clamped to the number of usable rows, so a file smaller than
    the requested sample is used in full instead of raising.

    Args:
        metadata_file: Parquet path with ``text`` and ``metadata`` columns.
        other_files: Iterable of parquet paths with a ``text`` column.
        output_path: Path of the text file to write (UTF-8, overwritten).
        metadata_sample: Maximum number of rows taken from ``metadata_file``.
        other_sample: Maximum number of rows taken from each other file.
    """
    # Labelled source: the label comes from each row's metadata.
    print(f"正在处理带 metadata 的文件：{metadata_file}")
    df_meta = _load_sample(metadata_file, ['text', 'metadata'], metadata_sample)
    df_meta["label"] = [
        "__label__math" if process_metadata(meta) else "__label__non_math"
        for meta in df_meta["metadata"]
    ]
    df_meta["clean_text"] = df_meta["text"].apply(clean_text)

    all_dfs = [df_meta[["label", "clean_text"]]]

    # Unlabelled sources: every row counts as a negative example.
    for f in other_files:
        print(f"正在处理非 metadata 文件：{f}")
        df = _load_sample(f, ['text'], other_sample)
        df["label"] = "__label__non_math"
        df["clean_text"] = df["text"].apply(clean_text)
        all_dfs.append(df[["label", "clean_text"]])

    # Merge everything and discard rows whose text cleaned down to nothing.
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df = combined_df[combined_df["clean_text"].str.strip() != ""]

    # Zipping the two columns avoids building a Series per row (iterrows).
    with open(output_path, 'w', encoding='utf-8') as f:
        rows = zip(combined_df["label"], combined_df["clean_text"])
        f.writelines(
            f"{label} {text}\n"
            for label, text in tqdm(rows, total=len(combined_df))
        )

    # Summary of the label distribution.
    math_count = (combined_df["label"] == "__label__math").sum()
    non_math_count = (combined_df["label"] == "__label__non_math").sum()
    print(f"\n✅ 已生成 fastText 训练文件: {output_path}")
    print(f"标签分布：数学样本 {math_count}，非数学样本 {non_math_count}")


In [None]:
# Build the combined training file: one parquet shard with metadata-derived
# labels plus one shard whose rows are all treated as non-math.
convert_multiple_to_fasttext(
    "train-00000-of-00114-5a023365406cb9c4.parquet",
    ["000_00000.parquet"],
    "combined_fasttext.txt",
    metadata_sample=40000,
    other_sample=30000,
)


# 训练模型

In [None]:
import fasttext

# Fit a supervised fastText classifier on the generated training file.
model = fasttext.train_supervised(
    input="combined_fasttext.txt",
    lr=1.0,            # learning rate
    epoch=10,          # passes over the training data; tune to the data size
    wordNgrams=2,      # add word-bigram features
    minCount=1,        # keep every word, however rare
    verbose=2,
)

# Persist the trained model to disk.
model.save_model("fasttext_math_pro_classifier.bin")


In [None]:
# NOTE(review): this scores the model on "combined_fasttext.txt", the same
# file passed to train_supervised, so the figures are not held-out metrics.
result = model.test("combined_fasttext.txt")
n_samples, precision, recall = result
print(f"测试样本数: {n_samples}")
print(f"准确率: {precision:.4f}")
print(f"召回率: {recall:.4f}")


In [None]:
from sklearn.metrics import roc_auc_score
import fasttext

# ROC AUC of the classifier over the labelled fastText file, treating
# "math" as the positive class.
y_true = []
y_score = []

with open("combined_fasttext.txt", "r", encoding="utf-8") as f:
    for line in f:
        tokens = line.split()
        if not tokens:
            # Blank line: there is no label to read, so skip it rather than
            # fail on tokens[0].
            continue
        label = tokens[0].replace("__label__", "")
        text = " ".join(tokens[1:])
        pred_label, pred_prob = model.predict(text, k=1)

        # Ground truth: math -> 1, anything else -> 0.
        y_true.append(1 if label == "math" else 0)

        # Score is the probability assigned to "math". When the top label is
        # the other class its complement is used, which assumes the model
        # has exactly two labels.
        if pred_label[0] == "__label__math":
            y_score.append(pred_prob[0])
        else:
            y_score.append(1 - pred_prob[0])
auc = roc_auc_score(y_true, y_score)
print(f"AUC: {auc:.4f}")


In [None]:
import fasttext
from collections import Counter

# ======== Configuration ========
model_path = "fasttext_math_pro_classifier.bin"   # path of the saved fastText model
txt_path = "labeled_nonmath_test.txt"             # input file, one text per line
sample_size = 10000                               # None means: use every line
# ===============================

# 1. Load the model.
model = fasttext.load_model(model_path)

# 2. Read the input, keeping only non-blank lines (stripped).
with open(txt_path, 'r', encoding='utf-8') as f:
    lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

# 3. Optionally down-sample, with a fixed seed for reproducibility.
if sample_size:
    import random
    random.seed(42)
    lines = random.sample(lines, min(sample_size, len(lines)))

# 4. Predict the top label of every line.
labels = [model.predict(line)[0][0] for line in lines]

# 5. Count how often each label was predicted.
label_counts = Counter(labels)

print("✅ fastText 分类统计结果：")
for label, count in label_counts.items():
    print(f"{label}: {count} 条")


In [None]:
# Assumes `lines` (list of texts) and `model` are already defined by an
# earlier cell.
CONFIDENCE_THRESHOLD = 0.85  # top-label probabilities below this count as non-math

predictions = []
for line in lines:
    label, prob = model.predict(line)
    if prob[0] < CONFIDENCE_THRESHOLD:
        # Low confidence: fall back to the negative class. The full label
        # string is stored; indexing it with [0] would store only "_".
        predicted = "__label__non_math"
    else:
        predicted = label[0]

    predictions.append((line, predicted))

# Keep only the texts confidently predicted as __label__math.
math_texts = [line for line, label in predictions if label == "__label__math"]

# Print them out.
print(f"\n✅ 模型预测为 __label__math 的文本，共 {len(math_texts)} 条：\n")
for i, line in enumerate(math_texts, 1):
    print(f"{i}. {line}")
