# 配置环境

In [None]:
#!conda create -n fasttext python

In [None]:
!pip install datasets #连不上huggingface就不用装了

In [None]:
!pip install -r requirements.txt

In [None]:
!pip install pandas pyarrow

In [None]:
# 安装huggingface_hub工具包
!pip install huggingface_hub

# 设置镜像环境变量（国内推荐清华镜像）
!export HF_ENDPOINT=https://hf-mirror.com

In [None]:
!pip install fasttext

# 下载数据

In [None]:
import os

# huggingface_hub reads HF_ENDPOINT when it is first imported, so the mirror
# has to be configured BEFORE `datasets` (which imports huggingface_hub) is loaded.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from datasets import load_dataset

# Stream the datasets instead of downloading them in full.
math_data = load_dataset(
    "open-web-math/open-web-math",
    split="train",
    streaming=True
).take(5000)  # keep only the first 5000 records

fineweb_data = load_dataset(
    "HuggingFaceFW/fineweb",
    split="train",
    streaming=True
).take(5000)  # keep only the first 5000 records

如果上面的在线下载出错，可以改为手动下载对应的 parquet 文件，再用下面的代码从本地读取。

In [None]:
import pandas as pd
import json
import re

In [None]:
def clean_text(text):
    """Normalise one raw document into a single clean line of text.

    Steps, in order: drop fenced code blocks, strip Markdown heading markers
    at the start of a line, remove URLs, remove U+FFFD replacement characters
    and bidirectional control characters, then collapse every run of
    whitespace (newlines included) into one space.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or a NaN coming from a missing DataFrame cell) is treated
            as an empty document.

    Returns:
        The cleaned text on a single line with no leading or trailing
        whitespace; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Remove fenced code blocks (``` ... ```); they may span several lines.
    text = re.sub(r'`{3}.*?`{3}', '', text, flags=re.DOTALL)
    # Strip Markdown heading markers. This runs while line breaks still exist,
    # so only a leading "#" run is removed and in-line uses such as "C#" survive.
    text = re.sub(r'^[ \t]*#{1,6}[ \t]+', '', text, flags=re.MULTILINE)
    # Remove URLs.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove the Unicode replacement character left behind by decoding errors.
    text = text.replace('\uFFFD', '')
    # Remove bidirectional control characters (e.g. U+200E).
    text = re.sub(r'[\u200e\u200f\u202a-\u202e]', '', text)
    # Collapse whitespace last so the gaps left by the removals above
    # do not survive as double spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [None]:
import csv
from sklearn.model_selection import train_test_split
def generate_labeled_train_test_set(parquet_path, train_path, test_path,
                                    label="__label__non_math", sample_size=1000, test_ratio=0.2):
    """Sample a parquet file and write labeled fastText train/test files.

    Every output line has the form ``"<label> <cleaned text>"``.

    Args:
        parquet_path: Parquet file containing a ``text`` column.
        train_path: Destination of the training split.
        test_path: Destination of the test split.
        label: fastText label token prepended to every line.
        sample_size: Maximum number of rows to draw; capped at the number of
            rows available in the file.
        test_ratio: Fraction of the sampled rows that goes to the test split.
    """
    df = pd.read_parquet(parquet_path)
    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    df = df.sample(min(sample_size, len(df)), random_state=42)

    # Prepend the label: without it the files are plain text, not labeled data.
    lines = label + " " + df["text"].apply(clean_text)

    # Split into train and test.
    train_lines, test_lines = train_test_split(lines, test_size=test_ratio, random_state=42)

    # Write the lines verbatim. Going through to_csv with QUOTE_NONE and an
    # escapechar would insert backslashes before every comma and backslash in the text.
    for path, subset in ((train_path, train_lines), (test_path, test_lines)):
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in subset)

    print(f"打标数据集生成完毕：")
    print(f"训练集（{len(train_lines)} 条）已保存到：{train_path}")
    print(f"测试集（{len(test_lines)} 条）已保存到：{test_path}")

In [None]:
# Build the non-math train/test files from the locally downloaded parquet file.
generate_labeled_train_test_set(
    parquet_path="000_00000.parquet",
    train_path="nonmath_train.txt",
    test_path="nonmath_test.txt",
    label="__label__non_math",  # or "__label__math"
    sample_size=25000,
    test_ratio=0.2
)


In [None]:
def initial_labeling(math_path, non_math_path, output_path, sample_size=5000):
    """Create a roughly labeled fastText training file from two parquet files.

    Every row sampled from ``math_path`` is labeled ``__label__math`` and every
    row sampled from ``non_math_path`` is labeled ``__label__non_math``. The
    output has one ``"<label> <cleaned text>"`` line per sample, in shuffled order.

    Args:
        math_path: Parquet file with a ``text`` column, treated as math samples.
        non_math_path: Parquet file with a ``text`` column, treated as non-math samples.
        output_path: Destination text file.
        sample_size: Maximum number of rows drawn from EACH file; capped at the
            number of rows available in that file.
    """
    def _load_labeled(path, label):
        """Sample one parquet file and return its rows as labeled fastText lines."""
        frame = pd.read_parquet(path)
        # DataFrame.sample raises when asked for more rows than exist, so cap the request.
        frame = frame.sample(min(sample_size, len(frame)), random_state=42)
        return label + " " + frame["text"].apply(clean_text)

    math_lines = _load_labeled(math_path, "__label__math")
    non_math_lines = _load_labeled(non_math_path, "__label__non_math")

    # Interleave the two classes so the file is not "all math, then all non-math".
    combined = pd.concat([math_lines, non_math_lines]).sample(frac=1, random_state=42)

    # Write the lines verbatim. Going through to_csv with QUOTE_NONE and an
    # escapechar would insert backslashes before every comma and backslash in the text.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in combined)

    print(f"初始粗打标完成！数学样本: {len(math_lines)}, 非数学样本: {len(non_math_lines)}")
    print(f"已保存到: {output_path}")

In [None]:
# Example usage: build the rough training file from the two local parquet files.
initial_labeling(
    math_path="train-00000-of-00114-5a023365406cb9c4.parquet",
    non_math_path="000_00000.parquet",
    output_path="fasttext_train_rough.txt",
    sample_size=50000
)

In [None]:
def generate_fineweb_test_set(parquet_path, output_path, exclude_path, test_size=5000):
    """Write an unlabeled test file of cleaned texts that do not occur in a training file.

    Args:
        parquet_path: Parquet file with a ``text`` column.
        output_path: Destination file; receives one cleaned text per line.
        exclude_path: Existing text file with one sample per line, optionally
            prefixed with fastText ``__label__*`` tokens. Texts found in it are
            kept out of the output.
        test_size: Maximum number of texts to sample; capped at the number of
            texts that remain after filtering.
    """
    texts = pd.read_parquet(parquet_path, columns=["text"])["text"].apply(clean_text)

    # Build the exclusion set from the bare text of each line in the exclude file.
    exclude_texts = set()
    with open(exclude_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            # Drop leading label tokens; otherwise no line can ever equal a cleaned text.
            body = re.sub(r'^(?:__label__\S+\s*)+', '', raw_line.strip())
            exclude_texts.add(body)
            # Files written via csv QUOTE_NONE with a backslash escapechar carry
            # backslash escapes; register the unescaped form as well.
            exclude_texts.add(re.sub(r'\\(.)', r'\1', body))

    # Keep non-empty texts that are not part of the exclusion set.
    texts = texts[(texts != "") & ~texts.isin(exclude_texts)]

    # DataFrame/Series.sample raises when asked for more rows than exist, so cap the request.
    texts = texts.sample(min(test_size, len(texts)), random_state=42)

    # Write only the text, one document per line (not every column of the parquet file).
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in texts)

    print(f"✅ 生成测试集 {len(texts)} 条（未与训练集重复）")

In [None]:
# Build the unlabeled test file, excluding texts listed in the rough training file.
generate_fineweb_test_set(
    parquet_path='000_00000.parquet',
    output_path='fasttext_text.txt',
    exclude_path='fasttext_train_rough.txt',
)

# 模型训练

In [None]:
import fasttext

# Hyper-parameters for the supervised fastText classifier.
train_params = dict(
    input="fasttext_train_rough.txt",  # path to the training data
    epoch=10,                          # number of passes over the data
    lr=1.0,                            # learning rate
    wordNgrams=2,                      # also use word bigrams as features
    verbose=2,
    minCount=1,                        # keep every word, however rare
)

# Train, then persist the model to disk.
model = fasttext.train_supervised(**train_params)
model.save_model("fasttext_math_classifier.bin")


In [None]:
# Evaluate the trained model on a labeled test file.
# NOTE(review): the generation cell earlier in this notebook writes
# "nonmath_test.txt"; confirm that "labeled_nonmath_test.txt" exists and is
# the intended file.
result = model.test("labeled_nonmath_test.txt")
n_samples, precision, recall = result
print(f"测试样本数: {n_samples}")
print(f"准确率: {precision:.4f}")
print(f"召回率: {recall:.4f}")


# 预测

In [None]:
pip install numpy==1.24.4 --force-reinstall

In [None]:
# Print the NumPy version that the running kernel imports.
# NOTE(review): if NumPy was already imported before the reinstall in the
# previous cell, the kernel presumably has to be restarted to pick up the
# pinned version — confirm.
import numpy as np
print(np.__version__)

In [None]:
import random
from collections import Counter

import fasttext

# ======== Configuration ========
model_path = "fasttext_math_classifier.bin"   # fastText model to load
txt_path = "labeled_nonmath_test.txt"         # input file, one document per line
sample_size = 10000                           # None means: process every line
# ===============================

# 1. Load the model.
model = fasttext.load_model(model_path)

# 2. Read the documents, dropping blank lines.
with open(txt_path, 'r', encoding='utf-8') as f:
    lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

# 3. Optionally down-sample with a fixed seed.
if sample_size:
    random.seed(42)
    lines = random.sample(lines, min(sample_size, len(lines)))

# 4. Predict the top label of every document.
labels = [model.predict(text)[0][0] for text in lines]

# 5. Count how often each label was predicted.
label_counts = Counter(labels)

print("✅ fastText 分类统计结果：")
for predicted, n_docs in label_counts.items():
    print(f"{predicted}: {n_docs} 条")


In [None]:
# List every text of the unlabeled test file that the model classifies as math.
model_path = "fasttext_math_classifier.bin"
# The file name must not carry an extra pair of quotes inside the string,
# otherwise open() looks for a file literally named 'fasttext_text.txt' (quotes included).
txt_path = "fasttext_text.txt"
model = fasttext.load_model(model_path)

# Use a cell-specific name instead of `lines`: the save cell at the end of the
# notebook pairs the globals `lines` and `labels`, and overwriting only one of
# them here would silently misalign the two.
with open(txt_path, 'r', encoding='utf-8') as f:
    candidate_lines = [line.strip() for line in f if line.strip()]

predictions = []
for line in candidate_lines:
    label, prob = model.predict(line)
    predictions.append((line, label[0]))

# Keep only the texts predicted as __label__math.
math_texts = [line for line, label in predictions if label == "__label__math"]

# Print them.
print(f"\n✅ 模型预测为 __label__math 的文本，共 {len(math_texts)} 条：\n")
for i, line in enumerate(math_texts, 1):
    print(f"{i}. {line}")


In [None]:
# Save each predicted label next to its text, tab-separated, one pair per line.
# Expects `lines` (list of texts) and `labels` (predicted label per text) to
# exist already and to come from the same prediction run.

output_path = "predicted_labeled_output.txt"

# zip() silently stops at the shorter input, which would hide a mismatch
# between `labels` and `lines` produced by different cells.
if len(labels) != len(lines):
    raise ValueError(
        f"labels ({len(labels)}) and lines ({len(lines)}) differ in length; "
        "re-run the prediction cell that defines both before saving."
    )

with open(output_path, "w", encoding="utf-8") as f:
    for label, line in zip(labels, lines):
        f.write(f"{label}\t{line.strip()}\n")

print(f"已保存结果到: {output_path}")
