In [20]:
import json

input_file = "filtered_processed_comments.json"
n = 10  # 读取前几条

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)  # 一次性加载整个文件

# 确保是列表
if isinstance(data, list):
    sample = data[:n]
else:
    raise ValueError("文件内容不是 JSON 数组")

print(f"成功读取 {len(sample)} 条数据：")
for i, item in enumerate(sample, 1):
    print(f"{i}:")
    print(json.dumps(item, ensure_ascii=False, indent=2))


成功读取 10 条数据：
1:
{
  "type": "INTP",
}
2:
{
  "type": "ENTP",
  "posts": "It is just arguing semantics. To many on the consumer end, beta and production phase both share the same fact that they are not mainstream release ready|||Interesting you say that. I am a male sub. Many guys who are into that are wanting to try it but not into that regularly rather than just a fantasy to try. I also have observed that J types are more often dom than P types|||Are you a jack of all trades? probably not. Asking me to make the bot is like me asking you to figure out to treat your own child who hypothetically has a medical condition, assuming you are not a doctor.|||What to? Sorry I am exhausted and just looked for something global before I might get timed out|||One game I played actually adjusted the stamina recharge speed once hitting a certain level. For now I can just time my event choices to empty my stamina and level up and the end of every other stamina bar but I am soon going to need every oth

In [21]:
import json

# 读取原始 JSON 文件
with open("filtered_processed_comments.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 截取前 10 条数据
first_10 = data[:10]

# 保存到新文件
with open("first_10_comments.json", "w", encoding="utf-8") as f:
    json.dump(first_10, f, ensure_ascii=False, indent=2)

print("前 10 条数据已保存到 first_10_comments.json")


前 10 条数据已保存到 first_10_comments.json


In [1]:
import json
import re

MIN_WORDS = 300
MAX_WORDS = 500

def clean_text(text):
    """简单清理文本"""
    text = re.sub(r"http\S+", "", text)  # 去掉URL
    text = re.sub(r"\s+", " ", text)     # 压缩空格
    return text.strip()

def split_and_combine(posts_str):
    """按 ||| 切分并合并到  词左右"""
    units = posts_str.split("|||")
    combined = []
    buffer = []
    word_count = 0

    for unit in units:
        cleaned = clean_text(unit)
        words = cleaned.split()
        length = len(words)

        if length >= MAX_WORDS:
            # 单条超长直接输出
            if buffer:
                combined.append("|||".join(buffer))
                buffer = []
                word_count = 0
            combined.append(cleaned)
            continue

        if word_count + length <= MAX_WORDS:
            buffer.append(cleaned)
            word_count += length
        else:
            if buffer:
                combined.append("|||".join(buffer))
            buffer = [cleaned]
            word_count = length

    if buffer:
        combined.append("|||".join(buffer))

    return combined

# ====== 处理数据集 ======
input_file = "filtered_processed_comments.json"   # 原数据文件
output_file = "filtered_processed_comments_300.json" # 处理后文件

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

new_data = []
for item in data:
    type_ = item["type"]
    posts = item["posts"]
    split_posts = split_and_combine(posts)
    for sp in split_posts:
        new_data.append({
            "type": type_,
            "posts": sp
        })

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"完成！输出到 {output_file}，共生成 {len(new_data)} 条记录。")


完成！输出到 filtered_processed_comments_300.json，共生成 1261235 条记录。


In [2]:
import ijson
import random
from collections import defaultdict
import json

# 参数
input_json = "filtered_processed_comments_300_cleaned.json"
output_json = "sampled_3000_per_type.json"
sample_size = 3000

# 按 type 分组（用 defaultdict 存储）
type_dict = defaultdict(list)

# 流式读取 JSON 数组
with open(input_json, 'r', encoding='utf-8') as f:
    objects = ijson.items(f, 'item')  # 针对 JSON 数组里的每个对象
    for obj in objects:
        t = obj.get('type')
        if t:
            type_dict[t].append(obj)

# 按每个 type 抽样
sampled_data = []
for t, items in type_dict.items():
    sampled_items = random.sample(items, min(len(items), sample_size))
    sampled_data.extend(sampled_items)

# 保存结果
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=2)

print(f"完成抽样，每个 type {sample_size} 条（不足则保留全部），结果保存到 {output_json}")


完成抽样，每个 type 3000 条（不足则保留全部），结果保存到 sampled_3000_per_type.json


In [14]:
import json

input_file = "filtered_processed_comments_cleaned.json"
output_file = "sample_10.json"

count = 0
sample_data = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            try:
                obj = json.loads(line.strip())  # 尝试解析
                if isinstance(obj, dict):  # 确保是字典
                    sample_data.append(obj)
                    count += 1
            except json.JSONDecodeError:
                continue  # 解析失败直接跳过
        if count >= 10:
            break

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(sample_data, f, ensure_ascii=False, indent=2)

print(f"已保存前 10 条数据到 {output_file}")


已保存前 10 条数据到 sample_10.json


In [7]:
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print(type(data))
print(len(data) if hasattr(data, "__len__") else data)
print(type(data[0]))
print(data[0])


<class 'list'>
10
<class 'str'>
[


In [13]:
import json
import re
import os

# 配置
input_path = "sample_10.json"   # 原始数据
output_path = "sample_10_reprocessed.json"
MIN_WORDS = 200
MAX_WORDS = 400

def clean_text(text):
    """清理文本：去掉URL、非字母数字符号、多余空格"""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s,.!?]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def combine_with_semantics(posts_list, min_words=MIN_WORDS, max_words=MAX_WORDS):
    """按语义边界（|||）拼接到[min_words, max_words]词"""
    combined_texts = []
    buffer = []
    word_count = 0

    for post in posts_list:
        cleaned = clean_text(post)
        words = cleaned.split()

        if len(words) >= max_words:
            if buffer:
                combined_texts.append(" ".join(buffer).strip())
                buffer = []
                word_count = 0
            combined_texts.append(cleaned)
            continue

        if word_count + len(words) <= max_words:
            buffer.append(cleaned)
            word_count += len(words)
        else:
            combined_texts.append(" ".join(buffer).strip())
            buffer = [cleaned]
            word_count = len(words)

    if buffer:
        combined_texts.append(" ".join(buffer).strip())

    return combined_texts

def load_json_flexibly(path):
    """
    自动识别并加载文件：
    - 如果是 JSON 数组（被换行分隔的），拼接成一行解析
    - 如果是 JSON Lines，逐行解析
    """
    with open(path, 'r', encoding='utf-8') as f:
        first_char = f.read(1)
        f.seek(0)  # 回到文件开头

        if first_char == '[':
            # 多行 JSON 数组 → 拼接成一行解析
            json_str = ""
            for line in f:
                stripped = line.strip()
                if stripped:
                    json_str += stripped
            return json.loads(json_str)

        else:
            # JSON Lines → 逐行解析
            data = []
            for line in f:
                line = line.strip()
                if line:
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError:
                        print(f"跳过无法解析的行: {line[:50]}...")
            return data

# 加载数据（自动识别格式）
data = load_json_flexibly(input_path)
print(f"成功加载 {len(data)} 条数据")

# 清洗并生成新数据
new_data = []
for item in data:
    author_type = item.get("type", "")
    post_string = item.get("posts", "")

    posts_list = post_string.split("|||")
    combined_posts = combine_with_semantics(posts_list, MIN_WORDS, MAX_WORDS)
    new_post_cleaned = " ".join(combined_posts)

    new_data.append({
        "type": author_type,
        "posts": post_string,
        "posts_cleaned": new_post_cleaned
    })

# 保存结果
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"处理完成，已保存到 {output_path}")


成功加载 10 条数据


AttributeError: 'str' object has no attribute 'get'