# 生成config

In [2]:
import torch
import json
import os

def generate_config(weights_path='../best_model/best_hybrid_model_fold_5.pt',
                    output_dir='model_files'):
    """Write ``config.json`` for the hybrid BERT model into ``output_dir``.

    The number of extra (non-BERT) input features is recovered from the
    checkpoint: this function treats the input width of ``classifier.0`` as
    ``hidden_size + n_features`` and subtracts the BERT hidden size (768).

    Args:
        weights_path: Path to a saved state dict containing the key
            ``'classifier.0.weight'``. Defaults to the fold-5 checkpoint.
        output_dir: Directory that receives ``config.json``; created if it
            does not exist.

    Raises:
        KeyError: If the checkpoint has no ``'classifier.0.weight'`` entry.
        ValueError: If the classifier input is narrower than the BERT hidden
            size, which would give a negative feature count.
    """
    hidden_size = 768  # hidden width recorded for "bert-base-uncased" below

    os.makedirs(output_dir, exist_ok=True)

    # Only tensor shapes are needed here. weights_only=True keeps torch.load
    # from unpickling arbitrary objects and avoids the torch.load warning.
    state_dict = torch.load(weights_path, map_location='cpu', weights_only=True)
    classifier_in = state_dict['classifier.0.weight'].shape[1]
    n_features = classifier_in - hidden_size
    if n_features < 0:
        raise ValueError(
            f"classifier.0.weight has input width {classifier_in}, "
            f"which is smaller than the BERT hidden size {hidden_size}"
        )

    config = {
        "architectures": ["HybridBERTModel"],
        "model_type": "hybrid-bert",
        # Base encoder settings
        "bert_base_model": "bert-base-uncased",
        "hidden_size": hidden_size,
        # Model-specific settings
        "n_classes": 6,
        "n_features": n_features,
        "combined_dim": hidden_size + n_features,
        # Classifier head settings
        "classifier_config": {
            "hidden_layers": [512, 256],
            "dropout": 0.3,
            "output_dim": 6
        },
        # Version info
        "model_version": "hybrid-bert-base-v1.0"
    }

    # Write inside output_dir. The original wrote to the current working
    # directory even though it created, and reported, the model_files directory.
    config_path = os.path.join(output_dir, 'config.json')
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=4)

    print(f"config.json has been generated in {output_dir} directory")

# Generate config.json when this cell/script is run directly (not on import).
if __name__ == "__main__":
    generate_config()

  state_dict = torch.load('../best_model/best_hybrid_model_fold_5.pt', map_location='cpu')


config.json has been generated in model_files directory


In [4]:
import torch
import json
import os
import shutil
from transformers import BertTokenizer

def generate_all_files():
    """Populate the ``model_files`` directory with the deployable artefacts.

    Performed in order:

    1. Write ``model_files/feature_config.json`` holding the keyword lists
       (per complaint category) and the emotion-word lists.
    2. Copy the fold-5 checkpoint to ``model_files/pytorch_model.pt``.
    3. Load the ``bert-base-uncased`` tokenizer and save its files into
       ``./model_files``.

    Returns nothing; prints a confirmation message when finished.
    """

    def keyword_group(primary, phrases):
        # One category entry: single-word cues first, multi-word phrases second.
        return {"primary": primary, "phrases": phrases}

    os.makedirs("model_files", exist_ok=True)

    # Step 1: feature_config.json.
    # Per the original note, these lists come from the
    # ImprovedAdComplaintFeatures class (defined elsewhere, not visible here).
    category_keywords = {
        "misleading": keyword_group(
            [
                "misleading", "false", "incorrect", "inaccurate", "untrue",
                "deceptive", "misrepresent", "exaggerate", "misleads",
                "unsubstantiated", "wrong", "dishonest", "no proof",
            ],
            [
                "not true", "false claim", "misleading information",
                "wrong information", "cannot be substantiated",
            ],
        ),
        "social_responsibility": keyword_group(
            [
                "unsafe", "dangerous", "harmful", "irresponsible", "hazard",
                "risk", "safety", "health", "alcohol", "gambling",
            ],
            [
                "social responsibility", "public safety", "health risk",
                "safety concern", "unsafe practice",
            ],
        ),
        "placement": keyword_group(
            [
                "location", "place", "position", "display", "billboard",
                "visible", "screen", "site", "area", "distance",
            ],
            [
                "near school", "close to", "in front of", "next to",
                "wrong place",
            ],
        ),
        "children": keyword_group(
            [
                "child", "children", "kid", "minor", "young", "youth",
                "teen", "teenage", "student", "school", "parent",
            ],
            [
                "target children", "appeal to children", "child safety",
                "protect children", "school area",
            ],
        ),
        "taste_decency": keyword_group(
            [
                "offensive", "inappropriate", "vulgar", "explicit", "sexual",
                "violent", "disturbing", "graphic", "crude", "tasteless",
            ],
            [
                "sexually suggestive", "offensive content", "bad taste",
                "inappropriate content", "adult content",
            ],
        ),
    }

    intensity_and_concern = {
        "strong_negative": [
            "very", "extremely", "absolutely", "totally", "completely",
            "highly", "seriously", "strongly", "deeply", "gravely",
        ],
        "concern": [
            "worry", "concern", "afraid", "fear", "alarming",
            "dangerous", "risky", "threat", "problem", "issue",
        ],
    }

    feature_config = {
        "label_dict": category_keywords,
        "emotion_words": intensity_and_concern,
    }

    with open('model_files/feature_config.json', 'w') as out_file:
        json.dump(feature_config, out_file, indent=4)

    # Step 2: copy the trained weights next to the config.
    checkpoint_src = '../best_model/best_hybrid_model_fold_5.pt'
    checkpoint_dst = 'model_files/pytorch_model.pt'
    shutil.copy(checkpoint_src, checkpoint_dst)

    # Step 3: tokenizer files. save_pretrained writes the tokenizer's
    # vocabulary and configuration files (e.g. vocab.txt,
    # tokenizer_config.json) into the target directory.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_tokenizer.save_pretrained('./model_files')

    print("所有文件已生成在 model_files 目录中")

# Generate all model files when this cell/script is run directly (not on import).
if __name__ == "__main__":
    generate_all_files()

所有文件已生成在 model_files 目录中
