In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import re
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import pickle
from scipy import sparse
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
import torchvision.transforms as transforms
from torchvision import models
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

randomseed = 42  # seed constant; not referenced elsewhere in the visible code

# 1. Load the test-set guid list
print("=== 加载测试集 ===")


def load_test_guid(test_file_path='./data/test_without_label.txt'):
    """Read the test-set index file into a DataFrame.

    Args:
        test_file_path: Location of the CSV-formatted index, handed
            straight to ``pd.read_csv``. The ``guid`` column is parsed
            as ``str``.

    Returns:
        The parsed DataFrame. Its row count is printed as a side effect.
    """
    guid_frame = pd.read_csv(test_file_path, dtype={'guid': str})
    row_count = len(guid_frame)
    print(f"测试集加载完成，共 {row_count} 条数据")
    return guid_frame

# Load the test-set guids from the default index path
test_df = load_test_guid()

# 2. Text preprocessing class
class TextPreprocessor:
    """Clean, tokenize and stop-word-filter short social-media style text."""

    def __init__(self, language='english'):
        """Prepare the stop-word set, a lemmatizer and the cleaning patterns.

        Args:
            language: Language name passed to ``stopwords.words``.
        """
        self.language = language
        self.stop_words = set(stopwords.words(language))
        # Instantiated here, but none of the methods below call it.
        self.lemmatizer = WordNetLemmatizer()

        # Regex sources (kept as plain strings) used by clean_text.
        self.url_pattern = r'https?://\S+|www\.\S+'
        self.mention_pattern = r'@\w+'
        self.hashtag_pattern = r'#\w+'

    def clean_text(self, text):
        """Return a lower-cased, de-noised copy of ``text``.

        Non-string input yields ``""``. Otherwise the steps run in order:
        drop URLs, drop @mentions, strip the leading ``#`` from hashtags,
        drop digit runs, replace every character that is not a word
        character, whitespace, ``#`` or ``@`` with a space, and finally
        collapse whitespace.
        """
        if not isinstance(text, str):
            return ""

        # (pattern, replacement) pairs, applied strictly in this order.
        substitutions = (
            (self.url_pattern, ''),
            (self.mention_pattern, ''),
            (self.hashtag_pattern, lambda match: match.group(0)[1:]),
            (r'\d+', ''),
            (r'[^\w\s#@]', ' '),
            (r'\s+', ' '),
        )
        cleaned = text.lower()
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()

    def tokenize(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens):
        """Return the tokens that are not in ``self.stop_words``."""
        blocked = self.stop_words
        return [token for token in tokens if token not in blocked]

    def preprocess_pipeline(self, text):
        """Clean, tokenize and filter ``text``; return the tokens space-joined."""
        kept_tokens = self.remove_stopwords(self.tokenize(self.clean_text(text)))
        return ' '.join(kept_tokens)

# 3. Image processing class
class ImageProcessor:
    """Load image files and convert them to normalized tensors."""

    def __init__(self, image_size=(224, 224)):
        """Build the torchvision preprocessing pipeline.

        Args:
            image_size: Size passed to both ``Resize`` and ``CenterCrop``.
        """
        self.image_size = image_size
        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            # Per-channel statistics; presumably the ImageNet values the
            # pretrained backbone expects -- confirm against training.
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def load_and_preprocess_image(self, image_path):
        """Open ``image_path``, force RGB and apply ``self.transform``.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The transformed image tensor.

        Raises:
            Whatever ``PIL.Image.open`` / ``convert`` raise for a missing
            or unreadable file; errors are not swallowed here.
        """
        # Image.open keeps the underlying file open until the image is
        # closed. convert() produces an independent in-memory copy, so the
        # handle can be released immediately instead of waiting for the
        # garbage collector.
        with Image.open(image_path) as raw_image:
            rgb_image = raw_image.convert('RGB')
        return self.transform(rgb_image)


# 4. Image feature extraction class
class ImageFeatureExtractor:
    """Turn preprocessed image tensors into pooled CNN feature vectors."""

    def __init__(self, model_name='resnet50', device=None):
        """Load the backbone, move it to the device and probe its output size.

        Args:
            model_name: Label stored on the instance and printed. Only
                ResNet-50 is actually loaded, whatever value is given.
            device: Device (string or ``torch.device``) to run on. ``None``
                selects the CPU, which is what the class always used before.
        """
        self.model_name = model_name
        # The argument used to be accepted but ignored; honour it when given.
        self.device = torch.device('cpu') if device is None else torch.device(device)

        self.model = self._load_pretrained_model(model_name)
        self.model.to(self.device)
        self.model.eval()

        self.feature_dim = self._get_feature_dimension()
        print(f"加载模型: {model_name}")
        print(f"特征维度: {self.feature_dim}")

    def _load_pretrained_model(self, model_name):
        """Return a frozen ResNet-50 backbone that outputs flat feature rows.

        The last two child modules of the torchvision model are dropped and
        replaced by a global average pool plus a flatten layer.
        ``model_name`` is currently not consulted.
        """
        model = models.resnet50(pretrained=True)
        model = torch.nn.Sequential(*list(model.children())[:-2])
        model.add_module('global_avg_pool', torch.nn.AdaptiveAvgPool2d((1, 1)))
        model.add_module('flatten', torch.nn.Flatten())

        # Inference only: no gradients are needed for any parameter.
        for param in model.parameters():
            param.requires_grad = False

        return model

    def _get_feature_dimension(self):
        """Run one random 224x224 RGB input through the model; return the feature width."""
        dummy_input = torch.randn(1, 3, 224, 224).to(self.device)
        with torch.no_grad():
            features = self.model(dummy_input)
        return features.shape[1]

    def extract_features_batch(self, image_tensors, batch_size=32):
        """Compute features for a list of image tensors.

        Args:
            image_tensors: Sequence of equally shaped image tensors.
            batch_size: Number of images sent through the model at once.

        Returns:
            A 2-D NumPy array with one row per input tensor. An empty input
            gives an array of shape ``(0, feature_dim)`` so callers can still
            stack it or read ``shape[1]``.
        """
        if not image_tensors:
            return np.empty((0, self.feature_dim), dtype=np.float32)

        feature_chunks = []
        with torch.no_grad():
            # Stack and move one chunk at a time so only ``batch_size``
            # images live on the device simultaneously.
            for start in range(0, len(image_tensors), batch_size):
                chunk = torch.stack(image_tensors[start:start + batch_size]).to(self.device)
                feature_chunks.append(self.model(chunk).cpu().numpy())

        return np.vstack(feature_chunks)

    def extract_features_from_paths(self, image_paths, processor, batch_size=32):
        """Load images from disk and extract their features batch by batch.

        Args:
            image_paths: Sequence of image file paths.
            processor: Object exposing ``load_and_preprocess_image(path)``.
                A ``None`` return marks the image as failed; exceptions
                raised by the processor propagate to the caller.
            batch_size: Number of paths handled per progress-bar step.

        Returns:
            ``(features, success_indices, failed_indices)``. ``features`` has
            one row per successful image, in the order of
            ``success_indices``. Both index lists refer to positions in
            ``image_paths``.
        """
        features = []
        success_indices = []
        failed_indices = []

        for i in tqdm(range(0, len(image_paths), batch_size), desc="提取特征"):
            batch_tensors = []
            batch_success_indices = []

            for position, img_path in enumerate(image_paths[i:i + batch_size], start=i):
                img_tensor = processor.load_and_preprocess_image(img_path)
                if img_tensor is None:
                    failed_indices.append(position)
                    continue
                batch_tensors.append(img_tensor)
                batch_success_indices.append(position)

            if batch_tensors:
                features.append(self.extract_features_batch(batch_tensors))
                success_indices.extend(batch_success_indices)

        if features:
            all_features = np.vstack(features)
        else:
            # Keep the result 2-D even when nothing could be extracted.
            all_features = np.empty((0, self.feature_dim), dtype=np.float32)

        print("特征提取完成:")
        print(f"  成功: {len(success_indices)} 张图像")
        print(f"  失败: {len(failed_indices)} 张图像")

        return all_features, success_indices, failed_indices

# 5. Multimodal feature fusion class
class MultimodalFeatureFusion:
    """Fuse text and image feature matrices by weighted concatenation."""

    def __init__(self):
        """Store the modality weights that ``fuse`` applies (0.8 text, 0.2 image)."""
        self.text_weight = 0.8
        self.image_weight = 0.2

    def weighted_concat(self, text_features, image_features, text_weight=0.5, image_weight=0.5):
        """L2-normalize each row per modality, scale it, and join column-wise.

        Args:
            text_features: 2-D array of text features (rows are samples).
            image_features: 2-D array of image features with the same row count.
            text_weight: Multiplier for the normalized text rows.
            image_weight: Multiplier for the normalized image rows.

        Returns:
            Array whose columns are the weighted text columns followed by
            the weighted image columns.
        """
        print(f"加权拼接融合: 文本权重{text_weight}, 图像权重{image_weight}")

        scaled_parts = []
        for matrix, weight in ((text_features, text_weight), (image_features, image_weight)):
            row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
            # The small epsilon keeps all-zero rows from dividing by zero.
            unit_rows = matrix / (row_norms + 1e-8)
            scaled_parts.append(unit_rows * weight)

        fused = np.concatenate(scaled_parts, axis=1)
        print(f"融合后维度: {fused.shape[1]}")

        return fused

    def fuse(self, text_features, image_features):
        """Run ``weighted_concat`` with the weights stored on the instance."""
        return self.weighted_concat(
            text_features,
            image_features,
            self.text_weight,
            self.image_weight,
        )

# 6. Load and prepare the test-set data
print("\n=== 准备测试集数据 ===")


def load_test_data(test_df, data_dir='./data'):
    """Collect the image path and raw text for every test guid.

    For each guid the files ``<data_dir>/<guid>.jpg`` and
    ``<data_dir>/<guid>.txt`` are expected. A sample with either file
    missing is skipped and its absent path(s) are recorded, so the missing
    file counter printed below reflects reality instead of always being 0.

    Args:
        test_df: DataFrame with a ``guid`` column.
        data_dir: Directory holding the ``.jpg`` / ``.txt`` files.

    Returns:
        DataFrame with columns ``guid``, ``image_path`` and ``text``, one
        row per sample whose two files both exist. The columns are present
        even when no sample qualifies.
    """
    test_records = []
    missing_files = []

    for raw_guid in test_df['guid']:
        # Drop a possible ".0" suffix left over from float parsing.
        guid = str(raw_guid).split('.')[0]

        img_path = os.path.join(data_dir, f"{guid}.jpg")
        txt_path = os.path.join(data_dir, f"{guid}.txt")

        absent = [path for path in (img_path, txt_path) if not os.path.isfile(path)]
        if absent:
            missing_files.extend(absent)
            continue

        # latin-1 maps every byte to a character, so decoding cannot fail.
        with open(txt_path, 'r', encoding='latin-1') as f:
            text_content = f.read().strip()

        test_records.append({
            'guid': guid,  # normalized guid
            'image_path': img_path,
            'text': text_content
        })

    # Explicit columns keep the schema stable for an empty result.
    test_data_df = pd.DataFrame(test_records, columns=['guid', 'image_path', 'text'])
    print(f"测试数据创建完成，共 {len(test_data_df)} 条有效数据")
    print(f"缺失文件: {len(missing_files)} 个")

    if missing_files:
        print("前10个缺失文件:", missing_files[:10])  # show at most ten entries

    return test_data_df

# Build the test DataFrame (guid, image path, raw text)
test_data_df = load_test_data(test_df)

# 7. Preprocess the test-set text
print("\n=== 预处理测试集文本 ===")

# Text preprocessor configured with English stop words
preprocessor = TextPreprocessor(language='english')

# Run every raw text through the cleaning pipeline; the bound method is
# passed directly, so no wrapper lambda is needed
test_texts_processed = test_data_df['text'].apply(preprocessor.preprocess_pipeline)

# Keep the processed text next to the raw text
test_data_df['text_processed'] = test_texts_processed

# 8. Load the artefacts saved at training time
print("\n=== 加载训练时保存的模型 ===")

# Load the fitted text vectorizer.
# NOTE(security): pickle (and joblib below) can execute arbitrary code while
# loading -- only open artefacts produced by a trusted training run.
with open('./data/splits/text_vectorizer.pkl', 'rb') as f:
    vectorizer_data = pickle.load(f)
vectorizer = vectorizer_data['vectorizer']

# Directory for fusion artefacts; nothing in this section reads from it
fusion_dir = './data/splits/fusion'

# The scalers are not loaded from disk: they are re-fitted below from the
# raw training text and image features.
train_text_features_path = os.path.join('./data/splits', 'train_features.npz')
train_text_features = sparse.load_npz(train_text_features_path).toarray()
print(f"训练集文本特征形状: {train_text_features.shape}")

train_image_features_path = os.path.join('./data/splits', 'train_image_features.npy')
train_image_features = np.load(train_image_features_path)
print(f"训练集图像特征形状: {train_image_features.shape}")

# Create and fit the per-modality scalers
text_scaler = StandardScaler()
image_scaler = StandardScaler()
text_scaler.fit(train_text_features)
image_scaler.fit(train_image_features)

# Create the feature fuser
fusion = MultimodalFeatureFusion()

# Standardize the training features
train_text_norm = text_scaler.transform(train_text_features)
train_image_norm = image_scaler.transform(train_image_features)

# Fuse the standardized training features
train_fused = fusion.fuse(train_text_norm, train_image_norm)

# Fit the fused-feature scaler and keep the standardized training matrix
fused_scaler = StandardScaler()
train_fused_norm = fused_scaler.fit_transform(train_fused)
print(f"融合特征标准化器已拟合，输入维度: {fused_scaler.n_features_in_}")

# PCA dimensionality reduction.
# The test path applies `pca.transform` to features that already went through
# `fused_scaler`, so PCA has to be fitted in that same standardized space.
# Fitting on the unscaled `train_fused` (as before) projects the test data
# with components learned on differently scaled inputs.
# NOTE(review): assumes the training pipeline also ran PCA on the
# standardized fused features -- confirm against the training notebook.
from sklearn.decomposition import PCA
if train_fused.shape[1] > 512:
    pca = PCA(n_components=512, random_state=42)
    train_fused_reduced = pca.fit_transform(train_fused_norm)
    print(f"PCA已拟合，解释方差比: {np.sum(pca.explained_variance_ratio_):.4f}")

# Load the trained classifier
classifier = joblib.load('./data/models/baseline_logistic_quick.pkl')

# 9. Extract test-set text features
print("\n=== 提取测试集文本特征 ===")

# Vectorize the processed text with the loaded vectorizer (missing values
# become empty strings) and densify the result
test_texts = test_data_df['text_processed'].fillna('').tolist()
test_text_features = vectorizer.transform(test_texts).toarray()
print(f"测试集文本特征形状: {test_text_features.shape}")

# 10. Extract test-set image features
print("\n=== 提取测试集图像特征 ===")

# Initialise the image processor and the feature extractor
image_processor = ImageProcessor(image_size=(224, 224))
image_extractor = ImageFeatureExtractor(model_name='resnet50')

# Extract image features.
# NOTE(review): test_success_idx / test_failed_idx are not consulted later in
# this script; the image rows are assumed to line up one-to-one with the
# text rows.
test_image_paths = test_data_df['image_path'].tolist()
test_image_features, test_success_idx, test_failed_idx = image_extractor.extract_features_from_paths(
    test_image_paths, image_processor, batch_size=32
)

print(f"测试集图像特征形状: {test_image_features.shape}")

# 11. Feature standardization and fusion
print("\n=== 特征标准化和融合 ===")

# Standardize the text features with the scaler fitted on training data
test_text_norm = text_scaler.transform(test_text_features)

# Standardize the image features with the scaler fitted on training data
test_image_norm = image_scaler.transform(test_image_features)

print(f"标准化后文本特征形状: {test_text_norm.shape}")
print(f"标准化后图像特征形状: {test_image_norm.shape}")

# Create a fresh feature fuser (rebinds the module-level `fusion`)
fusion = MultimodalFeatureFusion()

# Fuse the two modalities
test_fused = fusion.fuse(test_text_norm, test_image_norm)
print(f"融合后特征形状: {test_fused.shape}")

# Standardize the fused features
test_fused_norm = fused_scaler.transform(test_fused)
print(f"标准化融合特征形状: {test_fused_norm.shape}")

# 12. Dimensionality reduction (only when there are more than 512 columns)
print("\n=== 特征降维 ===")

# `pca` is created earlier in the script under the matching >512 condition
# on the training features
if test_fused_norm.shape[1] > 512:
    test_fused_reduced = pca.transform(test_fused_norm)
    test_fused_norm = test_fused_reduced
    print(f"PCA降维后特征形状: {test_fused_norm.shape}")

# 13. Predict
print("\n=== 进行预测 ===")

# Predict with the loaded classifier
test_predictions = classifier.predict(test_fused_norm)
print(f"预测完成，共 {len(test_predictions)} 个预测结果")

# 14. Map the predictions back to the original labels
print("\n=== 映射预测标签 ===")

# Read the training labels; the encoder is re-fitted from them here rather
# than loaded from disk
train_df = pd.read_csv('./data/splits/train_dataset.csv')
all_labels = train_df['tag'].unique()

# Create the label encoder.
# NOTE(review): assumes the classifier emits the integer codes of an encoder
# fitted on this same label set -- confirm against the training code.
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Convert the numeric predictions to label strings
test_pred_labels = label_encoder.inverse_transform(test_predictions)

# Print the distribution of predicted labels
unique, counts = np.unique(test_pred_labels, return_counts=True)
print("预测标签分布:")
for label, count in zip(unique, counts):
    print(f"  {label}: {count}")

# 15. Create the submission file
print("\n=== 创建提交文件 ===")

# Start from every guid in the test index
submission_df = pd.DataFrame({
    'guid': test_df['guid'].apply(lambda x: str(x).split('.')[0])  # strip a trailing ".0"
})

# Attach the predictions to the DataFrame:
# build a guid -> predicted label mapping from the processed test rows
prediction_dict = dict(zip(test_data_df['guid'], test_pred_labels))

# Look up each guid's predicted label; guids without a prediction get 'neutral'
def get_prediction(guid):
    """Return the predicted label for ``guid``, or ``'neutral'`` if none exists.

    The guid is stringified and cut at the first ``.`` (dropping a ``.0``
    suffix) before it is looked up in the module-level ``prediction_dict``.
    """
    normalized_guid = str(guid).split('.')[0]
    return prediction_dict.get(normalized_guid, 'neutral')

# Resolve the label for every guid in the submission frame
submission_df['tag'] = submission_df['guid'].apply(get_prediction)

# Save the submission file
submission_path = './data/test_predictions.csv'
submission_df.to_csv(submission_path, index=False)
print(f"提交文件已保存到: {submission_path}")

print("\n前10个预测结果:")
print(submission_df.head(10))

# 16. Write the detailed report
print("\n=== 生成详细报告 ===")

# Save the per-sample predictions together with the raw text and image path
detailed_df = test_data_df.copy()
detailed_df['predicted_tag'] = test_pred_labels
detailed_df = detailed_df[['guid', 'text', 'image_path', 'predicted_tag']]

detailed_path = './data/test_detailed_predictions.csv'
detailed_df.to_csv(detailed_path, index=False)
print(f"详细预测结果已保存到: {detailed_path}")

# Summary: index size, rows actually processed, and the difference
print("\n=== 测试集预测完成 ===")
print(f"总测试样本: {len(test_df)}")
print(f"成功处理: {len(test_data_df)}")
print(f"文件缺失: {len(test_df) - len(test_data_df)}")
print(f"提交文件: {submission_path}")

=== 加载测试集 ===
测试集加载完成，共 511 条数据

=== 准备测试集数据 ===
测试数据创建完成，共 511 条有效数据
缺失文件: 0 个

=== 预处理测试集文本 ===

=== 加载训练时保存的模型 ===
训练集文本特征形状: (3600, 2000)
训练集图像特征形状: (3600, 2048)
加权拼接融合: 文本权重0.8, 图像权重0.2
融合后维度: 4048
融合特征标准化器已拟合，输入维度: 4048
PCA已拟合，解释方差比: 0.5488

=== 提取测试集文本特征 ===
测试集文本特征形状: (511, 2000)

=== 提取测试集图像特征 ===
加载模型: resnet50
特征维度: 2048


提取特征: 100%|██████████| 16/16 [00:16<00:00,  1.02s/it]

特征提取完成:
  成功: 511 张图像
  失败: 0 张图像
测试集图像特征形状: (511, 2048)

=== 特征标准化和融合 ===
标准化后文本特征形状: (511, 2000)
标准化后图像特征形状: (511, 2048)
加权拼接融合: 文本权重0.8, 图像权重0.2
融合后维度: 4048
融合后特征形状: (511, 4048)
标准化融合特征形状: (511, 4048)

=== 特征降维 ===
PCA降维后特征形状: (511, 512)

=== 进行预测 ===
预测完成，共 511 个预测结果

=== 映射预测标签 ===
预测标签分布:
  negative: 201
  neutral: 51
  positive: 259

=== 创建提交文件 ===
提交文件已保存到: ./data/test_predictions.csv

前10个预测结果:
   guid       tag
0     8  positive
1  1576  negative
2  2320  positive
3  4912  negative
4  3821  positive
5  1306   neutral
6  4555  positive
7   259  negative
8  3216  negative
9   881  positive

=== 生成详细报告 ===
详细预测结果已保存到: ./data/test_detailed_predictions.csv

=== 测试集预测完成 ===
总测试样本: 511
成功处理: 511
文件缺失: 0
提交文件: ./data/test_predictions.csv



