In [1]:
# 数据准备
#train.txt 标签文件
import pandas as pd
import os
import cv2
import numpy as np
from tqdm import tqdm
import glob
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet152
from torchvision import transforms
from PIL import Image
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer


# 加载预训练的BERT模型和分词器
# 加载BERT模型和tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
pretrained_model = BertModel.from_pretrained("bert-base-multilingual-cased")


num_classes=3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
folder_path = "./data/"

# 图像数据预处理
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # 图片缩放到统一大小
    transforms.ToTensor(),  # 将图片转换为tensor
    # transforms.Normalize(mean=(0,0,0), std=(8,8,8)),
])

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
max_length = 131  # 输入的最大文本长度
def get_valid_imagesPath_from_directory(folder_path ,df):
    image_paths = []
    for ind in df['guid']:
        image_path = folder_path+str(ind)+".jpg"
        try:
            image = cv2.imread(image_path)
            height,width,channels = image.shape
            image_paths.append(image_path)
            # print(image_path)
        except Exception as e:
            #print(f"file '{file}' not found")
            continue
    
    return image_paths

# 指定文件夹路径
def get_texts_from_textsPath(folder_path,df):
    texts=[]
    # 按dataframe遍历txt文件并将内容写入txt_list
    for ind in df['guid']:
        file = folder_path+str(ind)+".txt"
        try:
            with open(file, "r",encoding="GB18030") as infile:
                content = infile.read()
                texts.append(content)
        except FileNotFoundError:
            continue
    return texts


def text_preprocess(texts):
    # 遍历列表中的每个句
#     for i in range(len(texts)):
#         # 将句子拆分成单词
#         words = texts[i].split()
#         # 过滤以@开头的词
#         words = [word for word in words if not word.startswith('@') and not word.startswith('#') 
#                  and not word.startswith('http') and not word.startswith('|')]

#         # 将过滤后的单词重新组合成句子
#         texts[i] = ' '.join(words)

    print(len(texts))
    # 使用分词器处理文本
    print(texts[:5])
    # tokenized_texts = [tokenizer.encode_plus(texts, add_special_tokens=True, truncation=True,
    #                                                max_length=max_length, padding='max_length') for text in texts]
    tokenized_texts = [tokenizer(text,padding='max_length',max_length=max_length,truncation=True,return_tensors="pt") for text in texts]
    return tokenized_texts

#图片和文本混合数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, image_paths, tokenized_texts, labels,transform=None):
        self.image_paths = image_paths     
        self.transform = transform
        self.input_ids = [x['input_ids'] for x in tokenized_texts]
        self.attention_mask = [x['attention_mask'] for x in tokenized_texts]
        self.labels = labels

    def __getitem__(self, index):
        input_ids = torch.tensor(self.input_ids[index])
        attention_mask = torch.tensor(self.attention_mask[index])
        labels = torch.tensor(self.labels[index])
        #return input_ids, attention_mask, labels  
        image_path = self.image_paths[index]
        #label = self.labels[index]
        image = Image.open(image_path)
        #image = cv2.imread(image_path)
        image = self.transform(image)
        
        return image ,input_ids, attention_mask, labels
    def __len__(self):
        return len(self.input_ids)

In [3]:
# 数据准备
#train.txt 标签文件
train_label_path = "train.txt"
train_label_df = pd.read_csv(train_label_path,sep=",")
column_dict = {"positive": 0, "negative": 1,"neutral":2}
new_df = train_label_df.replace({"tag": column_dict})
labels = list(new_df['tag'])

image_paths = get_valid_imagesPath_from_directory(folder_path,new_df)
print(len(image_paths))

texts = get_texts_from_textsPath(folder_path,new_df)
print(len(texts))

# 划分验证集
image_paths_train, image_paths_val, texts_train, texts_val, labels_train, labels_val = train_test_split(
    image_paths, texts, labels, test_size=0.2, random_state=5)
#文本预处理
tokenized_texts_train = text_preprocess(texts_train)
tokenized_texts_val = text_preprocess(texts_val)

# 构建Dataset和DataLoader
# 创建数据集和数据加载器
dataset_train = Dataset(image_paths_train, tokenized_texts_train, labels_train, transform)
dataset_val = Dataset(image_paths_val,tokenized_texts_val, labels_val, transform)
# loader_train = DataLoader(dataset_train, batch_size=64, shuffle=True)
# loader_val = DataLoader(dataset_val, batch_size=64, shuffle=False)

4000
4000
3200
['Vibrating higher & staying out of drama is the best aging secret. http:… \n', 'RT @Jamie__T__: #In-n-Out has kinda gone full circle, from being worshipped as a junk food nirvana to bein… \n', 'RT @Payton7Anderson: Happy birthday to my main QUAD? have a cheerful day ?love ya lots ?? @Ryan_Birge http://t.co/UD8g157s6N\n', 'Camera Deals : Black WiFi 1080P Full HD Helmet Sports Action Waterproof Car Camera Camcord… \n', '#February #Winter #Rainy #Stormy #Windy #Wednesday #Evening #Love #Happy #Positive #Passionate #Calm #Fun #UK ?? ? \n']
800
['Intrepid 2016 Elite goes 9-2-1 on the summer season. Way to rep the orange and Blue! #intrepidfamily \n', '#depressed #depression #bullied #anxiety #overdosed #addict #drugs #pills #cuts #cutting #… \n', '@nyyankeefanfore "singles" and is nearly thrown out due to poor base-running #FuturesGame \n', 'RT @MYDGAdventure: MY Adventure now an approved City & Guilds centre in our own right! #delighted \n', 'Dang @MCPLMO has no chill #Deli

In [4]:
from torchvision.models import resnet50
import torch.nn.functional as F

# 特征提取模型定义
class ImageFeatureExtractor(nn.Module):
    def __init__(self):
        super(ImageFeatureExtractor, self).__init__()
        self.resnet = resnet50(pretrained=True)  # 使用预训练的ResNet-50作为图片特征提取器
        #self.resnet = ImageModel()
        #nn.AdaptiveAvgPool2d
        # self.fc = nn.Linear(1000, 256)  # 图片特征映射到256维的特征向量
    
    def forward(self, image):
        features = self.resnet(image)
        # print(features)
        # print(features.shape)
        # features = self.fc(features)
        return features

class TextFeatureExtractor(nn.Module):
    def __init__(self):
        super(TextFeatureExtractor, self).__init__()
        self.bert = pretrained_model
        # self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # 获取 pooled_output
        output = pooled_output
        # print(output)
        # output = output.reshape(output.shape[0],-1)
        # output = self.fc(output)
        return output
    

# 多模态融合模型定义
class FusionModel(nn.Module):
    def __init__(self, num_classes,option):
        super(FusionModel, self).__init__()
        self.image_extractor = ImageFeatureExtractor()
        #self.text_encoder = nn.EmbeddingBag(num_embeddings=10000, embedding_dim=256)  # 文本特征映射到256维的特征向量   
        self.text_encoder = TextFeatureExtractor()
        self.option=option
        # self.self_attention = SelfAttention(1536,1536)
        self.classifier0 = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(1000, 256),
#             #nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
             nn.Linear(256, num_classes),
#             #nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
           
        )
        self.classifier1 = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(768, 256),
            #nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            
            nn.Dropout(p=0.5),
            nn.Linear(256, num_classes),
            #nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
        )
        self.classifier2 = nn.Sequential(
            # nn.Dropout(p=0.5),
            # nn.Linear(1536, 1024),
            # #nn.BatchNorm1d(512),
            # nn.ReLU(inplace=True),

#             nn.Dropout(p=0.5),
#             nn.Linear(1024, 512),
#             #nn.BatchNorm1d(256),
#             nn.ReLU(inplace=True),
            
#             nn.Dropout(p=0.5),
#             nn.Linear(512, 128),
#             #nn.BatchNorm1d(512),
#             nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(1768, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(1024, num_classes),
            nn.ReLU(inplace=True),
           # nn.Softmax(dim=1)
        )

    
    def forward(self, image, input_ids,attention_mask):
        if(self.option==0):
            image_features = self.image_extractor(image)
            output = image_features
            output = self.classifier0(image_features)
        elif(self.option==1):
            text_features = self.text_encoder(input_ids, attention_mask)
            # print(input_ids)
            # print(attention_mask)
            output = self.classifier1(text_features)
        else:
            # print(image.shape)
            image_features = self.image_extractor(image)
            #image_features_expand = torch.unsqueeze(image_features,dim=1)
            #print(image_features_expand.shape)
            text_features = self.text_encoder(input_ids,attention_mask)
            # print(text_features.shape)
            
            fusion_features = torch.cat((text_features,image_features), dim=-1)
            # print(fusion_features.shape)
            #combined_features_flat = fusion_features.view(fusion_features.size(0), -1)
            #print(combined_features_flat.shape)
            output = self.classifier2(fusion_features)
            # print(output.shape)
        return output

In [12]:
# 训练过程
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()  
    running_loss = 0
    total_correct = 0 
    for images, input_ids, attention_mask, labels in train_loader:
        images = images.to(device)
        input_ids = input_ids.squeeze(1).to(device)
        attention_mask = attention_mask.to(device)     
        labels = labels.to(device)     
        optimizer.zero_grad()     
        outputs = model(images, input_ids,attention_mask)
        _, preds = torch.max(outputs, 1)
        total_correct += torch.sum(preds == labels)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()   
        running_loss += loss.item()
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = total_correct.item() / len(train_loader.dataset)
    return epoch_loss, epoch_acc

# 预测过程
def predict_model(model, test_loader, device):
    model.eval()
    predictions = []
    for images,input_ids, attention_mask,  _ in test_loader:
        images = images.to(device)
        #texts = texts.to(device)
        input_ids = input_ids.squeeze(1).to(device)
        attention_mask = attention_mask.to(device)
        with torch.no_grad():
            outputs = model(images, input_ids,attention_mask)
            _, preds = torch.max(outputs, 1)
        predictions.extend(preds.cpu().numpy())
    return predictions


In [6]:
# 模型训练和验证
torch.cuda.set_device(0)
criterion = nn.CrossEntropyLoss()
lrlist = [1e-5,3e-5,5e-5,7e-5]
# batch_size_list = [128,64,32,16]
batch_size = 64
best_acc = 0
# 创建数据集和数据加载器
loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
loader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)
for lr in lrlist:
    option=2
    model = FusionModel(num_classes,option)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
    num_epochs = 6
    for epoch in range(num_epochs):
        train_loss, train_acc = train_model(model, loader_train, criterion, optimizer, device)
        val_predictions = predict_model(model, loader_val, device)
        # 计算验证集准确率    
        val_predictions = np.array(val_predictions)
        # print(val_predictions)
        val_labels = np.array(labels_val)
        val_acc = (val_predictions == val_labels).sum() / len(val_labels)
        if(val_acc>best_acc):
            best_acc = val_acc
            torch.save(model, 'multi_model.pt')
        print(f"batch size: {batch_size}, lr: {lr}, Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Best Val Acc:{best_acc:.4f}")

  input_ids = torch.tensor(self.input_ids[index])
  attention_mask = torch.tensor(self.attention_mask[index])


batch size: 64, lr: 1e-05, Epoch 1/6, Train Loss: 0.9838, Train Acc: 0.5556, Val Acc: 0.5900, Best Val Acc:0.5900
batch size: 64, lr: 1e-05, Epoch 2/6, Train Loss: 0.8685, Train Acc: 0.6209, Val Acc: 0.6362, Best Val Acc:0.6362
batch size: 64, lr: 1e-05, Epoch 3/6, Train Loss: 0.7322, Train Acc: 0.6934, Val Acc: 0.6737, Best Val Acc:0.6737
batch size: 64, lr: 1e-05, Epoch 4/6, Train Loss: 0.6033, Train Acc: 0.7631, Val Acc: 0.6600, Best Val Acc:0.6737
batch size: 64, lr: 1e-05, Epoch 5/6, Train Loss: 0.4909, Train Acc: 0.8137, Val Acc: 0.6800, Best Val Acc:0.6800
batch size: 64, lr: 1e-05, Epoch 6/6, Train Loss: 0.3976, Train Acc: 0.8638, Val Acc: 0.6863, Best Val Acc:0.6863
batch size: 64, lr: 3e-05, Epoch 1/6, Train Loss: 0.7918, Train Acc: 0.6562, Val Acc: 0.6625, Best Val Acc:0.6863
batch size: 64, lr: 3e-05, Epoch 2/6, Train Loss: 0.5019, Train Acc: 0.8184, Val Acc: 0.6775, Best Val Acc:0.6863
batch size: 64, lr: 3e-05, Epoch 3/6, Train Loss: 0.3438, Train Acc: 0.8781, Val Acc: 0.

In [13]:
#读取test文件并生成预测文件
test_path = "test_without_label.txt"
test_df = pd.read_csv(test_path,sep=",")
test_df.iloc[:,-1]=0
test_labels = np.array(test_df['tag'])

#image_paths
image_paths_test = get_valid_imagesPath_from_directory(folder_path,test_df)
test_texts = get_texts_from_textsPath(folder_path,test_df)

tokenized_texts_test = text_preprocess(test_texts)
dataset_test = Dataset(image_paths_test, tokenized_texts_test, test_labels, transform)
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)

best_model = torch.load('multi_model.pt').to(device)
test_predictions = predict_model(best_model, loader_test, device)  
test_predictions = np.array(test_predictions)

column_dict_ = {0:"positive", 1:"negative",2:"neutral"}
test_df['tag'] = test_predictions
pre_df = test_df.replace({"tag": column_dict_})
pre_df.to_csv('predict.txt',sep=',',index=False)

511
511
511
['Energetic training today with our San Antonio New Dollars/New Partners trainees \n', 'Let your voice be heard! 18+ #endsuicide #blithe #selfharm #thinspo #bonespo #edo #hurt #cut \n', "RT @Austin_Powers__: Shark Week would be so much better if the sharks had laser beams attached to their frickin' heads. \n", '#TheTruthCaster http://t.co/S8jvqpKq5h\n', "RT @jarpad: Hey #WBSDCC look what we're up to!!!! @JensenAckles @paulwesley @iansomerhalder \n"]


  input_ids = torch.tensor(self.input_ids[index])
  attention_mask = torch.tensor(self.attention_mask[index])


511
[1 1 1 0 0 2 1 1 0 0 0 1 1 0 1 0 2 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 2 0 0 0 0
 0 0 0 0 0 0 0 0 1 2 0 1 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 2 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 2 0 1 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 1 0 2 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 2 1 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 1 0 0 1 1 0 0 2 1 1 0 0 1 0 0 1 1 1 0 0 1 0 0 0 0 2 0 0 0 0
 1 0 1 0 0 0 1 0 2 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 2 0 1 0 0
 0 0 0 1 0 0 0 0 0 0 2 1 0 0 2 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1
 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0
 0 0 0 0 0 2 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 2
 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 2 1 0 0 2 0 0 1 0 0 0 0 1 0 1 0
 0 0 0 0 0 0 1 1 0 1 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0
 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 1 0 2 0 0 0 0 0 2 0 0 1 0 1 0 0 0 0 1 0 1
 1 0 0 0 0 0 1 0 1 2 