In [12]:
import json

def load_data(file_path):
    X_data = []
    Y_data = []
    with open(file_path, 'r') as f:
        for line in f:
            example = json.loads(line.strip())
            X_data.append(example['text'])
            Y_data.append(example['label'])
    return X_data, Y_data

X_train, Y_train = load_data('./train_all_anno.json')
X_test, Y_test = load_data('./test_all_anno.json')
X_val, Y_val = load_data('./val_all_anno.json')

X_train = [x.lower() for x in X_train]
X_test = [x.lower() for x in X_test]
X_val = [x.lower() for x in X_val]


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the GLOVE embeddings
embedding_path = './glove.6B.300d.txt'
embedding_index = {}
with open(embedding_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Define the tokenizer and fit on the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert the text to sequences of integers and pad to a length of 100
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=100)

# Create an embedding matrix for the words in the tokenizer
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))

# Convert the labels to one-hot encoding
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
Y_train = enc.fit_transform(np.array(Y_train).reshape(-1, 1)).toarray()
Y_test = enc.transform(np.array(Y_test).reshape(-1, 1)).toarray()
Y_val = enc.transform(np.array(Y_val).reshape(-1, 1)).toarray()


In [14]:
from sklearn.metrics import accuracy_score

def categorical_accuracy_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    y_true = np.argmax(y, axis=1)
    return accuracy_score(y_true, y_pred)


In [None]:
import numpy as np

# Create a dummy adjacency matrix for demonstration purposes
adj_matrix = np.zeros((X_train_padded.shape[0], 100, 100))

for i in range(X_train_padded.shape[0]):
    for j in range(100):
        adj_matrix[i, j, (j+1) % 100] = 1
        adj_matrix[i, j, (j-1) % 100] = 1

adj_matrix_val = np.zeros((X_val_padded.shape[0], 100, 100))

for i in range(X_val_padded.shape[0]):
    for j in range(100):
        adj_matrix_val[i, j, (j+1) % 100] = 1
        adj_matrix_val[i, j, (j-1) % 100] = 1


In [23]:
from tensorflow.keras.layers import Input, Embedding, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from spektral.layers import GCNConv, GlobalSumPool
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Define the GCN model
input_layer = Input(shape=(100,))
embedding_layer = Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False)(input_layer)
gcn1 = GCNConv(32, activation='relu')([embedding_layer, input_layer])
pooling_layer = GlobalSumPool()(gcn1)
dense_layer = Dense(128, activation='relu')(pooling_layer)
batch_norm_layer = BatchNormalization()(dense_layer)
dropout_layer = Dropout(0.2)(batch_norm_layer)
output_layer = Dense(3, activation='softmax')(dropout_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
optimizer = Adam(learning_rate=0.0005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
# history = model.fit(x=X_train_padded, y=Y_train, validation_data=(X_val_padded, Y_val), batch_size=32, epochs=50, callbacks=[early_stopping])
history = model.fit(x=X_train_padded, y=Y_train, validation_data=(X_val_padded, Y_val), batch_size=32, epochs=50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test_padded, Y_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

Test loss: 0.8834083080291748
Test accuracy: 0.5875576138496399


In [55]:
import numpy as np
import cv2
from tensorflow.keras.utils import Sequence
from tensorflow.keras.applications.resnet50 import preprocess_input

class DataGenerator(Sequence):
    def __init__(self, X_text, X_image_paths, Y, batch_size=32, img_size=(224, 224), shuffle=True):
        self.X_text = X_text
        self.X_image_paths = X_image_paths
        self.Y = Y
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.X_text) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        X_text_temp = [self.X_text[k] for k in indexes]
        X_image_paths_temp = [self.X_image_paths[k] for k in indexes]
        Y_temp = [self.Y[k] for k in indexes]

        X_text_batch = np.array(X_text_temp)
        X_image_batch = self.__generate_image_data(X_image_paths_temp)
        Y_batch = np.array(Y_temp)

        return [X_text_batch, X_image_batch], Y_batch

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.X_text))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __generate_image_data(self, image_paths):
        images = []
        for path in image_paths:
            img = cv2.imread(path)
            img = cv2.resize(img, self.img_size)
            img = preprocess_input(img)
            images.append(img)
        return np.array(images)


MGNNS

In [1]:
import json
import numpy as np

def load_data(file_path):
    X_text_data = []
    X_image_paths_data = []
    X_places = []
    Y_data = []
    with open(file_path, 'r') as f:
        for line in f:
            example = json.loads(line.strip())
            X_text_data.append(example['text'])
            X_image_paths_data.append('../'+example['image'])
            X_places.append(example['places'])
            Y_data.append(example['label'])
    return X_text_data, X_image_paths_data, X_places, Y_data

X_train_text, X_train_image_paths, X_train_places, Y_train = load_data('./train_all_anno.json')
X_test_text, X_test_image_paths, X_test_places, Y_test = load_data('./test_all_anno.json')
X_val_text, X_val_image_paths, X_val_places, Y_val = load_data('./val_all_anno.json')

X_train = [x.lower() for x in X_train_text]
X_test = [x.lower() for x in X_test_text]
X_val = [x.lower() for x in X_val_text]


def convert_scene_to_onehot(scene_list, num_scenes):
    onehot_scene_list = []
    for scene_ids in scene_list:
        onehot_scene = np.zeros(num_scenes)
        onehot_scene[scene_ids] = 1
        onehot_scene_list.append(onehot_scene)
    return np.array(onehot_scene_list)

num_scenes = 365  # 根据 place365 训练集中的总场景数设置

# 将场景ID列表转换为 one-hot 编码
X_train_places_onehot = convert_scene_to_onehot(X_train_places, num_scenes)
X_test_places_onehot = convert_scene_to_onehot(X_test_places, num_scenes)
X_val_places_onehot = convert_scene_to_onehot(X_val_places, num_scenes)


In [2]:
import numpy as np
import cv2
from tensorflow.keras.utils import Sequence
from tensorflow.keras.applications.resnet50 import preprocess_input

class DataGenerator(Sequence):
    def __init__(self, X_text, X_image_paths, X_places_onehot, Y, batch_size=32, img_size=(224, 224), shuffle=True):
        self.X_text = X_text
        self.X_image_paths = X_image_paths
        self.X_places_onehot = X_places_onehot
        self.Y = Y
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.X_text) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        X_text_temp = [self.X_text[k] for k in indexes]
        X_image_paths_temp = [self.X_image_paths[k] for k in indexes]
        X_places_temp = [self.X_places_onehot[k] for k in indexes]
        Y_temp = [self.Y[k] for k in indexes]

        X_text_batch = np.array(X_text_temp)
        X_image_batch = self.__generate_image_data(X_image_paths_temp)
        X_places_batch = np.array(X_places_temp)
        Y_batch = np.array(Y_temp)

        return [X_text_batch, X_image_batch, X_places_batch], Y_batch

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.X_text))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __generate_image_data(self, image_paths):
        images = []
        for path in image_paths:
            img = cv2.imread(path)
            img = cv2.resize(img, self.img_size)
            img = preprocess_input(img)
            images.append(img)
        return np.array(images)


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the GLOVE embeddings
embedding_path = './glove.6B.300d.txt'
embedding_index = {}
with open(embedding_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Define the tokenizer and fit on the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert the text to sequences of integers and pad to a length of 100
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=100)

# Create an embedding matrix for the words in the tokenizer
word_index = tokenizer.word_index
embedding_dim = 300
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.normal(size=(embedding_dim,))


In [4]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Y_train_encoded = lb.fit_transform(Y_train)
Y_val_encoded = lb.transform(Y_val)
Y_test_encoded = lb.transform(Y_test)

train_generator = DataGenerator(X_train_padded, X_train_image_paths, X_train_places_onehot, Y_train_encoded, batch_size=32)
val_generator = DataGenerator(X_val_padded, X_val_image_paths, X_val_places_onehot, Y_val_encoded, batch_size=32)
test_generator = DataGenerator(X_test_padded, X_test_image_paths, X_test_places_onehot, Y_test_encoded, batch_size=32)

In [5]:
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [6]:
from tensorflow.keras.layers import Input, Embedding, Dense, BatchNormalization, Dropout, Conv2D, Concatenate, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications.resnet50 import ResNet50
import tensorflow as tf
from tensorflow.keras.layers import Input, Reshape, Dense, Conv2DTranspose
from spektral.layers import GCNConv, GlobalSumPool
from tensorflow.keras.layers import GlobalAveragePooling2D, MaxPooling2D, Flatten, Add, Reshape, GlobalAveragePooling1D

num_scenes = 5  # 根据场景数量设置
scene_dim = 32  # 嵌入维度

# Text input
text_input_layer = Input(shape=(100,))

# Image input
image_input_layer = Input(shape=(224, 224, 3))

# Scene input
scene_input_layer = Input(shape=(num_scenes,))

# Text model
embedding_layer = Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False)(text_input_layer)
gcn1 = GCNConv(32, activation='relu')([embedding_layer, text_input_layer])
text_pooling_layer = GlobalSumPool()(gcn1)

# Image model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))(image_input_layer)

# Scene model
scene_embedding_layer = Embedding(input_dim=num_scenes, output_dim=scene_dim, input_length=num_scenes)(scene_input_layer)
scene_gcn = GCNConv(32, activation='relu')([scene_embedding_layer, scene_input_layer])
scene_pooling_layer = GlobalSumPool()(scene_gcn)

# Prepare attention layers
text_dim = 64
image_dim = 64
scene_dim = 64

text_features = Dense(text_dim)(text_pooling_layer)
image_features = Dense(image_dim)(base_model)
scene_features = Dense(scene_dim)(scene_pooling_layer)

# 将图像特征降维以便拼接
image_features = GlobalAveragePooling2D()(image_features)

# Concatenate the features
concat_layer = Concatenate()([text_features, image_features, scene_features])

# 添加一个或多个全连接层
dense_layer1 = Dense(256, activation='relu')(concat_layer)
batch_norm_layer1 = BatchNormalization()(dense_layer1)
dropout_layer1 = Dropout(0.2)(batch_norm_layer1)

dense_layer2 = Dense(128, activation='relu')(dropout_layer1)
batch_norm_layer2 = BatchNormalization()(dense_layer2)
dropout_layer2 = Dropout(0.2)(batch_norm_layer2)

# Classification head
output_layer = Dense(3, activation='softmax')(dropout_layer2)

model = Model(inputs=[text_input_layer, image_input_layer, scene_input_layer], outputs=output_layer)

optimizer = Adam(learning_rate=0.0005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', f1_score])


In [9]:
from tensorflow.keras.callbacks import EarlyStopping

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_f1_score', patience=5, restore_best_weights=True, mode='max')

# Train the model
history = model.fit(train_generator, epochs=50, validation_data=val_generator, callbacks=[early_stopping])
# history = model.fit(train_generator, epochs=50, validation_data=val_generator)


Epoch 1/50
Epoch 2/50
Epoch 3/50

In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy, test_f1_score = model.evaluate(test_generator)
print(f"Test loss: {test_loss}, Test accuracy: {test_accuracy}, Test F1-score: {test_f1_score}")

Test loss: 0.9458478689193726, Test accuracy: 0.47836539149284363, Test F1-score: 0.3706305921077728
