#### split training and test sets of transcription ids

In [21]:
import json
from pathlib import Path
from sklearn.metrics import f1_score

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("training")
path_to_test = Path("test")

def split_dataset(validate=False):
    training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
    training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
    training_set.remove('IS1002a')
    training_set.remove('IS1005d')
    training_set.remove('TS3012c')

    test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
    test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])
    
    if validate:
        # randomly select 10% of training set as validation set
        import random
        random.seed(6969)
        validate_set = random.choices(training_set, k=int(len(training_set)*0.15))
        training_set = list(set(training_set) - set(validate_set))
        return training_set, validate_set, test_set

    return training_set, test_set

#### functions to get features

In [67]:
from sentence_transformers import SentenceTransformer
import networkx as nx
bert = SentenceTransformer('all-MiniLM-L6-v2')


def get_text_feature(dataset, path, show_progress_bar=True):
    text_feature = []
    for transcription_id in dataset:
        with open(path / f"{transcription_id}.json", "r") as text_file:
            transcription = json.load(text_file)
        
        for utterance in transcription:
            text_feature.append(utterance["speaker"] + ": " + utterance["text"])

    text_feature = bert.encode(text_feature, show_progress_bar=show_progress_bar)
    return text_feature


def get_graph_feature(dataset, path, relation_mapping=None):
    graph_feature = []
    
    if relation_mapping is None:
        relation_mapping = {'nan': 0}  # 将np.nan映射为0
        next_relation_id = 1
    
    for transcription_id in dataset:       
        with open(path / f"{transcription_id}.txt", "r") as graph_file:
            edges = []
            relations = []
            for line in graph_file:
                parts = line.split()
                source, relation, target = int(parts[0]), parts[1], int(parts[2])
                
                if relation not in relation_mapping:
                    relation_mapping[relation] = next_relation_id
                    next_relation_id += 1
                
                edges.append((source, target, {'relation': relation}))
                relations.append(relation_mapping[relation])
            
        G = nx.DiGraph()
        G.add_edges_from(edges)
        
        node_degrees = dict(G.degree())
        
        # 添加中心性度量，这里以度中心性为例
        degree_centrality = nx.degree_centrality(G)
        
        # 处理叶子节点，将关系设置为nan
        for node in G.nodes:
            if G.out_degree(node) == 0:
                relations.append(0)  # 将np.nan映射为0
        
        # 组合节点的度、关系、和中心性度量
        combined_feature = list(zip(node_degrees.values(), relations, degree_centrality.values()))
        graph_feature.extend(combined_feature)
    
    return graph_feature, relation_mapping



def get_label(dataset, label_file):
    labels = []
    with open(label_file, "r") as file:
        all_labels = json.load(file)
    for transcription_id in dataset:  
        labels += all_labels[transcription_id]
    return labels

### Use only the text feature

#### load X and y

In [34]:
training_set, validate_set, test_set = split_dataset(validate=True)

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")

X_validate = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
y_validate = get_label(validate_set, "training_labels.json")

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

#### naive_baseline: 
##### all utterances are predicted important (label 1)

In [43]:
y_pred = [1] * len(y_validate)

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))
# print accuracy
print(sum([1 if y_pred[i] == y_validate[i] else 0 for i in range(len(y_validate))]) / len(y_validate))

0.0
0.8179729384733214


#### text_baseline(Decision Tree): 
##### utterances are embedded with SentenceTransformer, then train a Decision Tree classifier.

In [70]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))
    

0.10745233968804159


#### text_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, then train a Random Forest classifier.

In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, max_depth=5, criterion='gini', n_jobs=-1, random_state=0)
clf.fit(X_training, y_training)

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.25344036697247707


In [15]:
# grid search
from sklearn.ensemble import RandomForestClassifier

best_score = 0
for n_estimators in [20, 35, 50, 75]:
    for max_depth in [25, 30, 35]:

        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion='gini', n_jobs=-1, random_state=0)
        clf.fit(X_training, y_training)

        y_pred = clf.predict(X_validate).tolist()
        
        score = f1_score(y_validate, y_pred, average='binary')
        if score > best_score:
            best_score = score
            best_parameter = [n_estimators, max_depth]

# print F1 score
print("best_score: ", best_score)
print("best_parameter: ", best_parameter)

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

Running with n_estimators=100 and max_depth=10
Running with n_estimators=100 and max_depth=20
Running with n_estimators=200 and max_depth=10
Running with n_estimators=200 and max_depth=20
Running with n_estimators=300 and max_depth=10
Running with n_estimators=300 and max_depth=20
best_score:  0.26552706552706556
best_parameter:  [100, 20]


#### text_baseline(LSTM): 
##### utterances are embedded with SentenceTransformer, then train a Decision Tree classifier.

In [9]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        # 取最后一个时间步的输出
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        output = self.sigmoid(output)
        return output

In [26]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# 定义模型的超参数
sequence_length = 10 # 假设你的每个对话的长度为 sequence_length
input_size = X_training.shape[1]
hidden_size = 64
num_layers = 2
output_size = 1  # 二元分类任务

# 初始化模型
model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)

# 定义损失函数和优化器
criterion = nn.BCELoss()  # 二元交叉熵损失
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 转换为 PyTorch 张量
X_training_tensor = torch.tensor(X_training)
y_training_tensor = torch.tensor(y_training, dtype=torch.int)
X_validate_tensor = torch.tensor(X_validate)
y_validate_tensor = torch.tensor(y_validate, dtype=torch.int)

# 创建 TensorDataset
train_dataset = TensorDataset(X_training_tensor, y_training_tensor)
validate_dataset = TensorDataset(X_validate_tensor, y_validate_tensor)

# 使用 DataLoader 加载数据
batch_size = 64  # 你可以根据需要调整批量大小
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validate_dataloader = DataLoader(validate_dataset, batch_size=1, shuffle=True)

# 模型训练
num_epochs = 25
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        inputs = inputs.unsqueeze(1)
        labels = labels.unsqueeze(1)
        # 梯度清零
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(inputs)
        
        # 计算损失
        loss = criterion(outputs, labels.float())
        
        # 反向传播
        loss.backward()
        
        # 参数更新
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/25], Loss: 0.1977
Epoch [2/25], Loss: 0.4408
Epoch [3/25], Loss: 0.3709
Epoch [4/25], Loss: 0.3527
Epoch [5/25], Loss: 0.2323
Epoch [6/25], Loss: 0.2660
Epoch [7/25], Loss: 0.3239
Epoch [8/25], Loss: 0.4742
Epoch [9/25], Loss: 0.3395
Epoch [10/25], Loss: 0.4157
Epoch [11/25], Loss: 0.3573
Epoch [12/25], Loss: 0.2794
Epoch [13/25], Loss: 0.2279
Epoch [14/25], Loss: 0.2168
Epoch [15/25], Loss: 0.2851
Epoch [16/25], Loss: 0.3030
Epoch [17/25], Loss: 0.3077
Epoch [18/25], Loss: 0.2993
Epoch [19/25], Loss: 0.2509
Epoch [20/25], Loss: 0.3114
Epoch [21/25], Loss: 0.2200
Epoch [22/25], Loss: 0.2309
Epoch [23/25], Loss: 0.2212
Epoch [24/25], Loss: 0.3107
Epoch [25/25], Loss: 0.2190


In [58]:
from sklearn.metrics import f1_score

# 模型评估
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, labels in validate_dataloader:
        inputs = inputs.unsqueeze(1)
        labels = labels.unsqueeze(1)
        # 预测
        outputs = model(inputs)
        predictions = (outputs >= 0.5).int()
        
        # 保存预测值和标签
        all_predictions.extend(predictions.numpy())
        all_labels.extend(labels.numpy())

# 计算f1-score
f1 = f1_score(all_labels, all_predictions)
print(f'F1-Score on validation set: {f1:.4f}')


F1-Score on validation set: 0.3367


#### generate submission

In [25]:
training_set, test_set = split_dataset()

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")


Batches:   0%|          | 0/2270 [00:00<?, ?it/s]

In [30]:
test_labels = {}
model.eval()
with torch.no_grad():
    for transcription_id in test_set:
        with open(path_to_test / f"{transcription_id}.json", "r") as file:
            transcription = json.load(file)
        
        X_test = []
        for utterance in transcription:
            X_test.append(utterance["speaker"] + ": " + utterance["text"])
        
        X_test = bert.encode(X_test)
        X_test = torch.tensor(X_test).unsqueeze(1)

        # y_test = clf.predict(X_test)
        outputs = model(X_test)
        y_test = (outputs >= 0.5).int()
        y_test = y_test.squeeze(1)
        test_labels[transcription_id] = y_test.tolist()

with open("test_labels_text_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

### Use the combination of text and graph feature

#### combine_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, node degrees are used as graph feature, then train a Random Forest classifier.

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

training_set, validate_set, test_set = split_dataset(validate=True)

train_text_feature = get_text_feature(training_set, path_to_training)
train_graph_feature, relation_mapping = get_graph_feature(training_set, path_to_training)
X_training = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(train_text_feature, train_graph_feature)]
y_training = get_label(training_set, "training_labels.json")

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

validate_text_feature = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
validate_graph_feature, _ = get_graph_feature(validate_set, path_to_training, relation_mapping)
X_validate = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(validate_text_feature, validate_graph_feature)]
y_validate = get_label(validate_set, "training_labels.json")

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

### Two model

In [69]:
training_set, validate_set, test_set = split_dataset(validate=True)

text_feature_training = get_text_feature(training_set, path_to_training)
graph_feature_training, relation_mapping = get_graph_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")

text_feature_validate = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
graph_feature_validate, _ = get_graph_feature(validate_set, path_to_training, relation_mapping)
y_validate = get_label(validate_set, "training_labels.json")

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

#### Use LSTM for text and Decision Tree for graph 

In [70]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        # 取最后一个时间步的输出
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        output = self.sigmoid(output)
        return output

In [73]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# 定义模型的超参数
sequence_length = 10 # 假设你的每个对话的长度为 sequence_length
input_size = text_feature_training.shape[1]
hidden_size = 64
num_layers = 2
output_size = 1  # 二元分类任务

# 初始化模型
model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)

# 定义损失函数和优化器
criterion = nn.BCELoss()  # 二元交叉熵损失
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 转换为 PyTorch 张量
X_training_tensor = torch.tensor(text_feature_training)
y_training_tensor = torch.tensor(y_training, dtype=torch.int)
X_validate_tensor = torch.tensor(text_feature_validate)
y_validate_tensor = torch.tensor(y_validate, dtype=torch.int)

# 创建 TensorDataset
train_dataset = TensorDataset(X_training_tensor, y_training_tensor)
validate_dataset = TensorDataset(X_validate_tensor, y_validate_tensor)

# 使用 DataLoader 加载数据
batch_size = 64  # 你可以根据需要调整批量大小
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validate_dataloader = DataLoader(validate_dataset, batch_size=1)

# 模型训练
num_epochs = 25
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        inputs = inputs.unsqueeze(1)
        labels = labels.unsqueeze(1)
        # 梯度清零
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(inputs)
        
        # 计算损失
        loss = criterion(outputs, labels.float())
        
        # 反向传播
        loss.backward()
        
        # 参数更新
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/25], Loss: 0.2749
Epoch [2/25], Loss: 0.2464
Epoch [3/25], Loss: 0.4448
Epoch [4/25], Loss: 0.4203
Epoch [5/25], Loss: 0.2241
Epoch [6/25], Loss: 0.4276
Epoch [7/25], Loss: 0.3122
Epoch [8/25], Loss: 0.4052
Epoch [9/25], Loss: 0.3080
Epoch [10/25], Loss: 0.3564
Epoch [11/25], Loss: 0.3116
Epoch [12/25], Loss: 0.3891
Epoch [13/25], Loss: 0.3477
Epoch [14/25], Loss: 0.1943
Epoch [15/25], Loss: 0.3347
Epoch [16/25], Loss: 0.3200
Epoch [17/25], Loss: 0.3741
Epoch [18/25], Loss: 0.2742
Epoch [19/25], Loss: 0.2071
Epoch [20/25], Loss: 0.3553
Epoch [21/25], Loss: 0.3274
Epoch [22/25], Loss: 0.1723
Epoch [23/25], Loss: 0.2498
Epoch [24/25], Loss: 0.4203
Epoch [25/25], Loss: 0.3000


In [74]:
# 模型评估
model.eval()
y_text_pred = []
text_prob = []

with torch.no_grad():
    for inputs, labels in validate_dataloader:
        inputs = inputs.unsqueeze(1)
        labels = labels.unsqueeze(1)
        # 预测
        outputs = model(inputs)
        predictions = (outputs >= 0.5).int()
        
        # 保存预测值和标签
        text_prob.extend(outputs.numpy())
        y_text_pred.extend(predictions.numpy().tolist())

# 计算f1-score
y_text_pred = np.array(y_text_pred).flatten().tolist()
f1 = f1_score(y_validate, y_text_pred)
print(f'F1-Score on validation set: {f1:.4f}')

F1-Score on validation set: 0.5541


In [85]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(min_samples_leaf=1, min_samples_split=6, random_state=0)
clf.fit(graph_feature_training, y_training)

y_graph_pred = clf.predict(graph_feature_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_graph_pred, average='binary'))

0.08633093525179857


In [92]:
# grid search
from sklearn.tree import DecisionTreeClassifier

best_score = 0
for min_samples_leaf in [1,3,5,7]:
    for min_samples_split in [2,6,10]:

        clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split, random_state=0)
        clf.fit(graph_feature_training, y_training)

        y_pred = clf.predict(graph_feature_validate).tolist()
        
        score = f1_score(y_validate, y_pred, average='binary')
        if score > best_score:
            best_score = score
            best_parameter = [min_samples_leaf, min_samples_split]

# print F1 score
print("best_score: ", best_score)
print("best_parameter: ", best_parameter)

best_score:  0.09194029850746267
best_parameter:  [1, 6]


In [83]:
from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=100, max_depth=25, objective='binary:logistic', n_jobs=-1, random_state=0)
clf.fit(graph_feature_training, y_training)

y_graph_pred = clf.predict(graph_feature_validate).tolist()
graph_prob = clf.predict_proba(graph_feature_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_graph_pred, average='binary'))
graph_prob = [prob[1] for prob in graph_prob]

0.11602527283170591


In [82]:
# grid search
from xgboost import XGBClassifier

best_score = 0
for n_estimators in [25, 50, 75, 100, 125]:
    for max_depth in [5, 15, 25, 35, 45]:

        clf = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
        clf.fit(graph_feature_training, y_training)

        y_pred = clf.predict(graph_feature_validate).tolist()
        
        score = f1_score(y_validate, y_pred, average='binary')
        if score > best_score:
            best_score = score
            best_parameter = [n_estimators, max_depth]

# print F1 score
print("best_score: ", best_score)
print("best_parameter: ", best_parameter)

best_score:  0.11602527283170591
best_parameter:  [100, 25]


In [86]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 将 LSTM 和 XGBoost 的预测结果水平拼接
combined_predictions = np.column_stack((y_text_pred, y_graph_pred))

# 划分数据集为训练和测试集
X_train, X_test, y_train, y_test = train_test_split(combined_predictions, y_validate, test_size=0.2, random_state=6969)

# 初始化逻辑回归模型（或其他模型）
logistic_model = LogisticRegression()

# 训练逻辑回归模型
logistic_model.fit(X_train, y_train)

# 进行预测
final_predictions = logistic_model.predict(X_test)

# 计算 F1-Score
f1 = f1_score(y_test, final_predictions)
print(f'F1-Score on validation set: {f1:.4f}')

F1-Score on validation set: 0.5765


In [None]:
# 将 LSTM 和 XGBoost 的预测结果水平拼接
combined_predictions = np.column_stack((y_text_pred, y_graph_pred))
# 初始化逻辑回归模型（或其他模型）
logistic_model = LogisticRegression()
# 训练逻辑回归模型
logistic_model.fit(combined_predictions, y_pred)

In [4]:
training_set, validate_set, test_set = split_dataset(validate=True)
y_training = get_label(training_set, "training_labels.json")

### Use GNN for combined feature

In [2]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

def get_combine_feature(dataset, path, label_file, relation_mapping=None, for_test=False):
    graph_dataset = []
    if not for_test:
        with open(label_file, "r") as file:
            all_labels = json.load(file)
        
    # 读取图数据
    if relation_mapping is None:
        relation_mapping = {'nan': 0}  # 将np.nan映射为0
        next_relation_id = 1
    for transcription_id in dataset:       
        with open(path / f"{transcription_id}.txt", "r") as graph_file:
            lines = graph_file.readlines()

        edges_list = []
        for line in lines:
            parts = line.split()
            if len(parts) == 3:
                src, relation, dest = int(parts[0]), parts[1], int(parts[2])
                if relation not in relation_mapping:
                    relation_mapping[relation] = next_relation_id
                    next_relation_id += 1
                edges_list.append((src, dest, relation_mapping[relation]))

        # 读取节点属性
        text_feature = []
        with open(path / f"{transcription_id}.json", "r") as text_file:
            transcription = json.load(text_file)
        for utterance in transcription:
            text_feature.append(utterance["speaker"] + ": " + utterance["text"])
        node_features = bert.encode(text_feature)
        # node_features = torch.ones((len(all_labels[transcription_id]), 1), dtype=torch.float)
        node_attributes = torch.tensor(node_features)

        # 创建 PyTorch Geometric Data 对象
        x = torch.tensor(node_attributes, dtype=torch.float)

        # 将边列表转换为 PyTorch Geometric edge_index
        src_nodes, dest_nodes, relations = zip(*edges_list)
        edge_index = torch.tensor([src_nodes, dest_nodes], dtype=torch.long)

        # 将边属性转换为 PyTorch Geometric edge_attr
        edge_attr = torch.tensor(relations, dtype=torch.float).view(1, -1)

        # 创建 PyTorch Geometric Data 对象
        if for_test:
            data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
        else:
            data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=torch.tensor(all_labels[transcription_id]))
        
        graph_dataset.append(data)

    return graph_dataset, relation_mapping


In [32]:
# 处理多个图的数据
training_set, validate_set, test_set = split_dataset(validate=True)

train_dataset, relation_mapping = get_combine_feature(training_set, path_to_training, "training_labels.json")
validate_dataset, _ = get_combine_feature(validate_set, path_to_training, "training_labels.json", relation_mapping)

# 创建 DataLoader 用于批处理
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
validate_loader = DataLoader(validate_dataset, batch_size=1, shuffle=True)

# 输出第一个图的信息
print(train_loader.dataset[0])
print(validate_loader.dataset[1])

  x = torch.tensor(node_attributes, dtype=torch.float)


Data(x=[1095, 384], edge_index=[2, 1094], edge_attr=[1, 1094], y=[1095])
Data(x=[1017, 384], edge_index=[2, 1016], edge_attr=[1, 1016], y=[1017])


In [53]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
import torch.nn.functional as F
from sklearn.metrics import f1_score

def eval(model):
    # 评估模型
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in validate_loader:
            output = model(batch)
            predictions = torch.argmax(output, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch.y.cpu().numpy())

    # 计算F1-Score
    f1 = f1_score(all_labels, all_predictions, average='binary')
    return f1

# 定义 GNN 模型
class GCNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x


class GATModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_heads=1):
        super(GATModel, self).__init__()
        self.convs = nn.ModuleList([
            GATConv(input_dim, hidden_dim, heads=num_heads),
            GATConv(hidden_dim * num_heads, hidden_dim, heads=num_heads)
        ])
        # Fully connected layer for classification
        self.fc = nn.Linear(hidden_dim * num_heads, output_dim)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
            x = self.dropout(x)
        x = self.fc(x)
        return x


class GraphSAGEModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.3):
        super(GraphSAGEModel, self).__init__()
    #     self.convs = nn.ModuleList([
    #         SAGEConv(input_dim, hidden_dim),
    #         SAGEConv(hidden_dim, output_dim)
    #     ])
    #     self.dropout = nn.Dropout(p=0.3)

    # def forward(self, data):
    #     x, edge_index, batch = data.x, data.edge_index, data.batch
    #     for conv in self.convs:
    #         x = conv(x, edge_index)
    #         x = F.relu(x)
    #         x = self.dropout(x)
    #     return x
        # List to hold multiple GraphSAGE layers
        self.convs = nn.ModuleList([
            SAGEConv(input_dim, hidden_dim*4),
            SAGEConv(hidden_dim*4, hidden_dim*2),
            SAGEConv(hidden_dim*2, hidden_dim),
            SAGEConv(hidden_dim, hidden_dim)
        ])

        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Tanh()
        )

        # Output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Dropout for regularization
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # Apply multiple GraphSAGE layers
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
            x = self.dropout(x)

        # Attention mechanism
        attention_weights = F.softmax(self.attention(x), dim=0)
        x = x * attention_weights

        # Fully connected layer for final prediction
        x = self.fc(x)

        return x


# 初始化模型、损失函数和优化器
# model = GCNModel(input_dim=384, hidden_dim=64, output_dim=2)
# model = GATModel(input_dim=384, hidden_dim=128, output_dim=2, num_heads=2)
model = GraphSAGEModel(input_dim=384, hidden_dim=32, output_dim=2, dropout=0.2)
criterion = nn.CrossEntropyLoss()
# criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# 训练模型
best_val_f1 = 0.52
num_epochs = 90
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        output = model(batch)
        y = batch.y
        # y = y.float().unsqueeze(1)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch+1) % 3 == 0:
        f1 = eval(model)
        if f1 > best_val_f1:
            best_val_f1 = f1
            torch.save(model.state_dict(), "best_model.pth")
    if (epoch+1) % 9 == 0:
        average_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}, F1-Score: {f1}")
        
print("best val f1: ", best_val_f1)

Epoch 9/90, Loss: 0.35999660566449165, F1-Score: 0.0
Epoch 18/90, Loss: 0.33149401541976703, F1-Score: 0.0027855153203342614
Epoch 27/90, Loss: 0.2943440569298608, F1-Score: 0.5412234042553192
Epoch 36/90, Loss: 0.26280111721938565, F1-Score: 0.5421605904719301
Epoch 45/90, Loss: 0.21351825006838357, F1-Score: 0.5278396436525613
Epoch 54/90, Loss: 0.18835431261963786, F1-Score: 0.5341414141414141
Epoch 63/90, Loss: 0.16288100708542125, F1-Score: 0.5144380275433141
Epoch 72/90, Loss: 0.1397091999672176, F1-Score: 0.4738206324520477
Epoch 81/90, Loss: 0.1261899534867899, F1-Score: 0.48310328415040454
Epoch 90/90, Loss: 0.11468902291796569, F1-Score: 0.4638507385332988
best val f1:  0.5549695740365111


#### generate result

In [26]:
# 处理多个图的数据
training_set, test_set = split_dataset()

train_dataset, relation_mapping = get_combine_feature(training_set, path_to_training, "training_labels.json")
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

  x = torch.tensor(node_attributes, dtype=torch.float)


In [54]:
model.load_state_dict(torch.load("best_model.pth"))

<All keys matched successfully>

In [55]:
test_labels = {}
model.eval()
with torch.no_grad():
    for transcription_id in test_set:       
        with open(path_to_test / f"{transcription_id}.txt", "r") as graph_file:
            lines = graph_file.readlines()

        edges_list = []
        for line in lines:
            parts = line.split()
            if len(parts) == 3:
                src, relation, dest = int(parts[0]), parts[1], int(parts[2])
                edges_list.append((src, dest, relation_mapping[relation]))

        # 读取节点属性
        text_feature = []
        with open(path_to_test / f"{transcription_id}.json", "r") as text_file:
            transcription = json.load(text_file)
        for utterance in transcription:
            text_feature.append(utterance["speaker"] + ": " + utterance["text"])
        node_features = bert.encode(text_feature)
        # node_features = torch.ones((len(all_labels[transcription_id]), 1), dtype=torch.float)
        node_attributes = torch.tensor(node_features)

        # 创建 PyTorch Geometric Data 对象
        x = torch.tensor(node_attributes, dtype=torch.float)

        # 将边列表转换为 PyTorch Geometric edge_index
        src_nodes, dest_nodes, relations = zip(*edges_list)
        edge_index = torch.tensor([src_nodes, dest_nodes], dtype=torch.long)

        # 将边属性转换为 PyTorch Geometric edge_attr
        edge_attr = torch.tensor(relations, dtype=torch.float).view(1, -1)

        # 创建 PyTorch Geometric Data 对象
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
        outputs = model(data)
        predictions = torch.argmax(outputs, dim=1).int()
        test_labels[transcription_id] = predictions.numpy().tolist()

with open("test_labels_text_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

  x = torch.tensor(node_attributes, dtype=torch.float)
