#### split training and test sets of transcription ids

In [4]:
import json
from pathlib import Path
from sklearn.metrics import f1_score

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("training")
path_to_test = Path("test")

def split_dataset(validate=False):
    training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
    training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
    training_set.remove('IS1002a')
    training_set.remove('IS1005d')
    training_set.remove('TS3012c')

    test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
    test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])
    
    if validate:
        # randomly select 10% of training set as validation set
        import random
        random.seed(6969)
        validate_set = random.choices(training_set, k=int(len(training_set)*0.1))
        training_set = list(set(training_set) - set(validate_set))
        return training_set, validate_set, test_set

    return training_set, test_set

#### functions to get features

In [66]:
from sentence_transformers import SentenceTransformer
import networkx as nx
bert = SentenceTransformer('all-MiniLM-L6-v2')


def get_text_feature(dataset, path, show_progress_bar=True):
    text_feature = []
    for transcription_id in dataset:
        with open(path / f"{transcription_id}.json", "r") as text_file:
            transcription = json.load(text_file)
        
        for utterance in transcription:
            text_feature.append(utterance["speaker"] + ": " + utterance["text"])

    text_feature = bert.encode(text_feature, show_progress_bar=show_progress_bar)
    return text_feature


import networkx as nx
import numpy as np
from itertools import chain

def get_graph_feature(dataset, path, show_progress_bar=True):
    graph_feature = []
    
    relation_mapping = {'nan': 0}  # 将np.nan映射为0
    next_relation_id = 1
    
    for transcription_id in dataset:       
        with open(path / f"{transcription_id}.txt", "r") as graph_file:
            edges = []
            relations = []
            for line in graph_file:
                parts = line.split()
                source, relation, target = int(parts[0]), parts[1], int(parts[2])
                
                if relation not in relation_mapping:
                    relation_mapping[relation] = next_relation_id
                    next_relation_id += 1
                
                edges.append((source, target, {'relation': relation}))
                relations.append(relation_mapping[relation])
            
        G = nx.DiGraph()
        G.add_edges_from(edges)
        
        node_degrees = dict(G.degree())
        
        # 添加中心性度量，这里以度中心性为例
        degree_centrality = nx.degree_centrality(G)
        
        # 处理叶子节点，将关系设置为nan
        for node in G.nodes:
            if G.out_degree(node) == 0:
                relations.append(0)  # 将np.nan映射为0
        
        # 组合节点的度、关系、和中心性度量
        combined_feature = list(zip(node_degrees.values(), relations, degree_centrality.values()))
        graph_feature.extend(combined_feature)
    
    return graph_feature



def get_label(dataset, label_file):
    labels = []
    with open(label_file, "r") as file:
        all_labels = json.load(file)
    for transcription_id in dataset:  
        labels += all_labels[transcription_id]
    return labels

### Use only the text feature

#### load X and y

In [34]:
training_set, validate_set, test_set = split_dataset(validate=True)

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")

X_validate = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
y_validate = get_label(validate_set, "training_labels.json")

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

#### naive_baseline: 
##### all utterances are predicted important (label 1)

In [43]:
y_pred = [1] * len(y_validate)

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))
# print accuracy
print(sum([1 if y_pred[i] == y_validate[i] else 0 for i in range(len(y_validate))]) / len(y_validate))

0.0
0.8179729384733214


#### text_baseline(Decision Tree): 
##### utterances are embedded with SentenceTransformer, then train a Decision Tree classifier.

In [70]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))
    

0.10745233968804159


#### text_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, then train a Random Forest classifier.

In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, max_depth=5, criterion='gini', n_jobs=-1, random_state=0)
clf.fit(X_training, y_training)

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.25344036697247707


In [15]:
# grid search
from sklearn.ensemble import RandomForestClassifier

best_score = 0
for n_estimators in [20, 35, 50, 75]:
    for max_depth in [25, 30, 35]:

        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion='gini', n_jobs=-1, random_state=0)
        clf.fit(X_training, y_training)

        y_pred = clf.predict(X_validate).tolist()
        
        score = f1_score(y_validate, y_pred, average='binary')
        if score > best_score:
            best_score = score
            best_parameter = [n_estimators, max_depth]

# print F1 score
print("best_score: ", best_score)
print("best_parameter: ", best_parameter)

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

Running with n_estimators=100 and max_depth=10
Running with n_estimators=100 and max_depth=20
Running with n_estimators=200 and max_depth=10
Running with n_estimators=200 and max_depth=20
Running with n_estimators=300 and max_depth=10
Running with n_estimators=300 and max_depth=20
best_score:  0.26552706552706556
best_parameter:  [100, 20]


In [9]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        # 取最后一个时间步的输出
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        output = self.sigmoid(output)
        return output

In [26]:
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# 定义模型的超参数
sequence_length = 10 # 假设你的每个对话的长度为 sequence_length
input_size = X_training.shape[1]
hidden_size = 64
num_layers = 2
output_size = 1  # 二元分类任务

# 初始化模型
model = LSTMClassifier(input_size, hidden_size, num_layers, output_size)

# 定义损失函数和优化器
criterion = nn.BCELoss()  # 二元交叉熵损失
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 转换为 PyTorch 张量
X_training_tensor = torch.tensor(X_training)
y_training_tensor = torch.tensor(y_training, dtype=torch.int)
X_validate_tensor = torch.tensor(X_validate)
y_validate_tensor = torch.tensor(y_validate, dtype=torch.int)

# 创建 TensorDataset
train_dataset = TensorDataset(X_training_tensor, y_training_tensor)
validate_dataset = TensorDataset(X_validate_tensor, y_validate_tensor)

# 使用 DataLoader 加载数据
batch_size = 64  # 你可以根据需要调整批量大小
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validate_dataloader = DataLoader(validate_dataset, batch_size=1, shuffle=True)

# 模型训练
num_epochs = 25
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        inputs = inputs.unsqueeze(1)
        labels = labels.unsqueeze(1)
        # 梯度清零
        optimizer.zero_grad()
        
        # 前向传播
        outputs = model(inputs)
        
        # 计算损失
        loss = criterion(outputs, labels.float())
        
        # 反向传播
        loss.backward()
        
        # 参数更新
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/25], Loss: 0.1977
Epoch [2/25], Loss: 0.4408
Epoch [3/25], Loss: 0.3709
Epoch [4/25], Loss: 0.3527
Epoch [5/25], Loss: 0.2323
Epoch [6/25], Loss: 0.2660
Epoch [7/25], Loss: 0.3239
Epoch [8/25], Loss: 0.4742
Epoch [9/25], Loss: 0.3395
Epoch [10/25], Loss: 0.4157
Epoch [11/25], Loss: 0.3573
Epoch [12/25], Loss: 0.2794
Epoch [13/25], Loss: 0.2279
Epoch [14/25], Loss: 0.2168
Epoch [15/25], Loss: 0.2851
Epoch [16/25], Loss: 0.3030
Epoch [17/25], Loss: 0.3077
Epoch [18/25], Loss: 0.2993
Epoch [19/25], Loss: 0.2509
Epoch [20/25], Loss: 0.3114
Epoch [21/25], Loss: 0.2200
Epoch [22/25], Loss: 0.2309
Epoch [23/25], Loss: 0.2212
Epoch [24/25], Loss: 0.3107
Epoch [25/25], Loss: 0.2190


In [24]:
from sklearn.metrics import f1_score

# 模型评估
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, labels in validate_dataloader:
        inputs = inputs.unsqueeze(1)
        labels = labels.unsqueeze(1)
        # 预测
        outputs = model(inputs)
        predictions = (outputs >= 0.5).int()
        
        # 保存预测值和标签
        all_predictions.extend(predictions.numpy())
        all_labels.extend(labels.numpy())

# 计算f1-score
f1 = f1_score(all_labels, all_predictions)
print(f'F1-Score on validation set: {f1:.4f}')


F1-Score on validation set: 0.4972


#### generate submission

In [25]:
training_set, test_set = split_dataset()

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")


Batches:   0%|          | 0/2270 [00:00<?, ?it/s]

In [30]:
test_labels = {}
model.eval()
with torch.no_grad():
    for transcription_id in test_set:
        with open(path_to_test / f"{transcription_id}.json", "r") as file:
            transcription = json.load(file)
        
        X_test = []
        for utterance in transcription:
            X_test.append(utterance["speaker"] + ": " + utterance["text"])
        
        X_test = bert.encode(X_test)
        X_test = torch.tensor(X_test).unsqueeze(1)

        # y_test = clf.predict(X_test)
        outputs = model(X_test)
        y_test = (outputs >= 0.5).int()
        y_test = y_test.squeeze(1)
        test_labels[transcription_id] = y_test.tolist()

with open("test_labels_text_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

### Use the combination of text and graph feature

#### combine_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, node degrees are used as graph feature, then train a Random Forest classifier.

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

training_set, validate_set, test_set = split_dataset(validate=True)

train_text_feature = get_text_feature(training_set, path_to_training)
train_graph_feature = get_graph_feature(training_set, path_to_training)
X_training = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(train_text_feature, train_graph_feature)]
y_training = get_label(training_set, "training_labels.json")

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

validate_text_feature = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
validate_graph_feature = get_graph_feature(validate_set, path_to_training)
X_validate = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(validate_text_feature, validate_graph_feature)]
y_validate = get_label(validate_set, "training_labels.json")

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.3808854532677442
