#### split training and test sets of transcription ids

In [1]:
import json
from pathlib import Path
from sklearn.metrics import f1_score

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("training")
path_to_test = Path("test")

def split_dataset(validate=False):
    training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
    training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
    training_set.remove('IS1002a')
    training_set.remove('IS1005d')
    training_set.remove('TS3012c')

    test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
    test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])
    
    if validate:
        # randomly select 10% of training set as validation set
        import random
        random.seed(6969)
        validate_set = random.choices(training_set, k=int(len(training_set)*0.1))
        training_set = list(set(training_set) - set(validate_set))
        return training_set, validate_set, test_set

    return training_set, test_set

#### functions to get features

In [8]:
from sentence_transformers import SentenceTransformer
import networkx as nx
bert = SentenceTransformer('all-MiniLM-L6-v2')


def get_text_feature(dataset, path, show_progress_bar=True):
    text_feature = []
    for transcription_id in dataset:
        with open(path / f"{transcription_id}.json", "r") as text_file:
            transcription = json.load(text_file)
        
        for utterance in transcription:
            text_feature.append(utterance["speaker"] + ": " + utterance["text"])

    text_feature = bert.encode(text_feature, show_progress_bar=show_progress_bar)
    return text_feature


def get_graph_feature(dataset, path):
    graph_feature = []
    for transcription_id in dataset:       
        with open(path / f"{transcription_id}.txt", "r") as graph_file:
            edges = []
            for line in graph_file:
                parts = line.split()
                source, relation, target = int(parts[0]), parts[1], int(parts[2])
                edges.append((source, target, {'relation': relation}))
            
        G = nx.DiGraph()
        G.add_edges_from(edges)
        # 提取图特征（这里使用节点的度作为特征）
        node_degrees = dict(G.degree())
        graph_feature += list(node_degrees.values())
    
    return graph_feature


def get_label(dataset, label_file):
    labels = []
    with open(label_file, "r") as file:
        all_labels = json.load(file)
    for transcription_id in dataset:  
        labels += all_labels[transcription_id]
    return labels

#### naive_baseline: 
##### all utterances are predicted important (label 1)

In [3]:
training_set, validate_set, test_set = split_dataset(validate=True)

y_validate = get_label(validate_set, "training_labels.json")
y_pred = [1] * len(y_validate)

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

0.3079913606911447


#### text_baseline(Decision Tree): 
##### utterances are embedded with SentenceTransformer, then train a Decision Tree classifier.

In [4]:
from sklearn.tree import DecisionTreeClassifier

training_set, validate_set, test_set = split_dataset(validate=True)

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

X_validate = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
y_validate = get_label(validate_set, "training_labels.json")

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))
    

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

Batches:   0%|          | 0/245 [00:00<?, ?it/s]

0.38149063935005295


#### text_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, then train a Random Forest classifier.

In [29]:
from sklearn.ensemble import RandomForestClassifier

training_set, validate_set, test_set = split_dataset(validate=True)

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")

clf = RandomForestClassifier(n_estimators=50, max_depth=5, criterion='gini', n_jobs=-1, random_state=0)
clf.fit(X_training, y_training)

X_validate = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
y_validate = get_label(validate_set, "training_labels.json")

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.25344036697247707


In [15]:
# grid search
from sklearn.ensemble import RandomForestClassifier

training_set, validate_set, test_set = split_dataset(validate=True)

X_training = get_text_feature(training_set, path_to_training)
y_training = get_label(training_set, "training_labels.json")

best_score = 0
for n_estimators in [20, 35, 50, 75]:
    for max_depth in [25, 30, 35]:

        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion='gini', n_jobs=-1, random_state=0)
        clf.fit(X_training, y_training)

        X_validate = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
        y_validate = get_label(validate_set, "training_labels.json")

        y_pred = clf.predict(X_validate).tolist()
        
        score = f1_score(y_validate, y_pred, average='binary')
        if score > best_score:
            best_score = score
            best_parameter = [n_estimators, max_depth]

# print F1 score
print("best_score: ", best_score)
print("best_parameter: ", best_parameter)

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

Running with n_estimators=100 and max_depth=10
Running with n_estimators=100 and max_depth=20
Running with n_estimators=200 and max_depth=10
Running with n_estimators=200 and max_depth=20
Running with n_estimators=300 and max_depth=10
Running with n_estimators=300 and max_depth=20
best_score:  0.26552706552706556
best_parameter:  [100, 20]


#### combine_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, node degrees are used as graph feature, then train a Random Forest classifier.

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

training_set, validate_set, test_set = split_dataset(validate=True)

train_text_feature = get_text_feature(training_set, path_to_training)
train_graph_feature = get_graph_feature(training_set, path_to_training)
X_training = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(train_text_feature, train_graph_feature)]
y_training = get_label(training_set, "training_labels.json")

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

validate_text_feature = get_text_feature(validate_set, path_to_training, show_progress_bar=False)
validate_graph_feature = get_graph_feature(validate_set, path_to_training)
X_validate = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(validate_text_feature, validate_graph_feature)]
y_validate = get_label(validate_set, "training_labels.json")

y_pred = clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.3808854532677442
