#### split training and test sets of transcription ids

In [1]:
import json
from pathlib import Path
from sklearn.metrics import f1_score

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("training")
path_to_test = Path("test")

def split_dataset(validate=False):
    training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
    training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
    training_set.remove('IS1002a')
    training_set.remove('IS1005d')
    training_set.remove('TS3012c')

    test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
    test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])
    
    if validate:
        # randomly select 10% of training set as validation set
        import random
        random.seed(6969)
        validate_set = random.choices(training_set, k=int(len(training_set)*0.1))
        training_set = list(set(training_set) - set(validate_set))
        return training_set, validate_set, test_set

    return training_set, test_set

#### naive_baseline: 
##### all utterances are predicted important (label 1)

In [26]:
training_set, validate_set, test_set = split_dataset(validate=True)
y_validate, y_pred = [], []
with open("training_labels.json", "r") as file:
    training_labels = json.load(file)
for transcription_id in validate_set:  
    y_validate += training_labels[transcription_id]
    y_pred += [1] * len(training_labels[transcription_id])

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

0.3079913606911447


#### text_baseline(Decision Tree): 
##### utterances are embedded with SentenceTransformer, then train a Decision Tree classifier.

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

training_set, validate_set, test_set = split_dataset(validate=True)

y_training = []
with open("training_labels.json", "r") as file:
    training_labels = json.load(file)
X_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    
    for utterance in transcription:
        X_training.append(utterance["speaker"] + ": " + utterance["text"])
    
    y_training += training_labels[transcription_id]

X_training = bert.encode(X_training, show_progress_bar=True)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_training, y_training)

y_pred, y_validate = [], []
for transcription_id in validate_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    
    X_validate = []
    for utterance in transcription:
        X_validate.append(utterance["speaker"] + ": " + utterance["text"])
    
    y_validate += training_labels[transcription_id]
    
    X_validate = bert.encode(X_validate)

    y_pred += clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))
    

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.3863235812477969


#### text_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, then train a Random Forest classifier.

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('all-MiniLM-L6-v2')

training_set, validate_set, test_set = split_dataset(validate=True)

y_training = []
with open("training_labels.json", "r") as file:
    training_labels = json.load(file)
X_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    
    for utterance in transcription:
        X_training.append(utterance["speaker"] + ": " + utterance["text"])
    
    y_training += training_labels[transcription_id]

X_training = bert.encode(X_training, show_progress_bar=True)

clf = RandomForestClassifier(n_estimators=250, max_depth=5, criterion='gini', n_jobs=-1, random_state=0)
clf.fit(X_training, y_training)

y_pred, y_validate = [], []
for transcription_id in validate_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    
    X_validate = []
    for utterance in transcription:
        X_validate.append(utterance["speaker"] + ": " + utterance["text"])
    
    y_validate += training_labels[transcription_id]
    
    X_validate = bert.encode(X_validate)

    y_pred += clf.predict(X_validate).tolist()

# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

0.25344036697247707


In [15]:
# grid search
bert = SentenceTransformer('all-MiniLM-L6-v2')

training_set, validate_set, test_set = split_dataset(validate=True)

y_training = []
with open("training_labels.json", "r") as file:
    training_labels = json.load(file)
X_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    
    for utterance in transcription:
        X_training.append(utterance["speaker"] + ": " + utterance["text"])
    
    y_training += training_labels[transcription_id]

X_training = bert.encode(X_training, show_progress_bar=True)



Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

Running with n_estimators=100 and max_depth=10
Running with n_estimators=100 and max_depth=20
Running with n_estimators=200 and max_depth=10
Running with n_estimators=200 and max_depth=20
Running with n_estimators=300 and max_depth=10
Running with n_estimators=300 and max_depth=20
best_score:  0.26552706552706556
best_parameter:  [100, 20]


In [17]:
best_score = 0
for n_estimators in [20, 35, 50, 75]:
    for max_depth in [25, 30, 35]:
        print(f'Running with n_estimators={n_estimators} and max_depth={max_depth}')
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, criterion='gini', n_jobs=-1, random_state=0)
        clf.fit(X_training, y_training)

        y_pred, y_validate = [], []
        for transcription_id in validate_set:
            with open(path_to_training / f"{transcription_id}.json", "r") as file:
                transcription = json.load(file)
            
            X_validate = []
            for utterance in transcription:
                X_validate.append(utterance["speaker"] + ": " + utterance["text"])
            
            y_validate += training_labels[transcription_id]
            
            X_validate = bert.encode(X_validate)

            y_pred += clf.predict(X_validate).tolist()
        
        score = f1_score(y_validate, y_pred, average='binary')
        if score > best_score:
            best_score = score
            best_parameter = [n_estimators, max_depth]

# print F1 score
print("best_score: ", best_score)
print("best_parameter: ", best_parameter)

Running with n_estimators=20 and max_depth=25
Running with n_estimators=20 and max_depth=30
Running with n_estimators=20 and max_depth=35
Running with n_estimators=35 and max_depth=25
Running with n_estimators=35 and max_depth=30
Running with n_estimators=35 and max_depth=35
Running with n_estimators=50 and max_depth=25
Running with n_estimators=50 and max_depth=30
Running with n_estimators=50 and max_depth=35
Running with n_estimators=75 and max_depth=25
Running with n_estimators=75 and max_depth=30
Running with n_estimators=75 and max_depth=35
best_score:  0.3201621073961499
best_parameter:  [20, 25]


#### combine_baseline(Random Forest): 
##### utterances are embedded with SentenceTransformer, node degrees are used as graph feature, then train a Random Forest classifier.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer
import networkx as nx
import numpy as np
bert = SentenceTransformer('all-MiniLM-L6-v2')

training_set, validate_set, test_set = split_dataset(validate=True)

y_training = []
with open("training_labels.json", "r") as file:
    training_labels = json.load(file)
train_text_feature, train_graph_feature = [], []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as text_file:
        transcription = json.load(text_file)
    
    for utterance in transcription:
        train_text_feature.append(utterance["speaker"] + ": " + utterance["text"])
        
    with open(path_to_training / f"{transcription_id}.txt", "r") as graph_file:
        edges = []
        for line in graph_file:
            parts = line.split()
            source, relation, target = int(parts[0]), parts[1], int(parts[2])
            edges.append((source, target, {'relation': relation}))
        
    G = nx.DiGraph()
    G.add_edges_from(edges)
    # 提取图特征（这里使用节点的度作为特征）
    node_degrees = dict(G.degree())
    train_graph_feature += list(node_degrees.values())
    
    y_training += training_labels[transcription_id]

train_text_feature = bert.encode(train_text_feature, show_progress_bar=True)
X_training = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(train_text_feature, train_graph_feature)]

clf = RandomForestClassifier(n_estimators=250, criterion='gini', n_jobs=-1, random_state=0)
clf.fit(X_training, y_training)

y_pred, y_validate = [], []
for transcription_id in validate_set:
    validate_text_feature = []
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    
    for utterance in transcription:
        validate_text_feature.append(utterance["speaker"] + ": " + utterance["text"])
    
    with open(path_to_training / f"{transcription_id}.txt", "r") as graph_file:
        edges = []
        for line in graph_file:
            parts = line.split()
            source, relation, target = int(parts[0]), parts[1], int(parts[2])
            edges.append((source, target, {'relation': relation}))
        
    G = nx.DiGraph()
    G.add_edges_from(edges)
    node_degrees = dict(G.degree())
    validate_graph_feature = list(node_degrees.values())
    
    validate_text_feature = bert.encode(validate_text_feature)
    X_validate = [np.concatenate((text_feat, [graph_feat])) for text_feat, graph_feat in zip(validate_text_feature, validate_graph_feature)]

    y_pred += clf.predict(X_validate).tolist()
    y_validate += training_labels[transcription_id]
    
# print F1 score
print(f1_score(y_validate, y_pred, average='binary'))

Batches:   0%|          | 0/2058 [00:00<?, ?it/s]

ValueError: X has 384 features, but RandomForestClassifier is expecting 385 features as input.