In [21]:
from sentence_transformers.evaluation import LabelAccuracyEvaluator

from sentence_transformers import SentenceTransformer,  InputExample, losses, models, util
from transformers import AutoTokenizer
import pandas as pd
import os
from torch.utils.data import DataLoader
import logging
import math
from datetime import datetime
from sklearn.model_selection import train_test_split
import random
import  re
import  yaml
import torch
import numpy as np


In [22]:
# Checkpoint used for both the transformer backbone and the tokenizer below.
model_name = 'sentence-transformers/all-MiniLM-L12-v2'

# Transformer module built from the checkpoint above.
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Full sentence encoder: transformer followed by the pooling layer.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# NOTE(review): this message mentions STSbenchmark, but no dataset is read in
# this cell -- confirm whether the message is still wanted.
logging.info("Read STSbenchmark train dataset")

# Load the tokenizer from the same checkpoint as the model. It was previously
# hard-coded to the L6 variant, which could silently diverge from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:


# --- Training hyperparameters ------------------------------------------------
# (presumably consumed by a training cell outside this section -- confirm)
num_epochs = 6
train_batch_size = 32

# --- Data selection limits ---------------------------------------------------
# Passed to the split functions as `max_pairs_size`: files with at least this
# many rows are randomly down-sampled to exactly this many rows.
csv_pair_size_limit = 100
# Passed to the split functions as `max_distance`: rows with a larger
# `distance` value are dropped.
distance_limit = 10
# Only referenced by the (currently commented-out) negative sampling code.
negative_sample_num = 1
split_method_index = 1

# Device selection is currently disabled:
# CUDA_num = int(input("please input CUDA number "))
# device = torch.device(f"cuda:{CUDA_num}")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [24]:
# Directory holding the per-debate CSV files of argument pairs.
debates_path = '/Users/fanzhe/Desktop/master_thesis/Data/kialo_debatetree_data/csv_sample_nofilter'

# Output directory for the trained model: the model name (with "/" replaced by
# "-") plus a timestamp taken when this cell runs.
model_save_path = '/Users/fanzhe/Desktop/master_thesis/Data/model_ouput/training_stsbenchmark_{}-{}'.format(
    model_name.replace("/", "-"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)

# List every entry in the debates folder ...
all_files = os.listdir(debates_path)

# ... and keep only the CSV files.
csv_files = list(filter(lambda entry: entry.endswith('.csv'), all_files))

In [25]:
def skip_argument(text):
    """Tell whether an argument text should be skipped.

    A text is skipped when it starts with a cross-reference of the form
    ``-> See 1.2.3.`` (the match is anchored at the beginning of the text and
    requires at least two dot-separated numbers).

    Non-string values are also reported as "skip": pandas yields a float NaN
    for an empty CSV cell, and passing that to ``re.match`` raises
    ``TypeError``. Such a value cannot serve as sentence text anyway.

    Args:
        text: The argument text, normally a ``str``.

    Returns:
        bool: ``True`` if the text should be skipped, ``False`` otherwise.
    """
    # Missing / non-text content (None, NaN, numbers) is never usable.
    if not isinstance(text, str):
        return True

    # Cross-reference marker: "-> See <n>.<n>[.<n>...]."
    pattern = r"-> See \d+(\.\d+)+\."

    # re.match only matches at the start of the string.
    return re.match(pattern, text) is not None




In [26]:
def _polarity_examples_from_frame(df, file_path, max_distance, zero_distance_paths):
    """Turn the rows of one debate DataFrame into polarity InputExamples.

    Rows with distance 0 add `file_path` to `zero_distance_paths` (one entry
    per such row) and produce no example.
    """
    examples = []
    for _, row in df.iterrows():
        distance = float(row['distance'])

        if distance == 0:
            zero_distance_paths.append(file_path)
            continue
        # Written as "not <=" so that a NaN distance is rejected as well.
        if not (distance <= max_distance):
            continue
        if skip_argument(row['content_1']) or skip_argument(row['content_2']):
            continue

        consistency = float(row['polarity_consistency'])
        if consistency == 1:
            label = 1
        elif consistency == -1 and float(row['polarity_1']) != 0:
            label = 0
        else:
            # Includes polarity_consistency == -1 with polarity_1 == 0, which
            # is deliberately left out of the data set.
            continue

        examples.append(InputExample(texts=[row['content_1'], row['content_2']], label=label))
    return examples


def polarity_split_method_1(max_pairs_size, max_distance):
    """Build shuffled, binary-labelled sentence pairs from the debate CSVs.

    Every ``.csv`` file in the module-level ``debates_path`` is read once.
    Files with at least ``max_pairs_size`` rows are randomly down-sampled to
    exactly ``max_pairs_size`` rows; smaller files are used in full.

    A row yields an example only if its ``distance`` is non-zero and not
    greater than ``max_distance``, and neither ``content_1`` nor ``content_2``
    is rejected by ``skip_argument``. The label is:

    * ``1`` when ``polarity_consistency == 1``;
    * ``0`` when ``polarity_consistency == -1`` and ``polarity_1 != 0``;
    * otherwise no example is produced.

    Side effect: writes ``files_has_0_distance.txt`` in the current working
    directory with one line per zero-distance row (the path of its file).
    The list is accumulated across *all* files; it used to be reset for each
    file, which kept only the last file's entries and raised ``NameError``
    when the directory contained no CSV file.

    Args:
        max_pairs_size: Maximum number of rows taken from a single CSV file.
        max_distance: Largest ``distance`` value a row may have to be used.

    Returns:
        list: ``InputExample`` objects in random order.
    """
    samples = []
    zero_distance_paths = []

    for file in os.listdir(debates_path):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(debates_path, file)
        # Read each file once; the row count decides whether to down-sample.
        df = pd.read_csv(file_path)
        if len(df) >= max_pairs_size:
            df = df.sample(n=max_pairs_size)

        samples.extend(
            _polarity_examples_from_frame(df, file_path, max_distance, zero_distance_paths)
        )

    # The 'with' statement guarantees the log file is closed afterwards.
    with open("files_has_0_distance.txt", "w") as log_file:
        for path in zero_distance_paths:
            log_file.write(path + "\n")

    # The final shuffle makes the order in which files were processed
    # irrelevant to the returned list.
    random.shuffle(samples)
    return samples

In [27]:
def _relatedness_examples_from_frame(df, file_path, max_distance, zero_distance_paths):
    """Turn the rows of one debate DataFrame into relatedness InputExamples.

    Rows with distance 0 add `file_path` to `zero_distance_paths` (one entry
    per such row), are reported on stdout, and produce no example.
    """
    examples = []
    for _, row in df.iterrows():
        distance = float(row['distance'])

        if distance == 0:
            zero_distance_paths.append(file_path)
            print(file_path, row['distance'])
            continue
        # Written as "not <=" so that a NaN distance is rejected as well.
        if not (distance <= max_distance):
            continue
        if skip_argument(row['content_1']) or skip_argument(row['content_2']):
            continue

        # Inverse distance as relatedness score; this lies in (0, 1] provided
        # distances are >= 1 -- TODO confirm against the CSV data.
        score = 1 / distance
        examples.append(InputExample(texts=[row['content_1'], row['content_2']], label=score))
    return examples


def relatedness_split_method_1(max_pairs_size, max_distance):
    """Build shuffled sentence pairs labelled with an inverse-distance score.

    Every ``.csv`` file in the module-level ``debates_path`` is read once.
    Files with at least ``max_pairs_size`` rows are randomly down-sampled to
    exactly ``max_pairs_size`` rows; smaller files are used in full.

    A row yields an example only if its ``distance`` is non-zero and not
    greater than ``max_distance``, and neither ``content_1`` nor ``content_2``
    is rejected by ``skip_argument``. The label is ``1 / distance``.

    Side effects:
        * prints the file path and distance of every zero-distance row;
        * writes ``files_has_0_distance.txt`` in the current working directory
          with one line per zero-distance row (the path of its file). The
          list is accumulated across *all* files; it used to be reset for each
          file, which kept only the last file's entries and raised
          ``NameError`` when the directory contained no CSV file.

    NOTE: random negative sampling (pairing arguments from different files
    with label 0.0) is currently disabled, so the bookkeeping list that only
    fed it has been removed.

    Args:
        max_pairs_size: Maximum number of rows taken from a single CSV file.
        max_distance: Largest ``distance`` value a row may have to be used.

    Returns:
        list: ``InputExample`` objects in random order.
    """
    samples = []
    zero_distance_paths = []

    for file in os.listdir(debates_path):
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(debates_path, file)
        # Read each file once; the row count decides whether to down-sample.
        df = pd.read_csv(file_path)
        if len(df) >= max_pairs_size:
            df = df.sample(n=max_pairs_size)

        samples.extend(
            _relatedness_examples_from_frame(df, file_path, max_distance, zero_distance_paths)
        )

    # The 'with' statement guarantees the log file is closed afterwards.
    with open("files_has_0_distance.txt", "w") as log_file:
        for path in zero_distance_paths:
            log_file.write(path + "\n")

    # The final shuffle makes the order in which files were processed
    # irrelevant to the returned list.
    random.shuffle(samples)
    return samples

In [29]:
def _print_sample_similarities(samples, label_name):
    """Print each pair, its cosine similarity under `model`, and its label.

    Args:
        samples: Iterable of examples exposing `.texts` (two sentences) and
            `.label`.
        label_name: Prefix used on the label line, e.g. "Polarity".
    """
    for sample in samples:
        sentence1, sentence2 = sample.texts[0], sample.texts[1]
        embeddings = model.encode([sentence1, sentence2], convert_to_tensor=True)

        cosine_similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
        print(f"Content 1: {sentence1}")
        print(f"Content 2: {sentence2}")
        print(f"cosine_similarity:{cosine_similarity}")
        print(f"{label_name} Label: {sample.label}")
        print("-" * 30)


# Build both data sets from the debate CSV files.
polarity_sample_collection = polarity_split_method_1(csv_pair_size_limit, distance_limit)
relatedness_sample_collection = relatedness_split_method_1(csv_pair_size_limit, distance_limit)

# Preview up to 3 random examples of each kind. random.sample raises
# ValueError when asked for more items than the population holds, so the
# request is clamped to the collection size.
polarity_selected_samples = random.sample(
    polarity_sample_collection, min(3, len(polarity_sample_collection))
)
relatedness_selected_samples = random.sample(
    relatedness_sample_collection, min(3, len(relatedness_sample_collection))
)

_print_sample_similarities(polarity_selected_samples, "Polarity")
_print_sample_similarities(relatedness_selected_samples, "Relatedness")
    

Content 1: Parents and students alike may not feel comfortable having their data potentially leaked or shared.
Content 2: If AI is able to manage many of the responsibilities of teachers, it may no longer be necessary to pay teachers as much for their work, as they will not be so essential.
cosine_similarity:0.1712367832660675
Polarity Label: 1
------------------------------
Content 1: Students have more educational flexibility in VR \(such as field trips in VR or reality\) that help students more than school
Content 2: VR adds more variety to learning.
cosine_similarity:0.7631800770759583
Polarity Label: 0
------------------------------
Content 1: A free kick awarded on this basis could lead to a counterattack.  The shift in formation would be far more dramatic than most free kicks.
Content 2: Passes to defenders and to the keeper are rarely intercepted.
cosine_similarity:0.39632412791252136
Polarity Label: 0
------------------------------
Content 1: Working in HR is a viable career o