In [8]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
cd "/content/drive/MyDrive/Colab Notebooks/dlclass_data"

/content/drive/MyDrive/Colab Notebooks/dlclass_data


In [None]:
# Sentence-BERT + SVM — test set accuracy: 0.62475
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from scipy.spatial.distance import cosine, euclidean

# Load one dataset split
def load_data(file_path):
    """Read the tab-separated file at ``file_path`` into a pandas DataFrame."""
    return pd.read_csv(file_path, sep='\t')

# Encode sentences into embedding vectors
def get_sentence_embeddings(model, sentences):
    """Encode ``sentences`` with ``model`` and return the result of ``.numpy()``.

    ``model.encode`` is called with ``convert_to_tensor=True``; the returned
    tensor is moved to the CPU before being converted.
    """
    return model.encode(sentences, convert_to_tensor=True).cpu().numpy()

# Build pairwise features from two embedding matrices
def create_features(embeddings1, embeddings2):
    """Build the feature matrix for sentence pairs from their embeddings.

    Args:
        embeddings1: 2-D array, one row per first sentence.
        embeddings2: 2-D array of the same shape, one row per second sentence.

    Returns:
        A tuple ``(features, cosine_similarity)``:
        ``features`` has the columns of ``|embeddings1 - embeddings2|`` followed
        by three scalar columns (row-wise dot product, cosine similarity,
        Euclidean distance); ``cosine_similarity`` is the 1-D array of row-wise
        cosine similarities (also stored in ``features``).
    """
    embeddings1 = np.asarray(embeddings1)
    embeddings2 = np.asarray(embeddings2)

    # Element-wise absolute difference of the two embeddings.
    delta = embeddings1 - embeddings2
    diff = np.abs(delta)
    # Row-wise dot product (one scalar per pair).
    dot_product = np.sum(embeddings1 * embeddings2, axis=1)

    # Row-wise cosine similarity, vectorised instead of a per-row Python loop.
    # Pairs containing a zero-norm embedding get similarity 0 rather than NaN,
    # so a degenerate row cannot poison the downstream scaler/classifier.
    norm_product = (
        np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
    ).astype(np.float64)
    cosine_similarity = np.zeros(norm_product.shape, dtype=np.float64)
    np.divide(dot_product, norm_product, out=cosine_similarity, where=norm_product > 0)
    # Rounding can push the ratio marginally outside [-1, 1].
    np.clip(cosine_similarity, -1.0, 1.0, out=cosine_similarity)

    # Row-wise Euclidean distance, reusing the difference computed above.
    euclidean_distance = np.linalg.norm(delta, axis=1)

    features = np.column_stack((diff, dot_product, cosine_similarity, euclidean_distance))
    return features, cosine_similarity  # features plus the cosine column on its own

# Entry point: Sentence-BERT pair features + RBF SVM
def main():
    """Train an RBF-kernel SVM on Sentence-BERT pair features and print results.

    Loads the train/dev/test TSV splits, embeds their ``sentence1`` and
    ``sentence2`` columns with ``paraphrase-MiniLM-L6-v2``, builds pair
    features with ``create_features``, standardises them with a scaler fitted
    on the training split only, fits the classifier on the ``label`` column,
    then prints every test prediction followed by classification reports for
    the dev and test splits and the test accuracy.
    """
    # Load the three splits.
    train_data = load_data('/content/drive/MyDrive/Colab Notebooks/dlclass_data/train.tsv')
    dev_data = load_data('/content/drive/MyDrive/Colab Notebooks/dlclass_data/dev.tsv')
    test_data = load_data('/content/drive/MyDrive/Colab Notebooks/dlclass_data/test.tsv')

    # Load the Sentence-BERT encoder.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Embed both sentences of every pair in the training split.
    train_embeddings1 = get_sentence_embeddings(model, train_data['sentence1'].tolist())
    train_embeddings2 = get_sentence_embeddings(model, train_data['sentence2'].tolist())

    # Same for the dev and test splits.
    dev_embeddings1 = get_sentence_embeddings(model, dev_data['sentence1'].tolist())
    dev_embeddings2 = get_sentence_embeddings(model, dev_data['sentence2'].tolist())
    test_embeddings1 = get_sentence_embeddings(model, test_data['sentence1'].tolist())
    test_embeddings2 = get_sentence_embeddings(model, test_data['sentence2'].tolist())

    # Training features and labels (the cosine column is not needed here).
    X_train, _ = create_features(train_embeddings1, train_embeddings2)
    y_train = train_data['label'].values

    # Dev and test features and labels.
    X_dev, _ = create_features(dev_embeddings1, dev_embeddings2)
    y_dev = dev_data['label'].values

    # Keep the test cosine similarities for the per-example printout below.
    X_test, test_cosine_similarity = create_features(test_embeddings1, test_embeddings2)
    y_test = test_data['label'].values

    # Standardise using statistics from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)
    X_test = scaler.transform(X_test)

    # Train the SVM. probability=True is deliberately not set: only predict()
    # is used below, and enabling probability estimates makes fit() run an
    # additional internal cross-validation whose result would never be read.
    svm = SVC(kernel='rbf', C=1.0, class_weight='balanced')
    svm.fit(X_train, y_train)

    # Predict on the dev and test splits.
    y_dev_pred = svm.predict(X_dev)
    y_test_pred = svm.predict(X_test)

    # Print every test pair with its true label, prediction and cosine similarity.
    print("\nTest set predictions:")
    for idx in range(len(test_data)):
        print(f"Sentence 1: {test_data['sentence1'].iloc[idx]}")
        print(f"Sentence 2: {test_data['sentence2'].iloc[idx]}")
        print(f"True label: {y_test[idx]}")
        print(f"Predicted label: {y_test_pred[idx]}")
        print(f"Cosine similarity: {test_cosine_similarity[idx]:.4f}")
        print("-" * 50)

    # Dev-split metrics.
    print("Validation set performance:")
    print(classification_report(y_dev, y_dev_pred))

    # Test-split metrics.
    print("\nTest set performance:")
    print(classification_report(y_test, y_test_pred))
    print("Test set accuracy:", accuracy_score(y_test, y_test_pred))

# Run the pipeline when this cell (or script) is executed directly.
if __name__ == '__main__':
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Sentence-BERT + XGBoost — test set accuracy: 0.625625
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from scipy.spatial.distance import cosine, euclidean

# Load one dataset split
def load_data(file_path):
    """Read the tab-separated file at ``file_path`` into a pandas DataFrame."""
    return pd.read_csv(file_path, sep='\t')

# Encode sentences into embedding vectors
def get_sentence_embeddings(model, sentences):
    """Encode ``sentences`` with ``model`` and return the result of ``.numpy()``.

    ``model.encode`` is called with ``convert_to_tensor=True``; the returned
    tensor is moved to the CPU before being converted.
    """
    return model.encode(sentences, convert_to_tensor=True).cpu().numpy()

# Build the extended pairwise feature set
def create_features(embeddings1, embeddings2):
    """Build the feature matrix for sentence pairs from their embeddings.

    Args:
        embeddings1: 2-D array, one row per first sentence.
        embeddings2: 2-D array of the same shape, one row per second sentence.

    Returns:
        2-D array whose columns are ``|embeddings1 - embeddings2|`` followed by
        three scalar columns: row-wise dot product, cosine similarity and
        Euclidean distance.
    """
    embeddings1 = np.asarray(embeddings1)
    embeddings2 = np.asarray(embeddings2)

    # Element-wise absolute difference of the two embeddings.
    delta = embeddings1 - embeddings2
    diff = np.abs(delta)
    # Row-wise dot product: a single scalar per pair, not an element-wise product.
    dot_product = np.sum(embeddings1 * embeddings2, axis=1)

    # Row-wise cosine similarity, vectorised instead of a per-row Python loop.
    # Pairs containing a zero-norm embedding get similarity 0 rather than NaN,
    # so a degenerate row cannot poison the downstream scaler/classifier.
    norm_product = (
        np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
    ).astype(np.float64)
    cosine_similarity = np.zeros(norm_product.shape, dtype=np.float64)
    np.divide(dot_product, norm_product, out=cosine_similarity, where=norm_product > 0)
    # Rounding can push the ratio marginally outside [-1, 1].
    np.clip(cosine_similarity, -1.0, 1.0, out=cosine_similarity)

    # Row-wise Euclidean distance, reusing the difference computed above.
    euclidean_distance = np.linalg.norm(delta, axis=1)

    # Column layout: |emb1 - emb2|, dot product, cosine similarity, Euclidean distance.
    return np.column_stack((diff, dot_product, cosine_similarity, euclidean_distance))

# Entry point: Sentence-BERT pair features + XGBoost
def main():
    """Train an XGBoost classifier on Sentence-BERT pair features and print results.

    Reads ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` from the working
    directory, embeds the ``sentence1`` and ``sentence2`` columns with
    ``all-mpnet-base-v2``, builds features with ``create_features``,
    standardises them with a scaler fitted on the training split, fits the
    classifier on the ``label`` column, then prints every test prediction
    followed by classification reports for the dev and test splits and the
    test accuracy.
    """
    # Read the three splits, in train / dev / test order.
    train_data, dev_data, test_data = (
        load_data(path) for path in ('train.tsv', 'dev.tsv', 'test.tsv')
    )
    frames = (train_data, dev_data, test_data)

    # Pre-trained sentence encoder.
    encoder = SentenceTransformer('all-mpnet-base-v2')

    # Embed both sentence columns of every split before building any features.
    embedded_pairs = [
        (
            get_sentence_embeddings(encoder, frame['sentence1'].tolist()),
            get_sentence_embeddings(encoder, frame['sentence2'].tolist()),
        )
        for frame in frames
    ]

    # Feature matrices and label vectors per split.
    X_train, X_dev, X_test = (
        create_features(first, second) for first, second in embedded_pairs
    )
    y_train, y_dev, y_test = (frame['label'].values for frame in frames)

    # Standardise with statistics taken from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev, X_test = scaler.transform(X_dev), scaler.transform(X_test)

    # Weight the positive class by the negative/positive count ratio.
    negatives = len(y_train[y_train == 0])
    positives = len(y_train[y_train == 1])
    xgb = XGBClassifier(
        scale_pos_weight=negatives / positives,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
    )
    xgb.fit(X_train, y_train)

    # Predictions for the dev split, then the test split.
    y_dev_pred = xgb.predict(X_dev)
    y_test_pred = xgb.predict(X_test)

    # Dump every test pair with its true and predicted label.
    print("\nTest set predictions:")
    divider = "-" * 50
    rows = zip(test_data['sentence1'], test_data['sentence2'], y_test, y_test_pred)
    for first_sentence, second_sentence, truth, guess in rows:
        print(f"Sentence 1: {first_sentence}")
        print(f"Sentence 2: {second_sentence}")
        print(f"True label: {truth}")
        print(f"Predicted label: {guess}")
        print(divider)

    # Dev-split metrics.
    print("Validation set performance:")
    print(classification_report(y_dev, y_dev_pred))

    # Test-split metrics.
    print("\nTest set performance:")
    print(classification_report(y_test, y_test_pred))
    print("Test set accuracy:", accuracy_score(y_test, y_test_pred))

# Run the pipeline when this cell (or script) is executed directly.
if __name__ == '__main__':
    main()


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
True label: 1
Predicted label: 1
--------------------------------------------------
Sentence 1: They consist of nearly 30 expatriates , some members of the US Communist Party , while others are English writers or left-wing teachers .
Sentence 2: They consist of nearly 30 expatriates , some members of the US Communist Party while others are English writers or leftist teachers .
True label: 1
Predicted label: 0
--------------------------------------------------
Sentence 1: The grass is growing in the Masirah Channel , a waterway between Oman and Masirah Island on the mainland , where it is an important food for the green sea turtle .
Sentence 2: The grass grows in the Masirah Channel , a waterway between Masirah Island and mainland Oman , where it is an important food for the green sea turtle .
True label: 0
Predicted label: 1
--------------------------------------------------
Sentence 1: Panaon Island is a small island in the Philippines , in the

In [None]:
# Sentence-BERT + SVM — test set accuracy: 0.631375
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load one dataset split
def load_data(file_path):
    """Read the tab-separated file at ``file_path`` into a pandas DataFrame."""
    return pd.read_csv(file_path, sep='\t')

# Encode sentences into embedding vectors
def get_sentence_embeddings(model, sentences):
    """Encode ``sentences`` with ``model`` and return the result of ``.numpy()``.

    ``model.encode`` is called with ``convert_to_tensor=True``; the returned
    tensor is moved to the CPU before being converted.
    """
    return model.encode(sentences, convert_to_tensor=True).cpu().numpy()

# Pair features: absolute embedding difference plus row-wise dot product
def create_features(embeddings1, embeddings2):
    """Combine two embedding matrices into one feature matrix.

    The result holds the columns of ``|embeddings1 - embeddings2|`` followed by
    one extra column containing the row-wise dot product of the two inputs.
    Both arguments are reduced along ``axis=1``, so they are expected to be
    2-D arrays of matching shape.
    """
    inner = (embeddings1 * embeddings2).sum(axis=1)
    gap = np.abs(embeddings1 - embeddings2)
    return np.column_stack((gap, inner))

# Entry point: Sentence-BERT pair features + RBF SVM
def main():
    """Train an RBF-kernel SVM on Sentence-BERT pair features and print results.

    Reads ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` from the working
    directory, embeds the ``sentence1`` and ``sentence2`` columns with
    ``paraphrase-MiniLM-L6-v2``, builds features with ``create_features``,
    standardises them with a scaler fitted on the training split, fits the
    classifier on the ``label`` column, then prints every test prediction
    followed by a classification report and accuracy for the dev split and
    for the test split.
    """
    # Load the three splits.
    train_data = load_data('train.tsv')
    dev_data = load_data('dev.tsv')
    test_data = load_data('test.tsv')

    # Load the pre-trained Sentence-BERT encoder.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # Embed both sentences of every pair in the training split.
    train_embeddings1 = get_sentence_embeddings(model, train_data['sentence1'].tolist())
    train_embeddings2 = get_sentence_embeddings(model, train_data['sentence2'].tolist())

    # Same for the dev and test splits.
    dev_embeddings1 = get_sentence_embeddings(model, dev_data['sentence1'].tolist())
    dev_embeddings2 = get_sentence_embeddings(model, dev_data['sentence2'].tolist())
    test_embeddings1 = get_sentence_embeddings(model, test_data['sentence1'].tolist())
    test_embeddings2 = get_sentence_embeddings(model, test_data['sentence2'].tolist())

    # Training features and labels.
    X_train = create_features(train_embeddings1, train_embeddings2)
    y_train = train_data['label'].values

    # Dev and test features and labels.
    X_dev = create_features(dev_embeddings1, dev_embeddings2)
    y_dev = dev_data['label'].values

    X_test = create_features(test_embeddings1, test_embeddings2)
    y_test = test_data['label'].values

    # Standardise using statistics from the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)
    X_test = scaler.transform(X_test)

    # Train the SVM.
    svm = SVC(kernel='rbf', C=1, gamma='scale')
    svm.fit(X_train, y_train)

    # Predict on the dev and test splits.
    y_dev_pred = svm.predict(X_dev)
    y_test_pred = svm.predict(X_test)

    # Print every test pair with its true and predicted label.
    print("\nTest set predictions:")
    for idx in range(len(test_data)):
        print(f"Sentence 1: {test_data['sentence1'].iloc[idx]}")
        print(f"Sentence 2: {test_data['sentence2'].iloc[idx]}")
        print(f"True label: {y_test[idx]}")
        print(f"Predicted label: {y_test_pred[idx]}")
        print("-" * 50)

    # Dev-split metrics. The accuracy must be computed from the dev labels and
    # dev predictions; it previously reused the test arrays, so the line
    # labelled "Dev set accuracy" actually reported the test accuracy.
    print("Validation set performance:")
    print(classification_report(y_dev, y_dev_pred))
    print("Dev set accuracy:", accuracy_score(y_dev, y_dev_pred))

    # Test-split metrics.
    print("\nTest set performance:")
    print(classification_report(y_test, y_test_pred))
    print("Test set accuracy:", accuracy_score(y_test, y_test_pred))

# Run the pipeline when this cell (or script) is executed directly.
if __name__ == '__main__':
    main()


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
True label: 1
Predicted label: 1
--------------------------------------------------
Sentence 1: They consist of nearly 30 expatriates , some members of the US Communist Party , while others are English writers or left-wing teachers .
Sentence 2: They consist of nearly 30 expatriates , some members of the US Communist Party while others are English writers or leftist teachers .
True label: 1
Predicted label: 0
--------------------------------------------------
Sentence 1: The grass is growing in the Masirah Channel , a waterway between Oman and Masirah Island on the mainland , where it is an important food for the green sea turtle .
Sentence 2: The grass grows in the Masirah Channel , a waterway between Masirah Island and mainland Oman , where it is an important food for the green sea turtle .
True label: 0
Predicted label: 0
--------------------------------------------------
Sentence 1: Panaon Island is a small island in the Philippines , in the