In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the train / dev / test splits from tab-separated files in the working directory.
train_data = pd.read_csv('train.tsv', sep='\t')
dev_data = pd.read_csv('dev.tsv', sep='\t')
test_data = pd.read_csv('test.tsv', sep='\t')

# Preview the first rows of the training split.
print(train_data.head())

# Longest sentence (in whitespace-separated tokens) across all splits. This only
# fixes the padded input width, so looking at dev/test here does not leak anything.
train_sentences = train_data['sentence1'].tolist() + train_data['sentence2'].tolist()
all_sentences = train_sentences + \
                dev_data['sentence1'].tolist() + dev_data['sentence2'].tolist() + \
                test_data['sentence1'].tolist() + test_data['sentence2'].tolist()

max_length = max(len(s.split()) for s in all_sentences)
print(f'Max Length: {max_length}')

# Text preprocessing: build the vocabulary from the training split only so that
# dev/test text does not influence it; words unseen in training map to '<OOV>'.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Convert sentence pairs into padded index sequences
def preprocess_data(data, max_length):
    """Turn a split into model-ready features and labels.

    Both sentence columns are converted to index sequences with the
    module-level ``tokenizer``, post-padded to ``max_length`` and placed side
    by side, so each row of the result has ``2 * max_length`` entries.

    Args:
        data: table with 'sentence1', 'sentence2' and 'label' columns.
        max_length: padded length of each individual sentence.

    Returns:
        Tuple ``(X, y)``: the horizontally stacked padded sequences and the
        values of the 'label' column.
    """
    padded_halves = [
        pad_sequences(
            tokenizer.texts_to_sequences(data[column]),
            maxlen=max_length,
            padding='post',
        )
        for column in ('sentence1', 'sentence2')
    ]
    features = np.hstack(padded_halves)
    labels = data['label'].values
    return features, labels

# Build padded features and labels for each split, using the same max_length for all of them.
X_train, y_train = preprocess_data(train_data, max_length)
X_dev, y_dev = preprocess_data(dev_data, max_length)
X_test, y_test = preprocess_data(test_data, max_length)

   id                                          sentence1  \
0   1  In Paris , in October 1560 , he secretly met t...   
1   2  The NBA season of 1975 -- 76 was the 30th seas...   
2   3  There are also specific discussions , public p...   
3   4  When comparable rates of flow can be maintaine...   
4   5  It is the seat of Zerendi District in Akmola R...   

                                           sentence2  label  
0  In October 1560 , he secretly met with the Eng...      0  
1  The 1975 -- 76 season of the National Basketba...      1  
2  There are also public discussions , profile sp...      0  
3  The results are high when comparable flow rate...      1  
4  It is the seat of the district of Zerendi in A...      1  
Max Length: 37


In [None]:
def jaccard_similarity(text1, text2):
    """Jaccard similarity between the whitespace-token sets of two texts.

    Args:
        text1: first text.
        text2: second text.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in ``[0, 1]``. When neither text
        contains any token the ratio is undefined; 0.0 is returned in that
        case instead of raising ``ZeroDivisionError``.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())
    union = set1 | set2
    if not union:
        # Both texts are empty or whitespace-only: avoid dividing by zero.
        return 0.0
    return len(set1 & set2) / len(union)

# Compute the Jaccard similarity feature
def compute_jaccard_similarities(data):
    """Jaccard similarity of every sentence pair, as a single-column array.

    Args:
        data: table with 'sentence1' and 'sentence2' columns.

    Returns:
        Array with one row per pair and one column, holding the value of
        ``jaccard_similarity`` for that pair.
    """
    pair_scores = np.array(
        list(map(jaccard_similarity, data['sentence1'], data['sentence2']))
    )
    return pair_scores.reshape(-1, 1)

# Jaccard feature for each split: one row per sentence pair, one column.
jaccard_train = compute_jaccard_similarities(train_data)
jaccard_dev = compute_jaccard_similarities(dev_data)
jaccard_test = compute_jaccard_similarities(test_data)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define the model, with L2 regularization on the convolution and hidden dense kernels.
# Two inputs: a token-id sequence of width 2 * max_length and a single scalar feature.
text_input = Input(shape=(2 * max_length,), name='text_input')
jaccard_input = Input(shape=(1,), name='jaccard_input')

# Text branch: embedding -> 1D convolution (128 filters, kernel size 5)
# -> global max pooling -> dense(64) -> dropout(0.5).
x = Embedding(input_dim=len(word_index) + 1, output_dim=128)(text_input)
x = Conv1D(128, 5, activation='relu', kernel_regularizer=l2(0.001))(x)
x = GlobalMaxPooling1D()(x)
x = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(x)
x = Dropout(0.5)(x)

# The scalar feature is concatenated with the text representation right before
# the single-unit sigmoid output.
combined = Concatenate()([x, jaccard_input])
output = Dense(1, activation='sigmoid')(combined)

model = Model(inputs=[text_input, jaccard_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Early stopping and learning-rate decay, both driven by the validation loss.
# The scheduler's patience must be shorter than early stopping's: with equal
# patience the learning rate is typically reduced on the very epoch at which
# training stops, so the reduced rate is never actually trained with.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=0.0001)

# Train the model, validating on the dev split after every epoch.
history = model.fit([X_train, jaccard_train], y_train,
                    validation_data=([X_dev, jaccard_dev], y_dev),
                    epochs=20, batch_size=64, callbacks=[reduce_lr, early_stopping])

Epoch 1/20
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 119ms/step - accuracy: 0.5572 - loss: 0.7456 - val_accuracy: 0.5650 - val_loss: 0.6881 - learning_rate: 0.0010
Epoch 2/20
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 118ms/step - accuracy: 0.5949 - loss: 0.6788 - val_accuracy: 0.5760 - val_loss: 0.6815 - learning_rate: 0.0010
Epoch 3/20
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 120ms/step - accuracy: 0.6233 - loss: 0.6628 - val_accuracy: 0.5838 - val_loss: 0.6791 - learning_rate: 0.0010
Epoch 4/20
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 121ms/step - accuracy: 0.6430 - loss: 0.6463 - val_accuracy: 0.5845 - val_loss: 0.6741 - learning_rate: 0.0010
Epoch 5/20
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 123ms/step - accuracy: 0.6603 - loss: 0.6231 - val_accuracy: 0.6070 - val_loss: 0.6667 - learning_rate: 0.0010
Epoch 6/20
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━

In [None]:
# Evaluate the model on the test split and print the accuracy as a percentage.
loss, accuracy = model.evaluate([X_test, jaccard_test], y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6066 - loss: 0.6685
Test Accuracy: 60.71%


In [None]:
pip install -U sentence-transformers



In [None]:
!pip install dask-ml

Collecting dask-ml
  Downloading dask_ml-2024.4.4-py3-none-any.whl.metadata (5.9 kB)
Collecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting distributed>=2.4.0 (from dask-ml)
  Downloading distributed-2024.12.1-py3-none-any.whl.metadata (3.3 kB)
Collecting sparse>=0.7.0 (from dask-glm>=0.2.0->dask-ml)
  Downloading sparse-0.15.4-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[array,dataframe]>=2.4.0->dask-ml)
  Downloading dask_expr-1.1.21-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of distributed to determine which version is compatible with other requirements. This could take a while.
Collecting distributed>=2.4.0 (from dask-ml)
  Downloading distributed-2024.12.0-py3-none-any.whl.metadata (3.3 kB)
  Downloading distributed-2024.11.2-py3-none-any.whl.metadata (3.3 kB)
  Downloading distributed-2024.11.1-py3-none-any.whl.metadata (3.3 kB)
  Downloading dis

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
!pip install transformers datasets scikit-learn pandas torch tqdm



In [None]:
pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch
from tqdm import tqdm
import numpy as np

# Load the train / dev / test splits from tab-separated files in the working directory.
train_data = pd.read_csv('train.tsv', sep='\t')
dev_data = pd.read_csv('dev.tsv', sep='\t')
test_data = pd.read_csv('test.tsv', sep='\t')

# Initialize the M3E tokenizer and a sequence-classification model with two labels
# from the "moka-ai/m3e-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("moka-ai/m3e-base")
model = AutoModelForSequenceClassification.from_pretrained("moka-ai/m3e-base", num_labels=2)

# Use a GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenization function for sentence pairs
def preprocess_function(examples):
    """Tokenize the 'sentence1' / 'sentence2' entries of ``examples`` as pairs.

    Args:
        examples: mapping with 'sentence1' and 'sentence2' entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with both
        entries, padding and truncation enabled, and ``max_length=128``.
    """
    first, second = examples['sentence1'], examples['sentence2']
    # max_length can be adjusted to the actual data.
    encode_options = {'padding': True, 'truncation': True, 'max_length': 128}
    return tokenizer(first, second, **encode_options)

# Convert the pandas DataFrames into a Hugging Face DatasetDict with
# 'train', 'validation' and 'test' splits.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_data),
    'validation': Dataset.from_pandas(dev_data),
    'test': Dataset.from_pandas(test_data)
})

# Apply the preprocessing function to every split (called in batched mode).
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training configuration: evaluate once per epoch, 3 epochs, batch size 16.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],  # disable wandb
)

# Metrics callback used during evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax of ``predictions`` over its last axis.

    Returns:
        Dict with keys 'accuracy' and 'f1'.
    """
    predicted = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(pred.label_ids, predicted),
        'f1': f1_score(pred.label_ids, predicted, average='weighted'),
    }

# Fine-tune with the Trainer API.
# NOTE(review): the recorded output shows a warning attributed to this `Trainer(`
# call — presumably about the `tokenizer` argument; confirm against the installed
# transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,  # pass the tokenizer so it is available during data loading
)

# Start training.
trainer.train()

# Evaluate on the test split: predicted class is the argmax over the last axis
# of the returned predictions.
predictions = trainer.predict(tokenized_datasets['test'])
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

print(classification_report(true_labels, pred_labels))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/439k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at moka-ai/m3e-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3193,0.415214,0.86225,0.861649
2,0.2471,0.31282,0.90125,0.901512
3,0.2112,0.382054,0.90825,0.908419


              precision    recall  f1-score   support

           0       0.92      0.89      0.90      4464
           1       0.86      0.90      0.88      3536

    accuracy                           0.89      8000
   macro avg       0.89      0.90      0.89      8000
weighted avg       0.90      0.89      0.90      8000



In [None]:
!pip install matplotlib scikit-learn



In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
from tqdm import tqdm
import os

# Initialize the GTE tokenizer and model. The model is loaded with
# trust_remote_code=True.
# NOTE(review): trust_remote_code presumably runs Python code shipped with the
# model repository — confirm the repository is trusted and consider pinning a revision.
model_name = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Use a GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenization function for sentence pairs
# NOTE(review): no call to this function is visible after this definition — confirm it is still needed.
def preprocess_function(examples):
    """Tokenize the 'sentence1' / 'sentence2' entries of ``examples`` as pairs.

    Args:
        examples: mapping with 'sentence1' and 'sentence2' entries.

    Returns:
        The result of calling the module-level ``tokenizer`` on both entries
        with padding and truncation enabled and ``max_length=128``.
    """
    sentence_pair = (examples['sentence1'], examples['sentence2'])
    # max_length can be adjusted to the actual data.
    return tokenizer(*sentence_pair, padding=True, truncation=True, max_length=128)

# Convert a DataFrame into a list of sentence pairs
def prepare_text_pairs(dataframe):
    """Return the rows of ``dataframe`` as ``(sentence1, sentence2)`` tuples.

    The two columns are zipped directly instead of iterating row by row with
    ``iterrows``, which builds a Series object for every row.

    Args:
        dataframe: table with 'sentence1' and 'sentence2' columns.

    Returns:
        List of 2-tuples in row order; empty when the table has no rows.
    """
    return list(zip(dataframe['sentence1'], dataframe['sentence2']))

# Compute sentence embeddings
def get_embeddings(texts, batch_size=16):
    """Embed ``texts`` in batches with the module-level tokenizer and model.

    Each batch is tokenized (padding, truncation, ``max_length=128``), moved to
    ``device`` and run through ``model`` without gradient tracking. The hidden
    state at position 0 of ``last_hidden_state`` is taken as the sentence
    embedding and L2-normalised.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts per forward pass.

    Returns:
        NumPy array with one row per text, or an empty array when ``texts``
        is empty.
    """
    chunks = []
    for start in tqdm(range(0, len(texts), batch_size)):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            model_out = model(**encoded)

        # Use the [CLS] position as the sentence embedding, then L2-normalise it.
        cls_vectors = F.normalize(model_out.last_hidden_state[:, 0], p=2, dim=1)
        chunks.append(cls_vectors.cpu().numpy())

        # Drop references to the batch tensors and release cached GPU memory.
        del encoded, model_out, cls_vectors
        torch.cuda.empty_cache()

    if not chunks:
        return np.array([])
    return np.vstack(chunks)

# Compute similarity scores and checkpoint them to disk
def get_similarity_scores(text_pairs, batch_size=16, save_path='similarity_scores.npy', index_save_path='index.npy'):
    """Score each ``(sentence1, sentence2)`` pair by the dot product of its embeddings.

    Scores are checkpointed to ``save_path`` together with the number of
    processed pairs in ``index_save_path``. A later call resumes from that
    checkpoint, but only when it is self-consistent: exactly one cached score
    per processed pair, and no more processed pairs than ``text_pairs``
    contains. Any other checkpoint is ignored and everything is recomputed.
    A checkpoint of matching size produced from different data cannot be
    detected; delete the files when the input changes.

    Args:
        text_pairs: list of ``(sentence1, sentence2)`` tuples.
        batch_size: batch size forwarded to ``get_embeddings``.
        save_path: ``.npy`` file holding the scores computed so far.
        index_save_path: ``.npy`` file holding the number of processed pairs.

    Returns:
        1-D NumPy array with one score per pair, in input order.
    """
    all_scores = []
    start_index = 0

    if os.path.exists(save_path) and os.path.exists(index_save_path):
        cached_scores = np.load(save_path)
        cached_index = int(np.load(index_save_path))
        if cached_scores.ndim == 1 and cached_index == len(cached_scores) <= len(text_pairs):
            all_scores = list(cached_scores)
            start_index = cached_index
            print(f"Resuming from index {start_index}")
        else:
            # Stale or corrupt checkpoint (e.g. written for a different dataset):
            # trusting it would misalign scores and labels.
            print("Ignoring inconsistent cached similarity scores; recomputing from scratch.")

    if start_index >= len(text_pairs):
        print("All similarity scores have been computed.")
        return np.array(all_scores)

    sentences1 = [pair[0] for pair in text_pairs[start_index:]]
    sentences2 = [pair[1] for pair in text_pairs[start_index:]]

    embeddings1 = get_embeddings(sentences1, batch_size=batch_size)
    embeddings2 = get_embeddings(sentences2, batch_size=batch_size)

    if embeddings1.size == 0 or embeddings2.size == 0:
        print("No new embeddings to compute.")
        return np.array(all_scores)

    # Row-wise dot product: same values as the diagonal of
    # embeddings1 @ embeddings2.T without materialising the N x N matrix.
    scores = np.sum(embeddings1 * embeddings2, axis=1)
    all_scores.extend(scores)

    # all_scores already contains the resumed prefix, so its length is the
    # number of processed pairs.
    np.save(save_path, np.array(all_scores))
    np.save(index_save_path, len(all_scores))
    print(f"Saved intermediate results up to index {len(all_scores)}")

    return np.array(all_scores)

# Evaluate thresholded similarity scores against the true labels
def evaluate_model(scores, true_labels, similarity_threshold=0.5):
    """Threshold ``scores`` into binary predictions and print evaluation metrics.

    Args:
        scores: NumPy array of similarity scores.
        true_labels: ground-truth labels, one per score.
        similarity_threshold: scores strictly greater than this value are
            predicted as class 1, all others as class 0.

    Raises:
        ValueError: if the number of scores differs from the number of labels.
    """
    predicted = (scores > similarity_threshold).astype(int)

    if len(predicted) != len(true_labels):
        raise ValueError("The number of predicted labels does not match the number of true labels.")

    # 'binary' averaging because this is a two-class problem.
    accuracy = accuracy_score(true_labels, predicted)
    f1 = f1_score(true_labels, predicted, average='binary')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Detailed per-class report.
    print(classification_report(true_labels, predicted))

# Load the test split.
test_data = pd.read_csv('test.tsv', sep='\t')

# Build the list of (sentence1, sentence2) pairs.
test_text_pairs = prepare_text_pairs(test_data)

# Compute (or resume from the on-disk checkpoint) one similarity score per pair.
scores = get_similarity_scores(test_text_pairs, batch_size=16, save_path='similarity_scores.npy', index_save_path='index.npy')

true_labels = test_data['label'].values.astype(int)

# Every test row must have exactly one score. A mismatch means the scores do not
# belong to this dataset (typically a stale checkpoint); truncating the labels
# to fit would evaluate misaligned data, so fail loudly instead.
remaining_samples = len(test_data) - len(scores)
if remaining_samples != 0:
    raise RuntimeError(
        f"Expected {len(test_data)} similarity scores but got {len(scores)}; "
        "the cached 'similarity_scores.npy' / 'index.npy' files are probably stale. "
        "Delete them and re-run this cell."
    )

evaluate_model(scores, true_labels)

Resuming from index 10000
All similarity scores have been computed.
Handling remaining samples: 6500
Accuracy: 0.5040
F1 Score: 0.3597
              precision    recall  f1-score   support

           0       0.55      0.65      0.60       836
           1       0.42      0.31      0.36       664

    accuracy                           0.50      1500
   macro avg       0.48      0.48      0.48      1500
weighted avg       0.49      0.50      0.49      1500

