In [None]:
# Print the NVIDIA driver / GPU status for this runtime.
!nvidia-smi
# Mount Google Drive under /content/drive, then change into the working folder
# on the drive and list its contents.
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/My Drive/Colab"
%ls


Tue Oct 29 11:14:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

# Read the three tab-separated dataset splits.
train_df = pd.read_csv('/content/train.tsv', delimiter='\t')
dev_df = pd.read_csv('/content/dev.tsv', delimiter='\t')
test_df = pd.read_csv('/content/test.tsv', delimiter='\t')

# Fetch the pretrained tokenizer and encoder, then place the encoder on the
# GPU when one is available (CPU otherwise).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Encode texts with BERT and return one vector per text
def get_bert_embeddings(texts):
    """Return the first-token ([CLS]) hidden state for every text in ``texts``.

    Args:
        texts: list of strings to encode; tokenized together with padding and
            truncation to at most 128 tokens.

    Returns:
        A tensor with one row per input text, taken from position 0 of the
        encoder's last hidden state. It stays on the encoder's device.
    """
    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Move every input tensor to the device the encoder lives on (GPU or CPU).
    encoded = encoded.to(model.device)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Position 0 holds the [CLS] token, used here as the sentence representation.
    return hidden_states[:, 0]

# Embed both sentences of a pair
def get_embeddings_pair(sentence1, sentence2):
    """Embed each sentence separately and return ``(emb1, emb2)``.

    Each sentence is passed to ``get_bert_embeddings`` as a one-element list,
    so each returned tensor has a single row.
    """
    return tuple(get_bert_embeddings([sentence]) for sentence in (sentence1, sentence2))

# Cosine similarity between two embedding tensors
def cosine_sim(emb1, emb2):
    """Return the cosine similarity between the first rows of two embedding tensors.

    Both tensors are copied to the CPU and converted to NumPy arrays before
    scikit-learn's pairwise ``cosine_similarity`` is applied; only entry
    ``[0, 0]`` of the resulting matrix is returned.
    """
    left = emb1.cpu().numpy()
    right = emb2.cpu().numpy()
    similarity_matrix = cosine_similarity(left, right)
    return similarity_matrix[0, 0]

# Build the feature matrix and label list
def build_features(df, batch_size=32):
    """Turn a DataFrame of sentence pairs into cosine-similarity features.

    The sentences are embedded in mini-batches instead of one row at a time,
    so the encoder runs ``2 * ceil(len(df) / batch_size)`` forward passes
    rather than ``2 * len(df)``.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        batch_size: number of sentence pairs embedded per forward pass.

    Returns:
        ``(X, y)`` where ``X`` is a list of one-element lists
        ``[cosine_similarity]`` (one per row, in row order) and ``y`` is the
        list of labels.
    """
    first_sentences = df['sentence1'].tolist()
    second_sentences = df['sentence2'].tolist()

    X = []
    for start in range(0, len(first_sentences), batch_size):
        stop = start + batch_size
        emb1 = get_bert_embeddings(first_sentences[start:stop])
        emb2 = get_bert_embeddings(second_sentences[start:stop])
        # Row-wise cosine similarity between the paired sentence embeddings.
        sims = torch.nn.functional.cosine_similarity(emb1, emb2, dim=1)
        X.extend([score] for score in sims.cpu().tolist())

    y = df['label'].tolist()
    return X, y

# Build the similarity features for the training and development sets.
X_train, y_train = build_features(train_df)
X_dev, y_dev = build_features(dev_df)

# Train the classifier (logistic regression) on the similarity feature.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Accuracy on the training set.
y_train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.4f}')

# Show only a short preview of true vs. predicted labels: printing one line
# per training example floods the cell output until the notebook truncates it.
preview_rows = 20
for i in range(min(preview_rows, len(y_train))):
    print(f'Index: {i}, True Label: {y_train[i]}, Predicted Label: {y_train_pred[i]}')

# Evaluate on the development set. zero_division=0 reports 0.00 for a class
# that is never predicted without emitting undefined-metric warnings.
y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred, zero_division=0))
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f'Development Accuracy: {dev_accuracy:.4f}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Index: 44401, True Label: 0, Predicted Label: 0
Index: 44402, True Label: 1, Predicted Label: 0
Index: 44403, True Label: 0, Predicted Label: 0
Index: 44404, True Label: 0, Predicted Label: 0
Index: 44405, True Label: 0, Predicted Label: 0
Index: 44406, True Label: 1, Predicted Label: 0
Index: 44407, True Label: 1, Predicted Label: 0
Index: 44408, True Label: 0, Predicted Label: 0
Index: 44409, True Label: 0, Predicted Label: 0
Index: 44410, True Label: 1, Predicted Label: 0
Index: 44411, True Label: 1, Predicted Label: 0
Index: 44412, True Label: 1, Predicted Label: 0
Index: 44413, True Label: 1, Predicted Label: 0
Index: 44414, True Label: 1, Predicted Label: 0
Index: 44415, True Label: 1, Predicted Label: 0
Index: 44416, True Label: 0, Predicted Label: 0
Index: 44417, True Label: 0, Predicted Label: 0
Index: 44418, True Label: 0, Predicted Label: 0
Index: 44419, True Label: 0, Predicted Label: 0
Index: 44420, True Label: 1, Predicted Label: 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72      4461
           1       0.00      0.00      0.00      3539

    accuracy                           0.56      8000
   macro avg       0.28      0.50      0.36      8000
weighted avg       0.31      0.56      0.40      8000

Development Accuracy: 0.5576


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Read the three tab-separated dataset splits.
train_df = pd.read_csv('/content/train.tsv', delimiter='\t')
dev_df = pd.read_csv('/content/dev.tsv', delimiter='\t')
test_df = pd.read_csv('/content/test.tsv', delimiter='\t')

# Peek at the first rows to check the column layout.
print(train_df.head(5))

# Fetch the pretrained tokenizer and encoder, then place the encoder on the
# GPU when one is available (CPU otherwise).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Encode texts with BERT and return one vector per text
def get_bert_embeddings(texts):
    """Return the first-token ([CLS]) hidden state for every text in ``texts``.

    Args:
        texts: list of strings to encode; tokenized together with padding and
            truncation to at most 128 tokens.

    Returns:
        A tensor with one row per input text, taken from position 0 of the
        encoder's last hidden state. It stays on the encoder's device.
    """
    batch = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    # Move every input tensor to the device the encoder lives on (GPU or CPU).
    batch = batch.to(model.device)
    with torch.no_grad():
        last_hidden = model(**batch).last_hidden_state
    # Position 0 holds the [CLS] token, used here as the sentence representation.
    cls_vectors = last_hidden[:, 0]
    return cls_vectors

# Embed both sentences of a pair
def get_embeddings_pair(sentence1, sentence2):
    """Embed each sentence separately and return ``(emb1, emb2)``.

    Each sentence is passed to ``get_bert_embeddings`` as a one-element list,
    so each returned tensor has a single row.
    """
    first, second = (get_bert_embeddings([text]) for text in (sentence1, sentence2))
    return first, second

# Build the feature matrix and label array
def build_features(df, batch_size=32):
    """Turn a DataFrame of sentence pairs into concatenated-embedding features.

    Each row's feature vector is the embedding of ``sentence1`` followed by
    the embedding of ``sentence2``. Sentences are embedded in mini-batches
    instead of one row at a time, and the cosine similarity that was computed
    but never used has been dropped.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        batch_size: number of sentence pairs embedded per forward pass.

    Returns:
        ``(X, y)`` as NumPy arrays: ``X`` has one row per DataFrame row and
        ``y`` holds the labels in the same order.
    """
    first_sentences = df['sentence1'].tolist()
    second_sentences = df['sentence2'].tolist()

    batches = []
    for start in range(0, len(first_sentences), batch_size):
        stop = start + batch_size
        emb1 = get_bert_embeddings(first_sentences[start:stop])
        emb2 = get_bert_embeddings(second_sentences[start:stop])
        # Concatenate on the device, then transfer each batch to the CPU once.
        batches.append(torch.cat((emb1, emb2), dim=1).cpu().numpy())

    # An empty DataFrame yields an empty array, as the row-by-row version did.
    X = np.concatenate(batches) if batches else np.array([])
    y = df['label'].to_numpy()
    return X, y

# Build the feature matrices and label arrays for the training and development sets
X_train, y_train = build_features(train_df)
X_dev, y_dev = build_features(dev_df)

# CNN model definition
class CNN(torch.nn.Module):
    """Parallel 1-D convolutions, global max-pooling and a linear layer.

    Args:
        input_dim: number of input channels of every convolution.
        num_filters: number of output channels of every convolution.
        filter_sizes: kernel sizes; one convolution is created per size.
        output_dim: size of the final linear layer's output.
    """

    def __init__(self, input_dim, num_filters, filter_sizes, output_dim):
        super().__init__()
        self.convs = torch.nn.ModuleList([
            torch.nn.Conv1d(in_channels=input_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = torch.nn.Linear(num_filters * len(filter_sizes), output_dim)

    def forward(self, x):
        """Map ``x`` to a ``(batch, output_dim)`` tensor.

        ``x`` is either ``(batch, input_dim, length)`` or a flat
        ``(batch, input_dim)`` matrix, which is treated as a sequence of
        length 1. Sequences shorter than the largest kernel are right-padded
        with zeros so that every convolution has at least one valid window;
        longer inputs are processed unchanged.
        """
        if x.dim() == 2:
            # Flat feature vectors: add the length axis Conv1d requires.
            x = x.unsqueeze(2)
        widest_kernel = max(conv.kernel_size[0] for conv in self.convs)
        if x.shape[2] < widest_kernel:
            x = torch.nn.functional.pad(x, (0, widest_kernel - x.shape[2]))
        conved = [torch.nn.functional.relu(conv(x)) for conv in self.convs]
        # Global max-pool over the length axis, then drop that axis.
        pooled = [torch.nn.functional.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]
        cat = torch.cat(pooled, dim=1)
        return self.fc(cat)

# Hyperparameters
input_dim = X_train.shape[1]
num_filters = 100
filter_sizes = [3, 4, 5]
# Size of the CNN's final linear output, which is later used as classifier features.
# NOTE(review): the same output is scored by CrossEntropyLoss against the labels —
# confirm that 100 (rather than the number of classes) is intended.
output_dim = 100

# Plain torch.nn.Module objects have no `.device` attribute (unlike the
# Hugging Face model), so pick the device once and reuse it explicitly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialise the CNN model
cnn_model = CNN(input_dim, num_filters, filter_sizes, output_dim).to(device)

# Convert the feature matrices to float32 tensors on the model's device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_dev_tensor = torch.tensor(X_dev, dtype=torch.float32).to(device)

# Optimiser and loss used to train the CNN
optimizer = torch.optim.Adam(cnn_model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

def train_cnn(model, optimizer, loss_fn, X_train, y_train, epochs=10):
    """Train ``model`` with one full-batch gradient step per epoch.

    Args:
        model: module mapping ``X_train`` to per-class scores.
        optimizer: optimiser already bound to ``model``'s parameters.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        X_train: input tensor, passed to ``model`` in a single batch.
        y_train: integer class labels (array-like), one per row of ``X_train``.
        epochs: number of optimisation steps.

    Prints the loss after every epoch; returns nothing.
    """
    model.train()
    # Build the label tensor once, on the same device as the inputs. A plain
    # torch.nn.Module has no `.device` attribute, so the inputs are the
    # reliable source for the device.
    targets = torch.as_tensor(y_train, dtype=torch.long, device=X_train.device)
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# Train the CNN on the full training tensor using the default number of epochs.
train_cnn(cnn_model, optimizer, loss_fn, X_train_tensor, y_train)

# Run the trained CNN to obtain feature vectors
def extract_cnn_features(model, X):
    """Return ``model(X)`` as a NumPy array.

    The model is switched to evaluation mode and run without gradient
    tracking; the result is copied to the CPU before conversion.
    """
    model.eval()
    with torch.no_grad():
        return model(X).cpu().numpy()

# Run the trained CNN over both splits to obtain the classifier's input features.
X_train_cnn = extract_cnn_features(cnn_model, X_train_tensor)
X_dev_cnn = extract_cnn_features(cnn_model, X_dev_tensor)

# Train the random-forest classifier on the CNN features.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_cnn, y_train)

# Accuracy on the training set.
y_train_pred = clf.predict(X_train_cnn)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.4f}')

# Show only a short preview of true vs. predicted labels: printing one line
# per training example floods the cell output until the notebook truncates it.
preview_rows = 20
for i in range(min(preview_rows, len(y_train))):
    print(f'Index: {i}, True Label: {y_train[i]}, Predicted Label: {y_train_pred[i]}')

# Evaluate on the development set. zero_division=0 reports 0.00 for a class
# that is never predicted without emitting undefined-metric warnings.
y_dev_pred = clf.predict(X_dev_cnn)
print(classification_report(y_dev, y_dev_pred, zero_division=0))
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f'Development Accuracy: {dev_accuracy:.4f}')

   id                                          sentence1  \
0   1  In Paris , in October 1560 , he secretly met t...   
1   2  The NBA season of 1975 -- 76 was the 30th seas...   
2   3  There are also specific discussions , public p...   
3   4  When comparable rates of flow can be maintaine...   
4   5  It is the seat of Zerendi District in Akmola R...   

                                           sentence2  label  
0  In October 1560 , he secretly met with the Eng...      0  
1  The 1975 -- 76 season of the National Basketba...      1  
2  There are also public discussions , profile sp...      0  
3  The results are high when comparable flow rate...      1  
4  It is the seat of the district of Zerendi in A...      1  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Index: 44401, True Label: 0, Predicted Label: 0
Index: 44402, True Label: 1, Predicted Label: 0
Index: 44403, True Label: 0, Predicted Label: 0
Index: 44404, True Label: 0, Predicted Label: 0
Index: 44405, True Label: 0, Predicted Label: 0
Index: 44406, True Label: 1, Predicted Label: 0
Index: 44407, True Label: 1, Predicted Label: 0
Index: 44408, True Label: 0, Predicted Label: 0
Index: 44409, True Label: 0, Predicted Label: 0
Index: 44410, True Label: 1, Predicted Label: 0
Index: 44411, True Label: 1, Predicted Label: 0
Index: 44412, True Label: 1, Predicted Label: 0
Index: 44413, True Label: 1, Predicted Label: 0
Index: 44414, True Label: 1, Predicted Label: 0
Index: 44415, True Label: 1, Predicted Label: 0
Index: 44416, True Label: 0, Predicted Label: 0
Index: 44417, True Label: 0, Predicted Label: 0
Index: 44418, True Label: 0, Predicted Label: 0
Index: 44419, True Label: 0, Predicted Label: 0
Index: 44420, True Label: 1, Predicted Label: 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72      4461
           1       0.00      0.00      0.00      3539

    accuracy                           0.56      8000
   macro avg       0.28      0.50      0.36      8000
weighted avg       0.31      0.56      0.40      8000

Development Accuracy: 0.5576
