In [None]:
import os
import tarfile

import pandas as pd

# Location of the downloaded archive and the directory it is unpacked into.
file_path = '/content/paws_wiki_labeled_final.tar.gz'
extract_dir = './data'

# Unpack the gzip-compressed tarball into extract_dir.
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path=extract_dir)
print(f"文件解压到：{extract_dir}")

# Load the dev split from the extracted archive.
# The path is built from components relative to extract_dir: passing an
# absolute path as a later argument makes os.path.join discard extract_dir.
# (The original line was also missing its closing parenthesis.)
data_file = os.path.join(extract_dir, 'final', 'dev.tsv')
df = pd.read_csv(data_file, sep='\t')

# Show the first rows to check the column layout.
print(df.head())

In [None]:
pip install nltk
import nltk
nltk.download('stopwords')

In [None]:
# Text preprocessing.
# Once the data is loaded, the text can be cleaned, tokenized, stripped of
# stop words, vectorized, and so on. The helpers below cover cleaning and
# tokenization for English and Chinese text.
import nltk
import re

# Fetch the 'punkt_tab' resource (presumably the data nltk.word_tokenize
# needs in the tokenize helper below -- confirm against the installed NLTK).
nltk.download('punkt_tab')
# The preprocessing below is applied to English text.
def clean_text(text):
    """Return ``text`` with HTML-like tags and punctuation removed.

    Anything of the form ``<...>`` (shortest match, single line) is dropped
    first; afterwards every character that is not an ASCII letter, a digit,
    or whitespace is dropped as well.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    return re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)

def tokenize(text, language='english'):
    """Split ``text`` into tokens.

    Args:
        text: The string to tokenize.
        language: ``'chinese'`` segments the text with ``jieba.cut`` and
            returns the pieces as a list; any other value delegates to
            ``nltk.word_tokenize``.

    Returns:
        The tokens produced by the selected tokenizer.
    """
    if language == 'chinese':
        # Imported lazily: jieba is not imported anywhere before this cell,
        # so referencing the bare name raised NameError on a fresh
        # top-to-bottom run.
        import jieba
        return list(jieba.cut(text))
    return nltk.word_tokenize(text)

# Clean both sentence columns first, then tokenize the cleaned text as English.
for source_col, clean_col in (('sentence1', 'sentence1_clean'),
                              ('sentence2', 'sentence2_clean')):
    df[clean_col] = df[source_col].apply(clean_text)

for clean_col, token_col in (('sentence1_clean', 'tokens1'),
                             ('sentence2_clean', 'tokens2')):
    df[token_col] = df[clean_col].apply(lambda sentence: tokenize(sentence, language='english'))

# Preview the raw sentences next to their token lists.
preview_columns = ['sentence1', 'sentence2', 'tokens1', 'tokens2']
print(df[preview_columns].head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per example: the two cleaned sentences joined by a single space.
pair_text = df['sentence1_clean'] + " " + df['sentence2_clean']

# TF-IDF features, capped at the 10000 highest-ranked terms.
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(pair_text)

# Shape of the resulting feature matrix.
print(X.shape)

In [None]:
# classification_report is imported here as well: it is used at the end of
# this cell but was never imported, which raised NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Linear-kernel SVM trained on the TF-IDF features.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = svm_model.predict(X_test)

# Report accuracy and the per-class metrics.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print('Classification Report:')
print(classification_report(y_test, y_pred))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

def create_cnn_model(vocab_size, embedding_dim, input_length):
    """Build (without compiling) a small 1-D CNN binary classifier.

    The network is an Embedding layer over integer token ids, one Conv1D
    layer (128 filters, kernel size 5, ReLU), global max pooling, and a
    single sigmoid output unit.

    Args:
        vocab_size: ``input_dim`` of the Embedding layer.
        embedding_dim: ``output_dim`` of the Embedding layer.
        input_length: ``input_length`` passed to the Embedding layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    return Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(1, activation='sigmoid'),
    ])

# Placeholder hyperparameters, assuming the sentence pairs have already been
# converted to integer sequences: vocabulary size, embedding width, and the
# maximum number of tokens per input.
vocab_size, embedding_dim, input_length = 5000, 100, 100

# Build and compile the CNN.
cnn_model = create_cnn_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    input_length=input_length,
)
cnn_model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

# Training is left disabled: X_train_seq / X_test_seq (the integer-encoded
# train and test sets) are not created in this cell.
# cnn_model.fit(X_train_seq, y_train, epochs=5, batch_size=32)
# y_pred_cnn = cnn_model.predict(X_test_seq)

In [None]:
pip install transformers

In [None]:
import os
import re
import jieba
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Load the train and test splits from the extracted archive.
# The paths are built from components relative to extract_dir: an absolute
# path passed as a later argument makes os.path.join discard extract_dir, so
# the files were only found when the archive happened to live under /content.
train_file = os.path.join(extract_dir, 'final', 'train.tsv')
test_file = os.path.join(extract_dir, 'final', 'test.tsv')

df_train = pd.read_csv(train_file, sep='\t')
df_test = pd.read_csv(test_file, sep='\t')

# Merge both splits into one frame; it is re-split at random further down.
# NOTE(review): because test.tsv is part of this pool, any later evaluation on
# test.tsv scores rows that may have been used for training -- confirm this is
# intended before reporting test-set numbers.
df = pd.concat([df_train, df_test], ignore_index=True)

# 数据预处理
def clean_text(text):
    """Strip ``<...>`` tags, then every character that is not an ASCII
    letter, digit, or whitespace, and return the result."""
    for pattern in (r'<.*?>', r'[^a-zA-Z0-9\s]'):
        text = re.sub(pattern, '', text)
    return text

def tokenize(text, language='english'):
    """Tokenize ``text``: ``jieba.cut`` (as a list) when ``language`` is
    ``'chinese'``, ``nltk.word_tokenize`` for every other value."""
    if language != 'chinese':
        return nltk.word_tokenize(text)
    return list(jieba.cut(text))

# Clean both sentence columns.
df['sentence1_clean'] = df['sentence1'].apply(clean_text)
df['sentence2_clean'] = df['sentence2'].apply(clean_text)

# Each model input is one sentence pair.
# NOTE(review): the pair is joined without a separator, so the last word of
# sentence1 fuses with the first word of sentence2 whenever sentence1_clean
# does not end in whitespace. It is left unchanged here because
# load_and_process_data (later cell) joins pairs the same way and the two must
# stay in sync; consider switching both to `s1 + " " + s2`.
pair_texts = df['sentence1_clean'] + df['sentence2_clean']

# Fit ONE tokenizer on exactly the texts that get encoded, and derive the
# vocabulary size and maximum length from it. Previously these were measured
# on the individual sentences with a different tokenizer (no OOV token), so
# concatenated pairs longer than the longest single sentence were truncated by
# pad_sequences, and the word indices were shifted relative to vocab_size.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(pair_texts)
sequences = tokenizer.texts_to_sequences(pair_texts)

# CNN hyperparameters.
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
embedding_dim = 200
max_length = max(len(seq) for seq in sequences)

# Pad every pair at the end up to max_length.
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Random train/test split of the pooled data.
X_train_cnn, X_test_cnn, y_train, y_test = train_test_split(X, df['label'], test_size=0.139, random_state=42)

# CNN: embedding -> two stacked convolutions -> global max pooling ->
# 64-unit dense layer -> dropout -> single sigmoid output.
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
cnn_model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(tf.keras.layers.Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the CNN end-to-end so its hidden layers learn useful features.
cnn_model.fit(X_train_cnn, y_train, epochs=50, batch_size=32, validation_split=0.01, verbose=1)

# Turn the trained classifier into a feature extractor.
# `cnn_model.layers` returns a fresh list, so the original
# `cnn_model.layers.pop()` left the model untouched and `predict` kept
# returning the 1-d sigmoid probability. `Sequential.pop()` actually removes
# the output layer; the model now ends at the Dropout layer (inactive at
# inference time), so `predict` yields the 64-d Dense activations.
# From here on `cnn_model` is the feature extractor.
cnn_model.pop()
cnn_features_train = cnn_model.predict(X_train_cnn)
cnn_features_test = cnn_model.predict(X_test_cnn)

# Linear SVM on top of the CNN features.
svm_model = SVC(kernel='linear')
svm_model.fit(cnn_features_train, y_train)
y_pred = svm_model.predict(cnn_features_test)

# Evaluate on the held-out part of the random split.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Save actual vs. predicted labels (written once; the original saved the same
# frame twice).
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results_df.to_csv('classification_results.csv', index=False)


In [None]:
# 加载和处理 dev.tsv 和 test.tsv 文件
def load_and_process_data(file_path):
    """Read a tab-separated split and encode it for the CNN.

    Relies on the module-level ``clean_text``, ``tokenizer`` and
    ``max_length`` defined in earlier cells. Both sentence columns are
    cleaned, concatenated per row (no separator, the same way the tokenizer's
    training texts were built), converted to integer sequences and
    post-padded to ``max_length``.

    Args:
        file_path: Path of the TSV file; it must provide ``sentence1``,
            ``sentence2`` and ``label`` columns.

    Returns:
        A ``(X, labels)`` tuple: the padded sequence array and the ``label``
        column.
    """
    frame = pd.read_csv(file_path, sep='\t')
    for source_col, clean_col in (('sentence1', 'sentence1_clean'),
                                  ('sentence2', 'sentence2_clean')):
        frame[clean_col] = frame[source_col].apply(clean_text)
    pair_text = frame['sentence1_clean'] + frame['sentence2_clean']
    encoded = tokenizer.texts_to_sequences(pair_text)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, frame['label']

# Paths are built from components relative to extract_dir: an absolute path
# passed as a later argument makes os.path.join discard extract_dir.
dev_file = os.path.join(extract_dir, 'final', 'dev.tsv')
test_file = os.path.join(extract_dir, 'final', 'test.tsv')

X_dev, y_dev = load_and_process_data(dev_file)
X_test, y_test = load_and_process_data(test_file)

# svm_model was fitted on the output of cnn_model.predict, not on padded token
# ids, so new data has to go through the same CNN transform before it reaches
# the SVM. Feeding X_dev / X_test directly gives the SVM inputs of the wrong
# shape and meaning.
y_pred_dev = svm_model.predict(cnn_model.predict(X_dev))
y_pred_test = svm_model.predict(cnn_model.predict(X_test))

# Save actual vs. predicted labels for both splits.
dev_results_df = pd.DataFrame({'Actual': y_dev, 'Predicted': y_pred_dev})
test_results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})
dev_results_df.to_csv('dev_classification_results.csv', index=False)
test_results_df.to_csv('test_classification_results.csv', index=False)

# Report performance on the dev split.
print("Dev Set Performance:")
print(f"Accuracy: {accuracy_score(y_dev, y_pred_dev):.4f}")
print('Classification Report:')
print(classification_report(y_dev, y_pred_dev))

# Report performance on the test split.
# NOTE(review): test.tsv was concatenated into the training pool in the
# previous cell, so this score is not a clean held-out estimate -- confirm.
print("Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print('Classification Report:')
print(classification_report(y_test, y_pred_test))