<a href="https://colab.research.google.com/github/KFCFKXQS/math/blob/main/weak_supervision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# 下载NLTK的相关资源
nltk.download('punkt')  # 分词所需数据
nltk.download('averaged_perceptron_tagger')  # 词性标注所需数据

# 选择设备，如果有CUDA则使用CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
## 1. glove_50d
glove_50d={}
with open("/content/drive/MyDrive/Colab Notebooks/LSTM/glove.6B.50d.txt") as f:
    next(f)
    for line in f:
        line=line.split()
        glove_50d[line[0]]=[float(xi) for xi in line[1:]]


In [3]:

## 2.1 dataset_agnews
# a、读取文档
def clean_text(sentence):
    cleaned_sentence = re.sub(r'[^\w\s]', '', sentence)  # 去除标点符号和特殊字符
    cleaned_sentence = cleaned_sentence.lower()  # 转换为小写
    tokens = nltk.word_tokenize(cleaned_sentence)  # 分词
    tagged_tokens = nltk.pos_tag(tokens)  # 词性标注
    # 保留动词和名词
    cleaned_tokens = [token for token, pos in tagged_tokens if pos.startswith('NN')]
    cleaned_sentence = ' '.join(cleaned_tokens)  # 拼接为字符串
    return cleaned_sentence

dataset_agnews_texts=[]
dataset_agnews_reallabels=[]

with open('/content/drive/MyDrive/Colab Notebooks/dataset/agnews/dataset.csv') as f:
    for line in f:
        dataset_agnews_reallabels.append(int(line[0]))
        dataset_agnews_texts.append(clean_text(line[1:]).strip().split())


In [4]:

# b、读取已知类别和关键词
dataset_agnews_classes={}
dataset_agnews_keywords={}
with open('/content/drive/MyDrive/Colab Notebooks/dataset/agnews/classes.txt') as f:
    for line in f:
        line=line.strip().split(':')
        dataset_agnews_classes[int(line[0])]=line[1]

with open('/content/drive/MyDrive/Colab Notebooks/dataset/agnews/keywords.txt') as f:
    for line in f:
        line = re.split(r"[:,\s]+", line.strip())
        dataset_agnews_keywords[int(line[0])]=set(line[1:])

# c、构建词典和Word2Vec映射表.并将原文档中不在glove里的未知词改为<unk>
dataset_agnews_words=set()
dataset_agnews_word2vec={}
dataset_agnews_vec2word={}
dataset_agnews_processed_texts=[]

unk_vector = [0] * 50
for sentence in dataset_agnews_texts:
    processed_sentence=[]
    for word in sentence:
        if word in glove_50d.keys():
            dataset_agnews_words.add(word)
            vec = glove_50d[word]
            dataset_agnews_word2vec[word] = vec
            dataset_agnews_vec2word[tuple(vec)] = word
            processed_sentence.append(word)

        else:
            # 把不在glove50d的词 改成<unk>
            dataset_agnews_words.add("<unk>")
            dataset_agnews_word2vec["<unk>"] = unk_vector
            dataset_agnews_vec2word[tuple(unk_vector)] = "<unk>"
            processed_sentence.append("<unk>")
    dataset_agnews_processed_texts.append(processed_sentence)

cosine_similarity_matrix=cosine_similarity([vec for vec in dataset_agnews_word2vec.values()])
keys_list = list(dataset_agnews_word2vec.keys())

In [22]:
print(list(dataset_agnews_word2vec.items())[0])

[('wall', [0.26382, 0.32453, 0.74185, -0.37095, 0.65957, -0.49222, -0.55538, -0.23779, -0.44918, -0.12702, -0.86794, -0.4006, -0.80488, 0.48755, -0.18839, 0.53307, -0.23213, -1.2418, -0.34996, -0.80586, 0.65294, -0.49259, -0.8745, -0.81071, -0.087246, -1.2377, -0.65882, 1.1209, 0.13363, -0.23701, 3.0263, -0.71435, 1.4986, -0.033124, -1.0149, -0.15854, -0.040294, -0.17169, 0.58463, -0.63653, -0.062352, -0.078485, -0.16274, 0.5391, 0.78765, -0.095975, 0.30811, -0.77773, 0.16744, -0.81749]), ('st', [0.64859, 2.4722, -0.64446, -1.114, 0.23142, 0.019663, -0.91858, 0.19075, -0.19415, -0.49484, 0.23414, 0.73106, -0.61235, -1.2222, -0.93782, 0.1332, -0.35044, -0.96254, -1.2712, 0.44081, -0.11185, 0.1422, -0.80163, 0.46084, -0.43391, -0.28229, 0.030046, -0.53431, -1.0732, 0.40196, 1.6818, 0.47278, 1.0622, -0.38899, 0.59502, -0.37821, 1.1789, 0.071788, 0.82684, 0.22042, 0.75696, -0.39883, -0.29256, -0.065231, -0.23903, 1.7483, -0.74774, -1.522, 0.59868, -0.56331]), ('bears', [-0.34941, 0.87778, 

In [5]:
# 扩充关键词
for label in dataset_agnews_keywords.keys():
    expanded_keywords = set(dataset_agnews_keywords[label])
    for word in dataset_agnews_keywords[label]:
        word_index = keys_list.index(word)
        similar_indices = np.where(cosine_similarity_matrix[word_index] > 0.8)[0]
        for similar_index in similar_indices:
            similar_word = keys_list[similar_index]
            expanded_keywords.add(similar_word)
    dataset_agnews_keywords[label] = expanded_keywords


In [6]:

# 计算每个句子的类别频率
sentence_class_freq = []
for sentence in dataset_agnews_processed_texts:
    freq = [0]*len(dataset_agnews_classes)
    for word in sentence:
        for key, keywords in dataset_agnews_keywords.items():
            if word in keywords:
                freq[key] += 1
    sentence_class_freq.append(freq)

# 根据频率赋予句子类别标签
sentence_class_pseudolabels = []
for freq in sentence_class_freq:
    max_index = np.argmax(freq)
    max_value = freq[max_index]
    sum_other_frequencies = sum(f for i, f in enumerate(freq) if i != max_index)
    if max_value >2 * sum_other_frequencies:
        sentence_class_pseudolabels.append(max_index)
    else:
        sentence_class_pseudolabels.append(-1)


In [8]:

# 计算准确率
correct_classified=0
total_classified=0

for assigned_label, real_label in zip(sentence_class_pseudolabels, dataset_agnews_reallabels):
    if assigned_label != -1:
        total_classified += 1
        if assigned_label == real_label:
            correct_classified += 1

print("Total classified: ", total_classified)
print("Correct classified: ", correct_classified)
print("Accuracy: ", correct_classified / total_classified)


Total classified:  14794
Correct classified:  11535
Accuracy:  0.7797079897255644


In [9]:
# 把每句话的词转化为数字编码
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return idxs

inputs = [prepare_sequence(sent,dataset_agnews_word2vec) for sent in dataset_agnews_processed_texts]


In [10]:
# 前面标记-1的表示未打标签，去掉它们
origin_training_texts=[]
origin_training_labels=[]
for i in range(len(inputs)):
  if sentence_class_pseudolabels[i]!=-1:
    origin_training_texts.append(list(inputs[i]))
    origin_training_labels.append(list(sentence_class_pseudolabels)[i])

In [11]:
input_size = 50
# 隐藏层的维度为64
hidden_size = 64
# 输出的维度4
output_size = len(list(dataset_agnews_classes.keys()))


In [13]:
# 定义模型
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, lengths):

        packed_x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
        packed_output, _ = self.lstm(packed_x)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        output = self.hidden2out(output[:, -1, :])
        output = self.softmax(output)
        return output


# 构建模型
model = LSTMClassifier(input_size, hidden_size, output_size).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)


In [14]:
from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    # 对输入序列进行填充
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    sequences = [torch.tensor(x[0]) for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True)
    lengths = torch.LongTensor([len(x[0]) for x in sorted_batch])
    labels = torch.LongTensor([x[1] for x in sorted_batch])
    return sequences_padded, labels, lengths


# 原始数据和标签
origin_data = list(zip(origin_training_texts, origin_training_labels))

# 划分训练集和测试集
train_data, test_data = train_test_split(origin_data, test_size=0.2, random_state=42)

# 创建DataLoader
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False, collate_fn=collate_fn)



In [17]:
def train(model, train_loader, test_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        for batch_inputs, batch_labels, batch_lengths in train_loader:
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            # Keep lengths on CPU
            optimizer.zero_grad()
            output = model(batch_inputs, batch_lengths)
            loss = criterion(output, batch_labels)
            loss.backward()
            optimizer.step()

        test_loss, test_acc = evaluate(model, test_loader, criterion)
        print(f'Epoch {epoch + 1}/{epochs} Loss: {loss.item()} Test Loss: {test_loss} Test Acc: {test_acc}')


def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for batch_inputs, batch_labels, batch_lengths in test_loader:
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            # Keep lengths on CPU
            output = model(batch_inputs, batch_lengths)
            loss = criterion(output, batch_labels)
            total_loss += loss.item() * batch_inputs.size(0)
            predicted = torch.argmax(output, axis=1)
            correct = (predicted == batch_labels).sum().item()
            total_correct += correct
            total_count += batch_inputs.size(0)

    # 计算平均损失和精度
    avg_loss = total_loss / total_count
    accuracy = total_correct / total_count

    return avg_loss, accuracy



# 开始训练
train(model, train_loader, test_loader, criterion, optimizer, epochs=100)


Epoch 1/100 Loss: 1.3255268335342407 Test Loss: 1.3673081962028522 Test Acc: 0.32105441027374115
Epoch 2/100 Loss: 1.422743558883667 Test Loss: 1.3658901293078722 Test Acc: 0.32679959445758705
Epoch 3/100 Loss: 1.3469586372375488 Test Loss: 1.3653037614941637 Test Acc: 0.32848935451165934
Epoch 4/100 Loss: 1.402023196220398 Test Loss: 1.3649049394233035 Test Acc: 0.3281514025008449
Epoch 5/100 Loss: 1.3424988985061646 Test Loss: 1.3642396450687317 Test Acc: 0.32848935451165934
Epoch 6/100 Loss: 1.36738920211792 Test Loss: 1.3636983347003546 Test Acc: 0.32848935451165934
Epoch 7/100 Loss: 1.3678536415100098 Test Loss: 1.3633923924988531 Test Acc: 0.32848935451165934
Epoch 8/100 Loss: 1.328890085220337 Test Loss: 1.3629593900346644 Test Acc: 0.32848935451165934
Epoch 9/100 Loss: 1.3498376607894897 Test Loss: 1.3627372884637563 Test Acc: 0.32848935451165934
Epoch 10/100 Loss: 1.3527532815933228 Test Loss: 1.3623677165413999 Test Acc: 0.32848935451165934
Epoch 11/100 Loss: 1.40343415737152

KeyboardInterrupt: ignored

In [None]:
# 自学习阶段
high_confidence_threshold = 0.9

while True:
    high_confidence_samples = []
    for i, text in enumerate(dataset_agnews_processed_texts):
        if sentence_class_pseudolabels[i] == -1:  # 未标签的数据
            with torch.no_grad():
                text_tensor = torch.tensor([prepare_sequence(text, dataset_agnews_word2vec)]).float().to(device)
                prediction = model(text_tensor, [len(text)])
                probabilities, predicted = torch.max(torch.exp(prediction), axis=1)
                if probabilities.item() > high_confidence_threshold:
                    high_confidence_samples.append((text, predicted.item()))

    if len(high_confidence_samples) == 0:
        break

    # 伪标签数据
    X_pseudo_labeled, y_pseudo_labeled = zip(*high_confidence_samples)
    X_pseudo_labeled = [prepare_sequence(text, dataset_agnews_word2vec) for text in X_pseudo_labeled]
    y_pseudo_labeled = list(y_pseudo_labeled)

    # 从未标签的数据池中移除伪标签的数据
    dataset_agnews_processed_texts = [text for i, text in enumerate(dataset_agnews_processed_texts) if sentence_class_pseudolabels[i] == -1]
    sentence_class_pseudolabels = [label for label in sentence_class_pseudolabels if label == -1]

    # 将伪标签数据添加到训练集
    train_data.extend(list(zip(X_pseudo_labeled, y_pseudo_labeled)))

    # 使用增强的数据集重新创建DataLoader
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)

    # 使用伪标签的数据重新训练模型
    train(model, train_loader, test_loader, criterion, optimizer, epochs=5)
