<a href="https://colab.research.google.com/github/KFCFKXQS/math/blob/main/weak_supervision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import numpy as np
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# 下载NLTK的相关资源
nltk.download('punkt')  # 分词所需数据
nltk.download('averaged_perceptron_tagger')  # 词性标注所需数据

# 选择设备，如果有CUDA则使用CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
## 1. glove_50d
glove_50d={}
with open("/content/drive/MyDrive/Colab Notebooks/LSTM/glove.6B.50d.txt") as f:
    next(f)
    for line in f:
        line=line.split()
        glove_50d[line[0]]=[float(xi) for xi in line[1:]]


In [4]:

## 2.1 dataset_agnews
# a、读取文档
def clean_text(sentence):
    cleaned_sentence = re.sub(r'[^\w\s]', '', sentence)  # 去除标点符号和特殊字符
    cleaned_sentence = cleaned_sentence.lower()  # 转换为小写
    tokens = nltk.word_tokenize(cleaned_sentence)  # 分词
    tagged_tokens = nltk.pos_tag(tokens)  # 词性标注
    # 保留动词和名词
    cleaned_tokens = [token for token, pos in tagged_tokens if pos.startswith('NN')]
    cleaned_sentence = ' '.join(cleaned_tokens)  # 拼接为字符串
    return cleaned_sentence

dataset_agnews_texts=[]
dataset_agnews_reallabels=[]

with open('/content/drive/MyDrive/Colab Notebooks/dataset/agnews/dataset.csv') as f:
    for line in f:
        dataset_agnews_reallabels.append(int(line[0]))
        dataset_agnews_texts.append(clean_text(line[1:]).strip().split())


In [5]:

# b、读取已知类别和关键词
dataset_agnews_classes={}
dataset_agnews_keywords={}
with open('/content/drive/MyDrive/Colab Notebooks/dataset/agnews/classes.txt') as f:
    for line in f:
        line=line.strip().split(':')
        dataset_agnews_classes[int(line[0])]=line[1]

with open('/content/drive/MyDrive/Colab Notebooks/dataset/agnews/keywords.txt') as f:
    for line in f:
        line = re.split(r"[:,\s]+", line.strip())
        dataset_agnews_keywords[int(line[0])]=set(line[1:])

# c、构建词典和Word2Vec映射表.并将原文档中不在glove里的未知词改为<unk>
dataset_agnews_words=set()
dataset_agnews_word2vec={}
dataset_agnews_vec2word={}
dataset_agnews_processed_texts=[]

unk_vector = [0] * 50
for sentence in dataset_agnews_texts:
    processed_sentence=[]
    for word in sentence:
        if word in glove_50d.keys():
            dataset_agnews_words.add(word)
            vec = glove_50d[word]
            dataset_agnews_word2vec[word] = vec
            dataset_agnews_vec2word[tuple(vec)] = word
            processed_sentence.append(word)

        else:
            # 把不在glove50d的词 改成<unk>
            dataset_agnews_words.add("<unk>")
            dataset_agnews_word2vec["<unk>"] = unk_vector
            dataset_agnews_vec2word[tuple(unk_vector)] = "<unk>"
            processed_sentence.append("<unk>")
    dataset_agnews_processed_texts.append(processed_sentence)

cosine_similarity_matrix=cosine_similarity([vec for vec in dataset_agnews_word2vec.values()])
keys_list = list(dataset_agnews_word2vec.keys())

In [7]:
print(list(dataset_agnews_word2vec.items())[0])
print(list(dataset_agnews_word2vec.items())[1])

('wall', [0.26382, 0.32453, 0.74185, -0.37095, 0.65957, -0.49222, -0.55538, -0.23779, -0.44918, -0.12702, -0.86794, -0.4006, -0.80488, 0.48755, -0.18839, 0.53307, -0.23213, -1.2418, -0.34996, -0.80586, 0.65294, -0.49259, -0.8745, -0.81071, -0.087246, -1.2377, -0.65882, 1.1209, 0.13363, -0.23701, 3.0263, -0.71435, 1.4986, -0.033124, -1.0149, -0.15854, -0.040294, -0.17169, 0.58463, -0.63653, -0.062352, -0.078485, -0.16274, 0.5391, 0.78765, -0.095975, 0.30811, -0.77773, 0.16744, -0.81749])
('st', [0.64859, 2.4722, -0.64446, -1.114, 0.23142, 0.019663, -0.91858, 0.19075, -0.19415, -0.49484, 0.23414, 0.73106, -0.61235, -1.2222, -0.93782, 0.1332, -0.35044, -0.96254, -1.2712, 0.44081, -0.11185, 0.1422, -0.80163, 0.46084, -0.43391, -0.28229, 0.030046, -0.53431, -1.0732, 0.40196, 1.6818, 0.47278, 1.0622, -0.38899, 0.59502, -0.37821, 1.1789, 0.071788, 0.82684, 0.22042, 0.75696, -0.39883, -0.29256, -0.065231, -0.23903, 1.7483, -0.74774, -1.522, 0.59868, -0.56331])


In [8]:
# 扩充关键词
for label in dataset_agnews_keywords.keys():
    expanded_keywords = set(dataset_agnews_keywords[label])
    for word in dataset_agnews_keywords[label]:
        word_index = keys_list.index(word)
        similar_indices = np.where(cosine_similarity_matrix[word_index] > 0.8)[0]
        for similar_index in similar_indices:
            similar_word = keys_list[similar_index]
            expanded_keywords.add(similar_word)
    dataset_agnews_keywords[label] = expanded_keywords


In [9]:
# 计算每个句子的类别频率
sentence_class_freq = []
for sentence in dataset_agnews_processed_texts:
    freq = [0]*len(dataset_agnews_classes)
    for word in sentence:
        for key, keywords in dataset_agnews_keywords.items():
            if word in keywords:
                freq[key] += 1
    sentence_class_freq.append(freq)

# 根据频率赋予句子类别标签
sentence_class_pseudolabels = []
for freq in sentence_class_freq:
    max_index = np.argmax(freq)
    max_value = freq[max_index]
    sum_other_frequencies = sum(f for i, f in enumerate(freq) if i != max_index)
    if max_value >2 * sum_other_frequencies:
        sentence_class_pseudolabels.append(max_index)
    else:
        sentence_class_pseudolabels.append(-1)


In [10]:
# 计算准确率（用于评估关键词选择算法）
correct_classified=0
total_classified=0

for assigned_label, real_label in zip(sentence_class_pseudolabels, dataset_agnews_reallabels):
    if assigned_label != -1:
        total_classified += 1
        if assigned_label == real_label:
            correct_classified += 1

print("Total classified: ", total_classified)
print("Correct classified: ", correct_classified)
print("Accuracy: ", correct_classified / total_classified)


Total classified:  14794
Correct classified:  11535
Accuracy:  0.7797079897255644


In [11]:
# 把每句话的词转化为vec
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return idxs

inputs = [prepare_sequence(sent,dataset_agnews_word2vec) for sent in dataset_agnews_processed_texts]


In [41]:
for i in range(len(inputs[1])):
  print(dataset_agnews_vec2word[tuple(inputs[1][i])])


aerospace
investment
firm
carlyle
group
reputation
plays
defense
industry
bets
part
market


In [28]:
assert (len(sentence_class_pseudolabels)==len(inputs)),'not compare'

In [29]:
# 前面标记-1的表示未打标签，去掉它们
origin_training_texts=[]
origin_training_labels=[]
for i in range(len(inputs)):
  if sentence_class_pseudolabels[i]!=-1:
    origin_training_texts.append(list(inputs[i]))
    origin_training_labels.append(list(sentence_class_pseudolabels)[i])

In [30]:
print(sentence_class_pseudolabels[:3])

[-1, 2, 2]


In [42]:
assert (origin_training_texts[0]==inputs[1]),'a'
print(len(origin_training_texts[0]))

12


In [32]:
input_size = 50
# 隐藏层维度64
hidden_size = 64
# 输出的维度4
output_size = len(list(dataset_agnews_classes.keys()))


In [63]:
# 定义模型
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, lengths):
        packed_x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
        packed_output, _ = self.lstm(packed_x)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        output = self.hidden2out(output[:, -1, :])
        output = self.softmax(output)
        return output


# 构建模型
model = LSTMClassifier(input_size, hidden_size, output_size).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)


In [64]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # 对输入序列进行填充
    sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    sequences = [torch.tensor(x[0]) for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True)
    lengths = torch.LongTensor([len(x[0]) for x in sorted_batch])
    labels = torch.LongTensor([x[1] for x in sorted_batch])
    return sequences_padded, labels, lengths


# 原始数据和标签
origin_data = list(zip(origin_training_texts, origin_training_labels))

# 划分训练集和测试集
train_data, test_data = train_test_split(origin_data, test_size=0.2, random_state=42)


In [45]:
for i in range(len(train_data[0][0])):
  print(dataset_agnews_vec2word[tuple(train_data[0][0][i])])

print(len(train_data),len(test_data))

rookie
kazmir
outduels
confidence
environment
baseball
pitcher
sense
wonder
11835 2959


In [65]:
# 创建DataLoader
train_loader = DataLoader(train_data, batch_size=1, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [57]:
i=0
for batch in train_loader:
    if i>=1:
      break
    i+=1
    sequences_padded, labels, lengths = batch
    # 打印每个批次的数据
    # 精度变化导致不能直接作为key查找vec2word，搜最接近的词向量
    for i in range(len(sequences_padded[0])):
        min_distance = float('inf')  # 初始化最小距离为正无穷
        closest_word = None

        for word_vector in dataset_agnews_vec2word.keys():
            distance = np.linalg.norm(np.array(sequences_padded[0][i]) - np.array(word_vector))  # 计算欧几里德距离
            if distance < min_distance:
                min_distance = distance
                closest_word = tuple(word_vector)

        print("Closest Word:", dataset_agnews_vec2word[closest_word])
    #print("Sequences Padded:", sequences_padded[0])
    print("Labels:", labels)
    print("Lengths:", lengths)


Closest Word: cell
Closest Word: phone
Closest Word: maker
Closest Word: year
Closest Word: s
Closest Word: reports
Closest Word: batteries
Closest Word: phones
Closest Word: concern
Closest Word: radiation
Closest Word: devices
Closest Word: siemens
Closest Word: software
Closest Word: defect
Closest Word: range
Closest Word: phones
Labels: tensor([3])
Lengths: tensor([16])


In [67]:
def train(model, train_loader, test_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        for batch_inputs, batch_labels, batch_lengths in train_loader:
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            # Keep lengths on CPU
            optimizer.zero_grad()
            output = model(batch_inputs, batch_lengths)
            loss = criterion(output, batch_labels)
            loss.backward()
            optimizer.step()

        test_loss, test_acc = evaluate(model, test_loader, criterion)
        print(f'After Epoch {epoch + 1}/{epochs} Loss: {loss.item()} Test Loss: {test_loss} Test Acc: {test_acc}')


def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for batch_inputs, batch_labels, batch_lengths in test_loader:
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            # Keep lengths on CPU
            output = model(batch_inputs, batch_lengths)
            loss = criterion(output, batch_labels)
            total_loss += loss.item() * batch_inputs.size(0)
            predicted = torch.argmax(output, axis=1)
            correct = (predicted == batch_labels).sum().item()
            total_correct += correct
            total_count += batch_inputs.size(0)

    # 计算平均损失和精度
    avg_loss = total_loss / total_count
    accuracy = total_correct / total_count

    return avg_loss, accuracy



# 开始训练
train(model, train_loader, test_loader, criterion, optimizer, epochs=5)


Epoch 1/5 Loss: 2.264974000354414e-06 Test Loss: 0.015330512642520106 Test Acc: 0.994930719837783


KeyboardInterrupt: ignored

In [70]:
# 自学习阶段
high_confidence_threshold = 0.9

while True:
    high_confidence_samples = []
    high_confidence_indices = [] # 保存高置信度样本id

    for i, text in enumerate(dataset_agnews_processed_texts):
        if sentence_class_pseudolabels[i] == -1:  # 未标签的
            with torch.no_grad():
                text_tensor = torch.tensor([prepare_sequence(text, dataset_agnews_word2vec)]).float().to(device)
                prediction = model(text_tensor, [len(text)])
                probabilities, predicted = torch.max(torch.exp(prediction), axis=1)
                if probabilities.item() > high_confidence_threshold:
                    high_confidence_samples.append((text, predicted.item()))
                    high_confidence_indices.append(i) # 保存索引

    if len(high_confidence_samples) == 0:
        break
    print('rest', len(high_confidence_samples))

    # 更新伪标签
    for index in high_confidence_indices:
        sentence_class_pseudolabels[index] = 1 # 或者任何非-1的值，以表示这个样本已经被标签化了

    # 伪标签数据
    X_pseudo_labeled, y_pseudo_labeled = zip(*high_confidence_samples)
    X_pseudo_labeled = [prepare_sequence(text, dataset_agnews_word2vec) for text in X_pseudo_labeled]
    y_pseudo_labeled = list(y_pseudo_labeled)

    # 从未标签的数据池中移除伪标签的数据
    dataset_agnews_processed_texts = [text for i, text in enumerate(dataset_agnews_processed_texts) if sentence_class_pseudolabels[i] == -1]
    sentence_class_pseudolabels = [label for label in sentence_class_pseudolabels if label == -1]

    # 将伪标签数据添加到训练集
    train_data.extend(list(zip(X_pseudo_labeled, y_pseudo_labeled)))
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)

    # 重新训练模型
    train(model, train_loader, test_loader, criterion, optimizer, epochs=5)


rest 17397
Epoch 1/5 Loss: 1.2688857316970825 Test Loss: 0.013609577720462116 Test Acc: 0.9942548158161542
Epoch 2/5 Loss: 1.341914415359497 Test Loss: 0.013663483065301235 Test Acc: 0.9942548158161542
Epoch 3/5 Loss: 1.2029669284820557 Test Loss: 0.014560110023546423 Test Acc: 0.9942548158161542
Epoch 4/5 Loss: 1.2604386806488037 Test Loss: 0.014588519719128994 Test Acc: 0.9942548158161542
Epoch 5/5 Loss: 1.2512760162353516 Test Loss: 0.01467829977687649 Test Acc: 0.9942548158161542
rest 424
Epoch 1/5 Loss: 1.1459002494812012 Test Loss: 0.014878939543280375 Test Acc: 0.9932409597837107
Epoch 2/5 Loss: 1.1532992124557495 Test Loss: 0.015181252793088584 Test Acc: 0.9932409597837107
Epoch 3/5 Loss: 1.1518064737319946 Test Loss: 0.015317580952003298 Test Acc: 0.9939168638053396
Epoch 4/5 Loss: 1.0419443845748901 Test Loss: 0.015190526503364525 Test Acc: 0.9939168638053396
Epoch 5/5 Loss: 1.0992405414581299 Test Loss: 0.015728231827660248 Test Acc: 0.9925650557620818
rest 334
Epoch 1/5 Los

KeyboardInterrupt: ignored

In [103]:
from sklearn.metrics import accuracy_score, f1_score

predicted_labels = []
for sentence in inputs:
  sentence = torch.tensor(sentence).to(device)
  a = model(sentence.unsqueeze(0), lengths=[len(sentence)])
  _, predicted = torch.max(a, 1)  # 获取预测的类别
  predicted_labels.extend(predicted.cpu().numpy())  # 将预测的标签加到列表

# 计算acc和F1分
acc = accuracy_score(dataset_agnews_reallabels, predicted_labels)
f1 = f1_score(dataset_agnews_reallabels, predicted_labels, average='macro')  # 或者 'micro'

print('Accuracy:', acc)
print('F1 Score:', f1)


Accuracy: 0.73135
F1 Score: 0.7280739136087936
