In [53]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import xml.etree.ElementTree as ET
import nltk

In [193]:
def read_train_data(file_path, num=None):
    """Load training texts and their party labels from an XML corpus file.

    Every ``<doc>`` element that contains both a ``<PARTI>`` element and a
    ``<texte>`` element yields one sample: the non-empty ``<p>`` paragraphs
    directly under ``<texte>`` are joined with single spaces, and the label
    is the ``valeur`` attribute of ``<PARTI>``.

    Args:
        file_path: Path of the UTF-8 encoded XML file.
        num: Optional cap on the number of samples returned. ``None`` (the
            default) returns everything; ``0`` returns two empty lists.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        xml_string = f.read()

    root = ET.fromstring(xml_string)
    texts = []
    labels = []

    for doc in root.findall('.//doc'):
        parti_elem = doc.find('.//PARTI')
        text_elem = doc.find('.//texte')
        # A document is only usable when it carries both a label and a body.
        if parti_elem is None or text_elem is None:
            continue
        # Paragraphs without text (p.text is None or '') are dropped.
        paragraphs = [p.text for p in text_elem.findall('p') if p.text]
        texts.append(' '.join(paragraphs))
        labels.append(parti_elem.get('valeur'))

    # Compare against None explicitly: a plain truthiness test would treat
    # num=0 as "no limit" and return the whole corpus.
    if num is None:
        return texts, labels
    return texts[:num], labels[:num]

def read_test_data(xml_file_path, ref_file_path, num=None):
    """Load test texts from an XML file and pair them with reference labels.

    The XML file supplies the text of each ``<doc>`` (keyed by its ``id``
    attribute); the reference file supplies the labels as tab-separated
    ``id<TAB>label`` lines. Samples are emitted in reference-file order, and
    only for ids that are present in the XML file.

    Args:
        xml_file_path: Path of the UTF-8 encoded XML file with the texts.
        ref_file_path: Path of the UTF-8 encoded tab-separated reference file.
        num: Optional cap on the number of samples returned. ``None`` (the
            default) returns everything; ``0`` returns two empty lists.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists.

    Raises:
        IndexError: If a reference line names a known id but has no label
            column.
    """
    id_to_texts = {}
    texts = []
    labels = []

    with open(xml_file_path, 'r', encoding="utf-8") as f:
        xml_string = f.read()

    root = ET.fromstring(xml_string)

    for doc in root.findall('.//doc'):
        text_elem = doc.find('.//texte')
        if text_elem is None:
            continue
        # Paragraphs without text (p.text is None or '') are dropped.
        paragraphs = [p.text for p in text_elem.findall('p') if p.text]
        id_to_texts[doc.get('id')] = ' '.join(paragraphs)

    with open(ref_file_path, 'r', encoding="utf-8") as f:
        # Stream the file line by line instead of loading and splitting it.
        for line in f:
            line_elem = line.strip().split("\t")
            # Blank lines and ids absent from the XML file are ignored.
            if line_elem[0] in id_to_texts:
                texts.append(id_to_texts[line_elem[0]])
                labels.append(line_elem[1])

    # Compare against None explicitly: a plain truthiness test would treat
    # num=0 as "no limit" and return the whole test set.
    if num is None:
        return texts, labels
    return texts[:num], labels[:num]

def build_corpus(train_texts, embedding_dim):
    """Build the token vocabulary and a matching embedding table.

    Args:
        train_texts: Iterable of token sequences.
        embedding_dim: Width of each embedding vector.

    Returns:
        ``(word_2_index, embedding)``: a dict mapping each token to an integer
        index (``"<PAD>"`` is 0, ``"<UNK>"`` is 1, further tokens numbered in
        order of first appearance) and a newly created ``nn.Embedding`` with
        one row per vocabulary entry.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for tokens in train_texts:
        for token in tokens:
            # Known tokens keep their index; unseen ones get the next free id.
            vocab.setdefault(token, len(vocab))
    embedding = nn.Embedding(len(vocab), embedding_dim)
    return vocab, embedding

In [194]:
class TextDataset(Dataset):
    """Dataset that serves token sequences as fixed-length index tensors.

    Each item is ``(text_idx, label)`` where ``text_idx`` has shape
    ``(1, max_len)``: the token sequence is truncated to ``max_len``, tokens
    missing from the vocabulary map to index 1, and the tail is padded with
    index 0. ``label`` is returned exactly as stored.
    """

    def __init__(self, texts, labels, word_2_index, max_len):
        self.texts, self.labels = texts, labels
        self.word_2_index = word_2_index
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        tokens = self.texts[index][:self.max_len]
        target = self.labels[index]

        # 1 is the fallback index for out-of-vocabulary tokens.
        ids = [self.word_2_index.get(token, 1) for token in tokens]
        # Right-pad with 0 up to max_len.
        ids.extend([0] * (self.max_len - len(ids)))

        # Wrapping in a list adds the leading axis of size 1.
        return torch.tensor([ids]), target

In [206]:
class Block(nn.Module):
    """One TextCNN branch: convolution, ReLU, then max pooling over positions.

    Args:
        kernel_s: Number of consecutive token positions each filter spans.
        embedding_dim: Width of the token embeddings; each filter spans the
            full embedding width.
        max_len: Sequence length of the inputs this block will receive.
        out_channels: Number of convolution filters. Defaults to 2.

    Raises:
        ValueError: If ``kernel_s`` is not in ``1..max_len`` (the pooling
            window would be empty).
    """

    def __init__(self, kernel_s, embedding_dim, max_len, out_channels=2):
        super().__init__()
        if not 1 <= kernel_s <= max_len:
            raise ValueError(
                f"kernel_s must be between 1 and max_len ({max_len}), got {kernel_s}"
            )
        # Input shape: batch * in_channel * max_len * emb_dim, e.g. 1 * 1 * 100 * 5.
        self.cnn = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_s, embedding_dim))
        self.act = nn.ReLU()
        # The convolution leaves max_len - kernel_s + 1 positions; pooling over
        # all of them keeps a single value per filter.
        self.mxp = nn.MaxPool1d(kernel_size=max_len - kernel_s + 1)

    def forward(self, batch_emb):
        """Return one pooled activation per filter for every sample.

        Args:
            batch_emb: Tensor of shape ``(batch, 1, max_len, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch, out_channels)``.
        """
        # Call the sub-modules themselves rather than their .forward methods
        # so that any registered hooks run.
        c = self.cnn(batch_emb)            # (batch, out_channels, positions, 1)
        a = self.act(c).squeeze(dim=-1)    # (batch, out_channels, positions)
        m = self.mxp(a)                    # (batch, out_channels, 1)
        return m.squeeze(dim=-1)
    
# Model definition
class TextCNN(nn.Module):
    """Text classifier built from three convolution branches and a linear layer.

    The branches use window sizes 2, 3 and 4. Their pooled outputs are
    concatenated and fed to a linear classifier.

    Args:
        emb_matrix: Embedding module; its weight's second dimension is taken
            as the embedding width.
        max_len: Sequence length of the inputs.
        class_num: Number of output classes.
    """

    def __init__(self, emb_matrix, max_len, class_num):
        super().__init__()
        self.emb_matrix = emb_matrix
        self.embedding_dim = self.emb_matrix.weight.shape[1]
        self.block1 = Block(2, self.embedding_dim, max_len)
        self.block2 = Block(3, self.embedding_dim, max_len)
        self.block3 = Block(4, self.embedding_dim, max_len)

        # Classifier input width = total number of pooled features, one per
        # convolution filter across the three branches (6 with 2 filters each).
        feature_dim = sum(
            block.cnn.out_channels
            for block in (self.block1, self.block2, self.block3)
        )
        self.classifier = nn.Linear(feature_dim, class_num)
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, batch_idx, batch_label=None):
        """Compute the training loss or the predicted classes.

        Args:
            batch_idx: Tensor of token indices for a batch.
            batch_label: Optional tensor of target class ids.

        Returns:
            The cross-entropy loss when ``batch_label`` is given, otherwise
            the arg-max class id for each sample.
        """
        batch_emb = self.emb_matrix(batch_idx)
        # Call the blocks themselves rather than their .forward methods so
        # that any registered hooks run.
        b1_result = self.block1(batch_emb)
        b2_result = self.block2(batch_emb)
        b3_result = self.block3(batch_emb)

        # batch_size * feature_dim: the max-pooled outputs of the three
        # convolution branches, concatenated along the feature axis.
        feature = torch.cat([b1_result, b2_result, b3_result], dim=1)
        prediction = self.classifier(feature)
        if batch_label is not None:
            return self.loss_fun(prediction, batch_label)
        return torch.argmax(prediction, dim=-1)

In [204]:
# Party acronym -> integer class id (ids follow the order of the list).
label_mapping = {
    party: class_id
    for class_id, party in enumerate(["ELDR", "GUE-NGL", "PPE-DE", "PSE", "Verts-ALE"])
}

train_texts, train_labels = read_train_data(
    "./Corpus d_apprentissage/deft09_parlement_appr_en.xml"
)
# Only the first 5000 referenced test documents are kept.
test_texts, test_labels = read_test_data(
    "./Corpus de test/deft09_parlement_test_en.xml",
    "./Données de référence/deft09_parlement_ref_en.txt",
    5000,
)

# Tokenize and lowercase every training document, then encode its label.
# train_texts must look like: [["I", "am", "taotao"], ["she", "is", "laolao"]]
# train_labels must look like: [1, 2, 0]
train_texts = [list(map(str.lower, nltk.word_tokenize(doc))) for doc in train_texts]
train_labels = [label_mapping[party] for party in train_labels]

# Same preprocessing for the test split.
test_texts = [list(map(str.lower, nltk.word_tokenize(doc))) for doc in test_texts]
test_labels = [label_mapping[party] for party in test_labels]

In [207]:
# Hyper-parameters and run configuration.
embedding_dim = 20
max_len = 200
# Size the output layer from the full label set, not from the labels that
# happen to occur in the loaded training subset, so every class id is valid.
class_num = len(label_mapping)
batch_size = 64
epoch = 50
lr = 0.01
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

word_2_index, words_embedding = build_corpus(train_texts, embedding_dim)

train_dataset = TextDataset(train_texts, train_labels, word_2_index, max_len=max_len)
# The DataLoader only splits train_dataset into batches; it does not modify
# the underlying data.
# NOTE(review): shuffle=False feeds the batches in the same order every epoch;
# consider shuffle=True for training. Left unchanged here.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)

test_dataset = TextDataset(test_texts, test_labels, word_2_index, max_len=max_len)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

model = TextCNN(words_embedding, max_len, class_num).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=lr)

for e in range(epoch):
    # ---- training pass ----
    model.train()
    for batch_idx, batch_label in train_loader:
        batch_idx = batch_idx.to(device)
        batch_label = batch_label.to(device)
        # Call the model itself (not .forward) so registered hooks run.
        loss = model(batch_idx, batch_label)
        loss.backward()
        opt.step()
        opt.zero_grad()

    # The value printed is the loss of the last mini-batch of the epoch.
    print(f"epoch: {e+1}; loss: {loss:.3f}")

    # ---- evaluation pass ----
    model.eval()
    right_num = 0
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for batch_idx, batch_label in test_loader:
            batch_idx = batch_idx.to(device)
            batch_label = batch_label.to(device)
            pre = model(batch_idx)
            right_num += int(torch.sum(pre == batch_label).item())

    print(f"accuracy: {right_num/len(test_dataset)*100:.2f}%")

epoch: 1; loss: 1.610
accuracy: 37.90%
epoch: 2; loss: 1.648
accuracy: 39.12%
epoch: 3; loss: 1.576
accuracy: 40.56%
epoch: 4; loss: 1.449
accuracy: 41.92%
epoch: 5; loss: 1.183
accuracy: 46.02%
epoch: 6; loss: 1.052
accuracy: 49.08%
epoch: 7; loss: 0.895
accuracy: 51.06%
epoch: 8; loss: 0.809
accuracy: 52.70%
epoch: 9; loss: 0.694
accuracy: 53.40%
epoch: 10; loss: 0.803
accuracy: 54.28%
epoch: 11; loss: 0.601
accuracy: 58.26%
epoch: 12; loss: 0.702
accuracy: 57.64%
epoch: 13; loss: 0.591
accuracy: 55.26%
epoch: 14; loss: 0.576
accuracy: 57.60%
epoch: 15; loss: 0.516
accuracy: 61.22%
epoch: 16; loss: 0.586
accuracy: 62.20%
epoch: 17; loss: 0.285
accuracy: 63.56%
epoch: 18; loss: 0.258
accuracy: 64.50%
epoch: 19; loss: 0.464
accuracy: 63.04%
epoch: 20; loss: 0.225
accuracy: 64.64%
epoch: 21; loss: 0.227
accuracy: 65.70%
epoch: 22; loss: 0.312
accuracy: 65.32%
epoch: 23; loss: 0.328
accuracy: 66.90%
epoch: 24; loss: 0.188
accuracy: 66.58%
epoch: 25; loss: 0.222
accuracy: 66.92%
epoch: 26

In [135]:
# Scratch check: look up the embedding rows for indices 1, 2 and 2; the
# repeated index should yield two identical rows.
words_embedding(torch.tensor([1,2,2]))

tensor([[-0.1740, -0.2303,  0.2380,  1.1727, -0.1217],
        [-0.2571, -0.6706,  1.2511, -0.4760, -0.9126],
        [-0.2571, -0.6706,  1.2511, -0.4760, -0.9126]],
       grad_fn=<EmbeddingBackward0>)