In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import jieba
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from gensim.models import Word2Vec
from torch.nn.utils.rnn import pad_sequence
from gensim.corpora.dictionary import Dictionary
from sklearn.model_selection import train_test_split
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.simplefilter('ignore')


In [51]:
data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)
cn_stopwords = pd.read_csv('https://mirror.coggle.club/stopwords/baidu_stopwords.txt', header=None)[0].values

le = LabelEncoder()
train_data[1] = le.fit_transform(train_data[1])

train_data['text'] = train_data[0]
train_data['label'] = train_data[1]
train_data.drop(columns=[0, 1], inplace=True)

test_data['text'] = test_data[0]
test_data.drop(columns=[0], inplace=True)

In [52]:
corpus = train_data['text']

texts = []

for i in range(len(corpus)):
    content = ''.join(corpus.iloc[i])
    words = jieba.lcut(content)
    words = [word for word in words if word not in cn_stopwords]
    texts.append(words)    

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

In [148]:
vocab_dim = 200
hidden_dim = 50
output_dim = 12
dropout = 0.1
batch_size = 32
num_layers = 1
max_length = 20

In [149]:
word2vector_model = Word2Vec.load("word2vec_model.model")
word2vector_model.train(processed_corpus, total_examples=len(processed_corpus), epochs=10)

(590231, 670700)

In [150]:
def create_dictionaries(model, corpus, max_length):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.index_to_key, allow_update=True)
    w2indx = {v: k+1 for k, v in gensim_dict.items()}
    w2vec = {word: model.wv[word] for word in w2indx.keys()}
    
    def parse_dataset(corpus):
        data = []
        for sentence in corpus:
            new_txt = []
            for word in sentence:
                try:
                    new_txt.append(w2indx[word])
                except:
                    new_txt.append(0)
            new_txt = torch.tensor(new_txt)
            data.append(new_txt)
        return data
    
    corpus = parse_dataset(corpus)
    corpus = pad_sequence(corpus)[:max_length, :]
    return w2indx, w2vec, corpus

In [151]:
def word2vec_train(corpus):
    index_dict, word_vectors, conbinds = create_dictionaries(model=word2vector_model, corpus=corpus, max_length=max_length)
    return index_dict, word_vectors, conbinds

In [152]:
def get_data(index_dict, word_vectors, corpus, y):
    n_symbols = len(index_dict) + 1
    embedding_weight = np.zeros((n_symbols, vocab_dim))
    
    for word, index in index_dict.items():
        embedding_weight[index, :] = word_vectors[word]
    X_train, X_val, y_train, y_val = train_test_split(corpus, y, test_size=0.2)
    
    return n_symbols, embedding_weight, X_train, X_val, y_train, y_val

In [153]:
index_dict, word_vectors, conbinds = word2vec_train(processed_corpus)
vocab_size, embedding_weight, X_train, X_val, y_train, y_val = get_data(index_dict, word_vectors, conbinds.T, train_data['label'])

In [154]:
def convert_hidden_shape(hidden, batch_size):
    tensor_list = []

    for i in range(batch_size):
        ts = hidden[i,: , :].reshape(1, -1)
        tensor_list.append(ts)

    ts = torch.cat(tensor_list)
    return ts

In [155]:
# Define a custom LSTM model
class LSTMmodel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, dropout, output_size, num_layers, max_length):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, dropout=dropout, num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size * max_length, 128)
        self.fc2 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.embedding(x)
        out, _ = self.lstm(out)
        out = convert_hidden_shape(out, out.shape[0])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        
        return out

In [156]:
model = LSTMmodel(vocab_size, vocab_dim, hidden_dim, dropout, output_dim, num_layers=num_layers, max_length=max_length)

In [157]:
model.embedding.weight.data.copy_(torch.from_numpy(embedding_weight))

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-1.1459,  0.4482,  0.5943,  ..., -0.2281,  0.3203,  0.2932],
        [-0.0192,  0.0020,  0.0376,  ..., -0.0782, -0.0137, -0.0263],
        ...,
        [-0.0625, -0.0014,  0.0504,  ..., -0.0599,  0.0166,  0.0681],
        [-0.2156,  0.1499, -0.5560,  ..., -0.2428, -1.3503, -0.0564],
        [ 0.3185, -0.0058, -0.2687,  ..., -0.3950, -0.8556,  0.6132]])

In [158]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [159]:
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)
# 训练模型

for epoch in range(15):
    model.train()
    total_loss = 0
    for i in range(0, len(X_train) - batch_size, batch_size):
        optimizer.zero_grad()
        x_batch = X_train[i:i+batch_size]
        y_batch_tensor = y_train_tensor[i:i+batch_size]
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch_tensor)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    model.eval()
    with torch.no_grad():
        outputs = model(X_train)
        _, predicted = torch.max(outputs.data, 1)
        train_accuracy = (predicted == y_train_tensor).sum().item() / len(y_train_tensor)
        
        
        outputs = model(X_val)
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == y_val_tensor).sum().item() / len(y_val_tensor)
        
        print(f'Epoch {epoch+1}, Loss: {total_loss / X_train.shape[0]}', f"Train Accuracy:{train_accuracy}", f'Validation Accuracy: {accuracy}')
        
model.eval()
with torch.no_grad():
    outputs = model(X_val)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == y_val_tensor).sum().item() / len(y_val_tensor)
    print(f'Validation Accuracy: {accuracy}')

Epoch 1, Loss: 0.01570174281662407 Train Accuracy:0.9360537190082645 Validation Accuracy: 0.8966942148760331
Epoch 2, Loss: 0.007125678287062809 Train Accuracy:0.9574380165289256 Validation Accuracy: 0.8946280991735537
Epoch 3, Loss: 0.004594638801472308 Train Accuracy:0.9630165289256198 Validation Accuracy: 0.8896694214876033
Epoch 4, Loss: 0.003986751025128669 Train Accuracy:0.9727272727272728 Validation Accuracy: 0.8909090909090909
Epoch 5, Loss: 0.003563374232316698 Train Accuracy:0.9727272727272728 Validation Accuracy: 0.8805785123966943
Epoch 6, Loss: 0.0030818681416604696 Train Accuracy:0.9808884297520661 Validation Accuracy: 0.8830578512396694
Epoch 7, Loss: 0.002454325674465843 Train Accuracy:0.984504132231405 Validation Accuracy: 0.890495867768595
Epoch 8, Loss: 0.0018688649398849613 Train Accuracy:0.9865702479338843 Validation Accuracy: 0.8876033057851239
Epoch 9, Loss: 0.002112989450033923 Train Accuracy:0.9869834710743801 Validation Accuracy: 0.8921487603305785
Epoch 10, L