In [13]:
import jieba
import os
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt

import time
from datetime import timedelta
from tqdm.auto import tqdm, trange
from collections import Counter
import random

from nltk.tokenize import RegexpTokenizer
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.model_selection import KFold

In [42]:
data = pd.read_csv('use_data/data_3500_cleaned.csv')
data = data[['context', 'emotion']]
print('Original length', len(data))
data = data.dropna()
print('After dropping', len(data))
data['emotion'] = data['emotion'].astype('int64')
data.head()

Original length 3409
After dropping 3396


Unnamed: 0,context,emotion
0,冬奥 事 冰壶 比赛 充满 尖叫 摩擦 O 冬奥 事 冰壶 比赛 充满 尖...,0
1,包头 疫情 北京 冬奥 令人 一生 难忘 刷到 家门口 疫情,0
2,肖战 期待 冬奥 赛场 抹 中国 红 加油 加油,1
3,北京 冬奥会 闭幕式 冬奥 再见,0
4,北京 冬奥会 闭幕式 期待 下次 冬奥,1


In [46]:
stopwords = []
with open("data/stopwords.txt", "r", encoding="utf8") as f:
    for w in f:
        stopwords.append(w.strip())

In [None]:
# data_2 is used for binary classification
data_2 = data[(data['emotion'] == -1) | (data['emotion'] == 1)].reset_index()
data_2['emo'] = data_2['emotion'].map({-1:0, 1:1})
data_2 = data_2[['context','emo']]
print(len(data_2))

# data_3 is used for three-class classification
data_3 = data
data_3['emo'] = data_3['emotion'].map({-1:0, 0:1, 1:2})
data_3 = data_3[['context','emo']]
print(len(data_3))

dataset_2 = np.array(data_2)
dataset_3 = np.array(data_3)
dataset_3[90]

In [153]:
# Download the pre-trained word embedding from   https://github.com/Embedding/Chinese-Word-Vectors

In [44]:
# preprocess the input of the model
class Corpus:
    
    def __init__(self):

        
        self.comment_list = []

        self.word_to_index = {} # word to unique-id
        self.index_to_word = {} # unique-id to word

        # How many times each word occurs in our data after filtering
        self.word_counts = Counter()
        self.paired_num = 0  # to store how many words are paired in the pretrained word vectors
        
        self.word_embedd = np.array  #300 is the embedd dimension which is fixed here
        self.result = []
        self.data_train = []
        self.x_input = []

    def load_data(self, data_arr, stop_words):        

        # 1: Tokennize and do stop words removal
        for n in range(len(data_arr)):
            words = data_arr[n].split()
            # remove stopwords
            words = [w for w in words if w not in stop_words]
            self.comment_list.append(words)
            
        # Construct the whole word list and do stop words removing
        total_words = []
        for w in self.comment_list:
            total_words = total_words + w  # load the whole corpus
            
        self.word_counts = Counter(total_words)
            
        # 2: Creat word to id mapping
        word_list = list(self.word_counts.keys())
        for i in range(len(word_list)):
            self.word_to_index[word_list[i]] = i
            self.index_to_word[i] = word_list[i]
        
        self.word_to_index.update({'<UNK>': len(self.word_to_index), '<PAD>': len(self.word_to_index)+1})
        self.index_to_word.update({len(self.word_to_index): '<UNK>', len(self.word_to_index)+1 :'<PAD>'})
            
    def load_pre_trained_embedding(self, file_path, dimmension):
        self.word_embedd = np.random.rand(len(self.word_to_index), dimmension)  

        f = open(file_path, "r", encoding='UTF-8')
        for i, vec in enumerate(f.readlines()):
            vector = vec.strip().split(" ")
            if vector[0] in self.word_to_index:
                index = self.word_to_index[vector[0]]
                emb = [float(x) for x in vector[1:301]]  # extract the pretrained embedding
                self.paired_num += 1
                self.word_embedd[index] = np.asarray(emb, dtype='float32')                
        f.close()
        np.savez_compressed('pretrained_embedd', embeddings=self.word_embedd)
                
    
    def gen_dataset(self, label_arr, pad_size=24):

        for c in range(len(self.comment_list)):
            
            comment = self.comment_list[c]
            comment_id = []
            if len(comment) < pad_size:
                comment.extend(['<PAD>'] * (pad_size - len(comment)))
            else:
                comment = comment[:pad_size]
            # convert word to id    
            for w in comment:
                comment_id.append(self.word_to_index[w])
            # the format is [([1,2,3],2), ([2,3,4],0),,,,]    
            self.result.append((np.array(comment_id), np.array(int(label_arr[c]))))
        
            

In [47]:
weibo = Corpus()
weibo.load_data(dataset_2[:,0], stopwords)

weibo.load_pre_trained_embedding('data/sgns.weibo.char', 300)
weibo.gen_dataset(dataset_2[:, 1])

len(weibo.word_to_index)

In [80]:
total_data = weibo.result
total_data

In [82]:
len(total_data)

2324

In [57]:
class TextCNN(nn.Module):
    
    def __init__(self, embedding_dimen, sentence_len, num_filters, dropout):
        super(TextCNN, self).__init__()
        self.conv3 = nn.Conv2d(1, num_filters, (3, embedding_dimen))
        self.conv4 = nn.Conv2d(1, num_filters, (4, embedding_dimen))
        self.conv5 = nn.Conv2d(1, num_filters, (5, embedding_dimen))
        self.Max3_pool = nn.MaxPool2d((sentence_len-3+1, 1))
        self.Max4_pool = nn.MaxPool2d((sentence_len-4+1, 1))
        self.Max5_pool = nn.MaxPool2d((sentence_len-5+1, 1))
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(3*num_filters, 2)   # 2 is the number of class
        
        
    def forward(self, x):
        batch = x.shape[0]
        # Convolution
        x = x.unsqueeze(1)
        x1 = F.relu(self.conv3(x))
        x2 = F.relu(self.conv4(x))
        x3 = F.relu(self.conv5(x))

        # Pooling
        x1 = self.Max3_pool(x1)
        x2 = self.Max4_pool(x2)
        x3 = self.Max5_pool(x3)

        # capture and concatenate the features
        x = torch.cat((x1, x2, x3), -1)
        # print(x.shape)
        x = x.view(batch, 1, -1)
        
        x = self.dropout(x)

        # project the features to the labels
        x = self.fc(x)
        x = x.view(-1, 2)  # 3 is the number of the label

        return x


In [93]:
kf = KFold(n_splits=5, shuffle=False)

use_data = {}

num = 0
for train_index, test_index in kf.split(dataset_2):
    train = torch.utils.data.DataLoader(dataset=[total_data[i] for i in train_index],    # load the data
                                               batch_size=5, 
                                               shuffle=True)
    test = torch.utils.data.DataLoader(dataset=[total_data[i] for i in test_index],    # load the data
                                               batch_size=5, 
                                               shuffle=True)
    use_data[num] = [train, test]
    num +=1


In [103]:
learning_rate = 0.0001
num_epoch = 50

model = TextCNN(300, 24, 10, 0.5)
weight = torch.FloatTensor(weibo.word_embedd)
embeds = nn.Embedding.from_pretrained(weight)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

count = 0
loss_sum = 0

for epoch in tqdm(range(num_epoch)):
    for data, label in use_data[3][0]:  # use_data[0][0] is the train part
        
        input_data = embeds(data)
        out = model(input_data)
        loss = criterion(out, label)
        
        loss_sum += loss
        count += 1

        if count % 100 == 0:
            print("epoch", epoch, end='  ')
            print("The loss is: %.5f" % (loss_sum/1000))

            loss_sum = 0
            count = 0

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

  0%|          | 0/50 [00:00<?, ?it/s]

epoch 0  The loss is: 0.05943
epoch 0  The loss is: 0.05160
epoch 0  The loss is: 0.05507
epoch 1  The loss is: 0.05049
epoch 1  The loss is: 0.04886
epoch 1  The loss is: 0.04874
epoch 1  The loss is: 0.04822
epoch 2  The loss is: 0.04588
epoch 2  The loss is: 0.04766
epoch 2  The loss is: 0.04406
epoch 2  The loss is: 0.04338
epoch 3  The loss is: 0.04231
epoch 3  The loss is: 0.03998
epoch 3  The loss is: 0.03761
epoch 4  The loss is: 0.04082
epoch 4  The loss is: 0.03876
epoch 4  The loss is: 0.03765
epoch 4  The loss is: 0.03536
epoch 5  The loss is: 0.03468
epoch 5  The loss is: 0.03733
epoch 5  The loss is: 0.03121
epoch 5  The loss is: 0.03454
epoch 6  The loss is: 0.03572
epoch 6  The loss is: 0.03188
epoch 6  The loss is: 0.03199
epoch 6  The loss is: 0.03287
epoch 7  The loss is: 0.03004
epoch 7  The loss is: 0.03169
epoch 7  The loss is: 0.03001
epoch 8  The loss is: 0.03112
epoch 8  The loss is: 0.03038
epoch 8  The loss is: 0.02895
epoch 8  The loss is: 0.03095
epoch 9  T

In [104]:
model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
with torch.no_grad():
    correct = 0
    total = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for texts, labels in use_data[3][1]:
        if total == 0:
            print(texts.shape)
        outputs = model(embeds(texts))
        _, predicted = torch.max(outputs.data, 1)  # the location of the max outputs
        total += labels.size(0)
        correct += (predicted == labels.data).sum()
        TP += ((predicted == 1) & (labels.data == 1)).sum()
        TN += ((predicted == 0) & (labels.data == 0)).sum()
        FN += ((predicted == 0) & (labels.data == 1)).sum()
        FP += ((predicted == 1) & (labels.data == 0)).sum()
        
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2 * recall * precision / (recall + precision)
    print('Accuracy: {} %'.format(100 * correct / total))
    print('Precision: {} %'.format(100 * precision))
    print('Recall: {} %'.format(100 * recall))
    print('F1 Score: {} %'.format(100 * F1))

torch.Size([5, 24])
Accuracy: 84.08602142333984 %
Precision: 83.07292175292969 %
Recall: 97.256103515625 %
F1 Score: 89.60675048828125 %


## Multiclass Classification

In [105]:
weibo_3 = Corpus()
weibo_3.load_data(dataset_3[:,0], stopwords)
weibo_3.load_pre_trained_embedding('data/sgns.weibo.char', 300)
weibo_3.gen_dataset(dataset_3[:, 1])

In [108]:
print(len(weibo_3.word_to_index))
print(weibo_3.paired_num)

14346
10807


In [109]:
total_data_3 = weibo_3.result
print(len(total_data_3))

3396


In [117]:
class TextCNN_3(nn.Module):
    
    def __init__(self, embedding_dimen, sentence_len, num_filters, dropout):
        super(TextCNN_3, self).__init__()
        self.conv3 = nn.Conv2d(1, num_filters, (3, embedding_dimen))
        self.conv4 = nn.Conv2d(1, num_filters, (4, embedding_dimen))
        self.conv5 = nn.Conv2d(1, num_filters, (5, embedding_dimen))
        self.Max3_pool = nn.MaxPool2d((sentence_len-3+1, 1))
        self.Max4_pool = nn.MaxPool2d((sentence_len-4+1, 1))
        self.Max5_pool = nn.MaxPool2d((sentence_len-5+1, 1))
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(3*num_filters, 3)   # 3 is the number of class
        
        
    def forward(self, x):
        batch = x.shape[0]
        # Convolution
        x = x.unsqueeze(1)
        x1 = F.relu(self.conv3(x))
        x2 = F.relu(self.conv4(x))
        x3 = F.relu(self.conv5(x))

        # Pooling
        x1 = self.Max3_pool(x1)
        x2 = self.Max4_pool(x2)
        x3 = self.Max5_pool(x3)

        # capture and concatenate the features
        x = torch.cat((x1, x2, x3), -1)
        # print(x.shape)
        x = x.view(batch, 1, -1)
        
        x = self.dropout(x)

        # project the features to the labels
        x = self.fc(x)
        x = x.view(-1, 3)  # 3 is the number of the label

        return x

In [118]:
kf = KFold(n_splits=5, shuffle=False)

use_data = {}

num = 0
for train_index, test_index in kf.split(dataset_3):
    train = torch.utils.data.DataLoader(dataset=[total_data_3[i] for i in train_index],    # load the data
                                               batch_size=5, 
                                               shuffle=True)
    test = torch.utils.data.DataLoader(dataset=[total_data_3[i] for i in test_index],    # load the data
                                               batch_size=5, 
                                               shuffle=True)
    use_data[num] = [train, test]
    num +=1

In [140]:
learning_rate = 0.0001
num_epoch = 50

model = TextCNN_3(300, 24, 10, 0.5)
weight = torch.FloatTensor(weibo_3.word_embedd)
embeds = nn.Embedding.from_pretrained(weight)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

count = 0
loss_sum = 0

for epoch in tqdm(range(num_epoch)):
    for data, label in use_data[4][0]:  # use_data[0][0] is the train part
        
        input_data = embeds(data)
        out = model(input_data)
        loss = criterion(out, label)
        
        loss_sum += loss
        count += 1

        if count % 100 == 0:
            print("epoch", epoch, end='  ')
            print("The loss is: %.5f" % (loss_sum/1000))

            loss_sum = 0
            count = 0

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

  0%|          | 0/50 [00:00<?, ?it/s]

epoch 0  The loss is: 0.10319
epoch 0  The loss is: 0.10352
epoch 0  The loss is: 0.09857
epoch 0  The loss is: 0.09802
epoch 0  The loss is: 0.09805
epoch 1  The loss is: 0.09147
epoch 1  The loss is: 0.09765
epoch 1  The loss is: 0.09166
epoch 1  The loss is: 0.09193
epoch 1  The loss is: 0.08736
epoch 2  The loss is: 0.09291
epoch 2  The loss is: 0.08840
epoch 2  The loss is: 0.08491
epoch 2  The loss is: 0.09080
epoch 2  The loss is: 0.08640
epoch 2  The loss is: 0.08417
epoch 3  The loss is: 0.08484
epoch 3  The loss is: 0.08260
epoch 3  The loss is: 0.08479
epoch 3  The loss is: 0.08082
epoch 3  The loss is: 0.08183
epoch 4  The loss is: 0.08245
epoch 4  The loss is: 0.08237
epoch 4  The loss is: 0.08427
epoch 4  The loss is: 0.07805
epoch 4  The loss is: 0.07967
epoch 4  The loss is: 0.07512
epoch 5  The loss is: 0.07582
epoch 5  The loss is: 0.08041
epoch 5  The loss is: 0.08039
epoch 5  The loss is: 0.07411
epoch 5  The loss is: 0.07917
epoch 6  The loss is: 0.07535
epoch 6  T

In [141]:
model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)

with torch.no_grad():
    correct = 0
    total = 0
    TP_0 = 0
    TN_0 = 0
    FP_0 = 0
    FN_0 = 0    
    TP_1 = 0
    TN_1 = 0
    FP_1 = 0
    FN_1 = 0
    TP_2 = 0
    TN_2 = 0
    FP_2 = 0
    FN_2 = 0
    for texts, labels in use_data[4][1]:
        if total == 0:
            print(texts.shape)
        outputs = model(embeds(texts))
        _, predicted = torch.max(outputs.data, 1)  # the location of the max outputs
        total += labels.size(0)
        correct += (predicted == labels.data).sum()
        TP_0 += ((predicted == 0) & (labels.data == 0)).sum()
        TN_0 += ((predicted != 0) & (labels.data != 0)).sum()
        FN_0 += ((predicted != 0) & (labels.data == 0)).sum()
        FP_0 += ((predicted == 0) & (labels.data != 0)).sum()
        
        TP_1 += ((predicted == 1) & (labels.data == 1)).sum()
        TN_1 += ((predicted != 1) & (labels.data != 1)).sum()
        FN_1 += ((predicted != 1) & (labels.data == 1)).sum()
        FP_1 += ((predicted == 1) & (labels.data != 1)).sum()
        
        TP_2 += ((predicted == 2) & (labels.data == 2)).sum()
        TN_2 += ((predicted != 2) & (labels.data != 2)).sum()
        FN_2 += ((predicted != 2) & (labels.data == 2)).sum()
        FP_2 += ((predicted == 2) & (labels.data != 2)).sum()
        
    precision_0 = TP_0 / (TP_0 + FP_0)
    precision_1 = TP_1 / (TP_1 + FP_1)   
    precision_2 = TP_2 / (TP_2 + FP_2)    
    recall_0 = TP_0 / (TP_0 + FN_0)
    recall_1 = TP_1 / (TP_1 + FN_1)  
    recall_2 = TP_2 / (TP_2 + FN_2)
    
    F1_0 = 2 * recall_0 * precision_0 / (recall_0 + precision_0)
    F1_1 = 2 * recall_1 * precision_1 / (recall_1 + precision_1)
    F1_2 = 2 * recall_2 * precision_2 / (recall_2 + precision_2)
    print('Accuracy: {} %'.format(100 * correct / total))
    print('Precision-macro: {} %'.format(100 * (precision_0+precision_1+precision_2)/3))
    print('Recall-macro: {} %'.format(100 * (recall_0+recall_1+recall_2)/3))
    print('F1 Score-macro: {} %'.format(100 * (F1_0+F1_1+F1_2)/3))

torch.Size([5, 24])
Accuracy: 68.7776107788086 %
Precision-macro: 66.75790405273438 %
Recall-macro: 66.07547760009766 %
F1 Score-macro: 65.99923706054688 %


In [151]:
print('Binary classification:')
print('average accuracy of cross validation:', (85.59140014648438 + 87.74193572998047 + 87.95698547363281 + 89.22413635253906 + 84.08602142333984)/5,'%')
print('average F1 score macro of cross validation:', (90.5233383178711+92.20245361328125+92.02279663085938+92.87748718261719+89.60675048828125)/5, '%')

Binary classification:
average accuracy of cross validation: 86.92009582519532 %
average F1 score macro of cross validation: 91.44656524658203 %


In [152]:
print('3-Class classification:')
print('average accuracy of cross validation:', (66.47058868408203 + 64.65390014648438+ 67.89395904541016 + 66.42121124267578 + 68.7776107788086)/5,'%')
print('average F1 score macro of cross validation:', (61.498905181884766 + 59.06038284301758 + 63.84635925292969 + 62.752620697021484 + 65.99923706054688)/5, '%')

3-Class classification:
average accuracy of cross validation: 66.84345397949218 %
average F1 score macro of cross validation: 62.631501007080075 %
