In [1]:
#过滤版本差异产生的warning
import warnings
warnings.filterwarnings('ignore')
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F


In [2]:
def load_training_data(path='training_label.txt'):
    #定义读取training所需的数据
    #如果是'training_label.txt'，就读取label如果是'training_nolabel.txt'不需要读取label
    if 'training_label' in path:
        with open(path,'r',encoding = 'UTF-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split(' ') for line in lines]
        #每行按空格分割后，第二个符号之后都是句子的单词
        x = [line[2:] for line in lines]
        # 每行按空格分割后，第0个字符是label
        y = [line[0] for line in lines]
        return x,y
    else:
        with open(path,'r',encoding = 'UTF-8') as f:
            lines = f.readlines()
            x = [line.strip('\n').split(' ') for line in lines]
        return x#因为这里的数据没有label，返回x就可以
    
    
def load_testing_data(path='testing_data'):
    with open(path,'r',encoding='UTF-8')as f:
        lines = f.readlines()
        # 第0行是表头，第一行开始是数据
        #第0列是id，第一列是文本，按逗号分割，需要逗号之后的文本
        X = [''.join(line.strip('\n').split(',')[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X ]
    return X
    
    
def evalution(outputs,labels):
    #outputs 预测值概率(float)
    #labels 真实值标签（0，1）
    outputs[outputs>=0.5] = 1#大于等于0.5为正面
    outputs[outputs<=0.5] = 0
    accuracy = torch.sum(torch.eq(outputs,labels)).item()
    return accuracy
        
        
            
        

In [3]:
#word 2 vector training与testing的每个单词都分别变成词向量


#class gensim.models.word2vec.Word2Vec(
    #sentences=None,
    #size=100,# 词向量的维度
    #alpha=0.025,#模型初始的学习率
    #window=5,#在一个句子中，当前词于预测词在一个句子中的最大距离
    #min_count=5,# 用于过滤操作，词频小于该次数的单词会被丢弃
    #max_vocab_size=None,#设置词向量构建期间的RAM限制
    #smaple=0.001,#高频词汇的随机降采样的培育阈值
    #seed=1,#用于随即数发生器
    #workers=3,#控制训练的并行数量
    #min_alpha=0.0001,#随着训练进行，alpha线性下降到min
    #sg=0,#用来设置训练算法，sg=0，CBOW算法，1 skip-gram算法
    #hs=0,#设置为1 会采用hierarchica softmax 如果为0会使用negative sampling
    #negative=5,#noise words的数量
    #cbox_mean=1,#在CBOW算法中，这个值为0，采用上下文词向量的总和，设置为1就采用均值
    #hashfxn=<built-in function hash>,
    #iters=5,#算法迭代次数 
    #null_word=0,
    #trim_rule=None,
    #sorted_vocab=1,#如果这个值为1，则在分配word index会对单词基于频率降序排列
    #batch_words=10000,#每次批处理给线程传递的单词数量
    #compute_loss=False
#)


In [4]:
from gensim.models import Word2Vec
def train_word2vec(x):
    # 训练word to vector的word embedding
    #window：滑动窗口的大小，min_count
    model = Word2Vec(x, vector_size=250, window=5, min_count=5, workers=12, epochs=10, sg=1)
    return model

# 读取training数据
print('Loading training data...')
train_x,y = load_training_data('training_label.txt')
train_x_no_label = load_training_data('training_nolabel.txt')

#读取testing数据
print("Loading testing data...")
test_x = load_testing_data('testing_data.txt')

# 把training中的word变成vector
model = train_word2vec(train_x + train_x_no_label + test_x)#w2v

#保存vector
print('Saving model')
model.save('w2v.model')
model.save('w2v_all.model')
    

Loading training data...
Loading testing data...
Saving model


In [None]:
def add_label(outputs,threshold=0.9):
    id = (outputs>=threshold)|(outputs<1-threshold)
    outputs[outputs>=threshold] = 1
    outputs[outputs<1-threshold] = 0
    return outputs.long(),id

In [5]:
#数据预处理
# 定义一个预处理的类
class Preprocess():
    def __init__(self,sentences,sen_len,w2v_path):
        self.w2v_path = w2v_path#word2vec的存储路径
        self.sentences = sentences#句子
        self.sen_len = sen_len#句子的固定长度
        self.idx2word = []# 列表
        self.word2idx = {}# 存储单词在idx2word的下标
        self.embedding_matrix = []#存储词嵌入的向量的列表
    
    def get_w2v_model(self):
        #读取之前训练好的word2vec
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
        
    def add_embedding(self,word):
        #这里的word只会是‘<PAD>''<UNK>'
        #把一个随机生成的表征向量vector作为上面的嵌入
        vector = torch.empty(1,self.embedding_dim)#创建一个未被初始化数值的tensor,tensor的大小是由size确定 
        torch.nn.init.uniform_(vector)#从均匀分布U(a, b)中生成值，填充输入的张量或变量
        #它的index是word2index这个词典的长度，即最后一个
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix,vector],0)#拼接向量，按行数拼接
        
    def make_embedding(self,load=True):
        print('Get embedding...')
        #获取训练好的Word2vec word embedding
        if load:
            print('Loading word to vec model...')
            self.get_w2v_model()
        else:
            raise NotImplementError
        #遍历嵌入后的单词
        for i,word in enumerate(self.embedding.wv.key_to_index):
            print('get words #{}'.format(i+1),end='\r')
            # 新加入的word的index是word2idx这个词典的长度，即最后一个
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding.wv[word])
        print('')
        #把embedding_matrix变形为tensor
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        #将<PAD> <UNK>加入embedding
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words:{}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    
    def pad_sequence(self,sentence):
        #将每个句子变成一样的长度,sen_len length
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            #如果句子长度小于sen_len的长度，就补PAD，缺多少单词补多少PAD
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence
    
    def sentence_word2idx(self):
        #把句子里面的字变成相应的index
        sentence_list = []
        for i,sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1),end='\r')
            sentence_idx = []
            for word in sen:
                if(word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            #将每个句子变成一样的长度
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    
    def labels_to_tensor(self,y):
        #把labels转成tensor
        y = [int(label) for label in y]
        return torch.LongTensor(y)
       

In [6]:
from torch.utils.data import DataLoader,Dataset

class TwitterDataset(Dataset):
    #Expected Data shape like :(data_num,data_len)
    #Data can be a list of numpy array or a list of lists
    #input data shape:(data_num,seq_len,feature_dim)
    def __init__(self,X,y):
        self.data = X
        self.label = y
    def __getitem__(self,idx):
        if self.label is None:return self.data[idx]
        return self.data[idx],self.label[idx]
    def __len__(self):
        return len(self.data)
    
    




In [7]:
# define LSTM model
from torch import nn

class LSTM_Net(nn.Module):
    def __init__(self,embedding,embedding_dim,hidden_dim,num_layers,dropout=0.5,fix_embedding=True):
        #其中参数input_size 输入数据的特征维数，通常就是embedding_dim(词向量的维度)hidden_size　LSTM中隐层的维度num_layers　循环神经网络的层数batch_first通常输入的数据shape=(batch_size,seq_length,embedding_dim),而batch_first默认是False,此时送进LSTM之前需要将batch_size与seq_length这两个维度调换
        super(LSTM_Net,self).__init__()
        #embedding layer
        
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))# embedding.size(0)词典的大小尺寸，embedding.size(1)嵌入向量的维度
        self.embedding.weight = torch.nn.Parameter(embedding)
        # 是否将embedding固定住，如果fix_embedding为False，在训练过程中embedding也会跟着被训练
        self.embedding.weight.requires_grad = False if fix_embedding else True#self.embedding.weight.requires_grad 指定是否在训练过程中对词向量的权重进行微调
        self.embedding_dim = embedding.size(1)#输入的特征维数 词向量的维度
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,num_layers=num_layers,batch_first = True)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim,1),
            nn.Sigmoid()
        )
    
    def forward(self,inputs):
        inputs = self.embedding(inputs)
        x,_ = self.lstm(inputs,None)
        #x的dimension（batch,seq_len,hidden-size)
        #取用LSTM最后一层的hidden state丢到分类器中
        x = x[:,-1,:]#最后一层
        x = self.classifier(x)
        return x
        
        
        

In [8]:
#training
def training(batch_size,n_epoch,lr,train,valid,model,device):
    #输出模型总的参数数量，可训练的参数数量 
    total = sum(p.numel() for p in model.parameters())#numel函数用来获取tensor的元素数量
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\n start training,parameters total:{},trainable:{}\n'.format(total,trainable))
    
    loss = nn.BCELoss()#定义损失函数为二元交叉熵损失 binary cross entropy loss 这个损失会使输入接近1的输出接近1，输入接近0输出接近0
    t_batch = len(train)#training数据的batch size大小
    v_batch = len(valid)#validation数据的batch size大小
    optimizer = optim.Adam(model.parameters(),lr=lr)#Adam优化器 
    total_loss,total_acc,best_acc = 0,0,0
    for epoch in range(n_epoch):
        total_loss,total_acc = 0,0
        
        #training
        model.train()#model的模式设为train，这样optimizer可以更新model的参数
        for i,(inputs,labels) in enumerate(train):
            inputs = inputs.to(device,dtype=torch.long)# 因为device为‘cuda' inputs tranform to torch.cuda.LongTensor
            labels = labels.to(device,dtype=torch.float)#因为device为‘cuda' inputs tranform to torch.cuda.FloatTensor
            optimizer.zero_grad()
            outputs = model(inputs)#模型输入inputs 输出outputs
            outputs = outputs.squeeze()#去掉最外面的dimension，好让outputs进入loss（） squeeze()可以用来去除维度，仅在维度为1时有效
            batch_loss = loss(outputs,labels)# 计算模型此时的trainingloss
            batch_loss.backward()# 计算loss的gradient
            optimizer.step()
            #计算模型此时的training accuracy
            accuracy = evalution(outputs,labels)
            total_acc += (accuracy/batch_size)
            total_loss += batch_loss.item()
        print('Epoch|{}/{}'.format(epoch+1,n_epoch))
        print("Train|loss:{:.5f}Acc:{:.3f}".format(total_loss/t_batch,total_acc/t_batch*100))
        
        
        
        #validtion
        model.eval()#将model的模式设为eval，这样可以固定model的参数训练完train样本后，生成的模型model要用来测试样本。在model(test)之前，需要加上model.eval()
        with torch.no_grad():
            total_loss,total_acc =0,0
            for i,(inputs,labels) in enumerate(valid):
                inputs = inputs.to(device,dtype=torch.long)# 因为device为‘cuda' inputs tranform to torch.cuda.LongTensor
                labels = labels.to(device,dtype=torch.float)#因为device为‘cuda' inputs tranform to torch.cuda.FloatTensor
                outputs = model(inputs)#模型输入inputs 输出outputs
                outputs = outputs.squeeze()#去掉最外面的dimension，好让outputs进入loss（） squeeze()可以用来去除维度，仅在维度为1时有效
                batch_loss = loss(outputs,labels)# 计算模型此时的trainingloss
                accuracy = evalution(outputs,labels)
                total_acc += (accuracy/batch_size)
                total_loss += batch_loss.item()
            print("Valid|loss:{:.5f}Acc:{:.3f}".format(total_loss/v_batch,total_acc/v_batch*100))
            if total_acc>best_acc:
                #如果validation的结果优于之前所有的结果，就把当下的模型保存下来，用于之后的testing
                best_acc=total_acc
                torch.save(model,'ckpt.model')
        print('-----------------------------------------')
    

In [9]:
from sklearn.model_selection import train_test_split
device = torch.device('cuda')
#定义句子长度，要不要固定embedding，batch size，epoch，lr，w2vpath
sen_len = 20
fix_embedding = True#fix embedding during training
batch_size = 128
epoch = 5
lr = 0.001
w2v_path = 'w2v_all.model'
print("Loading data")#读取’training_label.txt','training_nolabel.txt'
trainx,y = load_training_data('training_label.txt')
train_x_no_label = load_training_data('training_nolabel.txt')

#对input与label做预处理
preprocess = Preprocess(train_x,sen_len,w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)# 以上函数的定义都可以在Preprocess找到


#定义模型
model = LSTM_Net(embedding,embedding_dim=250,hidden_dim=150,num_layers=1,dropout=0.5,fix_embedding = fix_embedding)
model = model.to(device)#device = cuda

#把data分为training data validation data
X_train,X_val,y_train,y_val = train_test_split(train_x,y,test_size =0.1,random_state=1,stratify = y)#验证集占比10%，分割方式按照y的分类方式
print('Train|Len:{}\nValid |Len:{}'.format(len(y_train),len(y_val)))

#把data做成dataset供dataloader取用
train_dataset = TwitterDataset(X=X_train,y=y_train)
val_dataset = TwitterDataset(X=X_val,y=y_val)

#data转换为batch of tensors
train_loader = DataLoader(train_dataset,batch_size = batch_size,shuffle=True,num_workers = 0)
val_loader = DataLoader(val_dataset,batch_size = batch_size,shuffle = False,num_workers = 0)

training(batch_size,epoch,lr,train_loader,val_loader,model,device)
      

Loading data
Get embedding...
Loading word to vec model...
get words #24694
total words:24696
Train|Len:180000200000
Valid |Len:20000

 start training,parameters total:6415351,trainable:241351

Epoch|1/5
Train|loss:0.49745Acc:74.985
Valid|loss:0.45371Acc:78.120
-----------------------------------------
Epoch|2/5
Train|loss:0.44246Acc:79.167
Valid|loss:0.43972Acc:78.707
-----------------------------------------
Epoch|3/5
Train|loss:0.42706Acc:80.112
Valid|loss:0.42761Acc:79.454
-----------------------------------------
Epoch|4/5
Train|loss:0.41419Acc:80.871
Valid|loss:0.42308Acc:79.837
-----------------------------------------
Epoch|5/5
Train|loss:0.40315Acc:81.485
Valid|loss:0.42105Acc:79.752
-----------------------------------------


In [12]:
def testing(batch_size,test_loader,model,device):
    model.eval()
    ret_output= []
    with torch.no_grad():
        for i,inputs in enumerate(test_loader):
            inputs = inputs.to(device,dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1#大于等于0.5为正面
            outputs[outputs<0.5]= 0
            ret_output += outputs.int().tolist()
            
    return ret_output
            

In [13]:
# 测试模型并作预测

# 读取测试数据test_x
print("loading testing data ...")
test_x = load_testing_data('testing_data.txt')
# 对test_x作预处理
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False, num_workers = 0)

# 读取模型
print('\nload model ...')
model = torch.load('ckpt.model')
# 测试模型
outputs = testing(batch_size, test_loader, model, device)

# 保存为 csv 
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv('predict RNN emtion.csv', index=False)
print("Finish Predicting")


loading testing data ...
Get embedding...
Loading word to vec model...
get words #24694
total words:24696
sentence count #200000
load model ...
save csv ...
Finish Predicting
