In [203]:
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data

In [204]:
#hyperparameters
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
min_freq=10
max_len=50
batch_size=100
embed_size=300
learningrate=0.0001
dropout=0.5

In [205]:
emotionlist=['sadness','joy','love','anger','fear','surprise']
emotiondict={emotionlist[i]:i for i in range(len(emotionlist))}
def idx2emotion(idx):
    if not isinstance(idx,(tuple,list)):
        return emotionlist[idx]
    return [idx2emotion(i) for i in idx]
def emotion2idx(emotion):
    if not isinstance(emotion,(tuple,list)):
        return emotiondict[emotion]
    return [emotion2idx(i) for i in emotion]
print(idx2emotion([1,4,3,2]))
print(emotion2idx(['sadness','joy']))

['joy', 'fear', 'anger', 'love']
[0, 1]


In [206]:
df = pd.read_pickle("merged_training.pkl")
df=df.reset_index(drop=True)
print(len(df))
for i in emotionlist:
    print(f'{i}:{len(df[df.emotions==i])/len(df)}')

416809
sadness:0.2907494799776396
joy:0.3384451871240784
love:0.0829012809224369
anger:0.13751382527728528
fear:0.11446969715145307
surprise:0.03592052954710671


In [207]:
class Vocab():
    def __init__(self,tokens,min_freq=0):
        self.token_freq = {}
        for sentence in tokens:
            for token in sentence.split():
                if token not in self.token_freq:
                    self.token_freq[token] = 1
                else:
                    self.token_freq[token] += 1
        self.itos = ["<pad>","<unk>"]
        for token,freq in self.token_freq.items():
            if freq >= min_freq:
                self.itos.append(token)
        self.itos[2:]=sorted(self.itos[2:],key=lambda x:self.token_freq[x],reverse=True)
        self.stoi = {token:idx for idx,token in enumerate(self.itos)}
    def __len__(self):
        return len(self.itos)
    def __getitem__(self,idx):
        if not isinstance(idx,(list,tuple)):
            return self.itos[idx] if idx!=0 else ''
        return [self.__getitem__(i) for i in idx]
    def tokens2idx(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            if ' ' in tokens:
                return self.tokens2idx(tokens.split())
            if tokens in self.stoi:
                return self.stoi[tokens]
            return self.stoi['<unk>']
        return [self.tokens2idx(i) for i in tokens]
vocab=Vocab(df.text,min_freq=min_freq)
print(vocab[(0,1,2,3,4,5,6,7,8,9)])
print(vocab.stoi['i'],vocab.tokens2idx(['i','feel']))
print(len(vocab))

['', '<unk>', 'i', 'feel', 'and', 'to', 'the', 'a', 'feeling', 'that']
2 [2, 3]
15334


In [208]:
df1=df[:350000]
dfsad=df1[df1.emotions=='sadness']
dfanger=df1[df1.emotions=='anger']
dffear=df1[df1.emotions=='fear']
dflove=df1[df1.emotions=='love']
dfsurprise=df1[df1.emotions=='surprise']
df2=pd.concat([dfsad[:7000],dfanger,dfanger[:16000],dffear,dffear[:27000],dflove,dflove,dflove[:10000],df1],ignore_index=True)
print(df2)

                                                     text emotions
0       i feel awful about it too because it s my job ...  sadness
1                                   im alone i feel awful  sadness
2                i was feeling a little low few days back  sadness
3       i also feel disillusioned that someone who cla...  sadness
4       i wish you knew every word i write i write for...  sadness
...                                                   ...      ...
556191  i just feel like screaming at myself for being...     love
556192  i always end up watching one of his dvd s as i...      joy
556193  i can t seem to describe how i am feeling in a...      joy
556194  i will never let you feel unloved for a second...  sadness
556195  i could sense that he was uncomfortable when h...     fear

[556196 rows x 2 columns]


In [209]:
class dataset(data.Dataset):
    def __init__(self,datasource,vocab):
        self.set=datasource.reset_index(drop=True)
        self.vocab=vocab
    def __getitem__(self, index):
        c=vocab.tokens2idx(self.set.text[index])
        if not isinstance(c,list):
            c=[c]
        if len(c)>=max_len:
            c=c[:max_len]
        else:
            c=c+[0]*(max_len-len(c))
        return (torch.tensor(c),self.set.emotions[index])
    def __len__(self):
        return len(self.set)
trainset=dataset(df2,vocab)
testset=dataset(df[350000:],vocab)
load_trainset=data.DataLoader(trainset,batch_size=batch_size,shuffle=True,drop_last=True)
load_testset=data.DataLoader(testset,batch_size=batch_size,shuffle=True,drop_last=True)
print(trainset[3])
print(vocab[list(trainset[3][0])])
print(df.text[3],df.emotions[3])

(tensor([   2,  119,    3,  852,    9,  147,   75, 5076,    5, 1504,    6,  985,
          29,    2,   39,   21,    7, 4094,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]), 'sadness')
['i', 'also', 'feel', 'disillusioned', 'that', 'someone', 'who', 'claimed', 'to', 'value', 'the', 'truth', 'as', 'i', 'do', 'was', 'a', 'fraud', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
i was feeling a little low few days back sadness


In [210]:
# 位置编码
class PositionalEncoding(nn.Module):
    # 初始化函数，接收隐藏单元数量、dropout率和最大序列长度作为输入
    def __init__(self, num_hiddens, max_len=1000):
        # 调用父类的初始化函数
        super(PositionalEncoding, self).__init__()
        # 创建一个dropout层，用于随机丢弃输入的元素
        self.dropout = nn.Dropout(dropout)
        # 创建一个形状为(1, max_len, num_hiddens)的位置编码张量P，初始化为全0
        self.P = torch.zeros((1, max_len, num_hiddens))
        # 生成位置编码矩阵X，其中每一行表示一个位置的编码，编码方式采用sin和cos函数
        # 编码公式：X[i, j] = sin(i / 10000^(2j / num_hiddens)) 或 cos(i / 10000^(2j / num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, 
                               torch.arange(0, num_hiddens, 2, dtype=torch.float32) /
                               num_hiddens)
        # 将位置编码矩阵中的偶数维度的元素替换为sin函数的结果
        self.P[:,:,0::2] = torch.sin(X)
        # 将位置编码矩阵中的奇数维度的元素替换为cos函数的结果
        self.P[:,:,1::2] = torch.cos(X)
    # 前向传播函数，接收输入张量X作为输入
    def forward(self, X):
        # 将位置编码张量P与输入张量X相加，并将结果移动到与X相同的设备上
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        # 对相加后的结果应用dropout，并返回结果
        return self.dropout(X)

In [211]:
class emotion(nn.Module):
    def __init__(self,vocab_size,embed_size,num_classes,dropout=0.5):
        super(emotion,self).__init__()
        self.embedding=nn.Embedding(vocab_size,embed_size)
        self.pos=PositionalEncoding(embed_size,max_len=max_len)
        self.multiheadattention1=nn.TransformerEncoderLayer(embed_size,5,batch_first=True,dropout=dropout)
        self.multiheadattention2=nn.TransformerEncoderLayer(embed_size,5,batch_first=True,dropout=dropout)
        self.multiheadattention3=nn.TransformerEncoderLayer(embed_size,5,batch_first=True,dropout=dropout)
        self.fc=nn.Linear(embed_size,num_classes)
        #self.softmax=nn.Softmax()
    def forward(self,x):
        mask=~(x==0)
        x=self.embedding(x)
        x=self.pos(x)
        x=self.multiheadattention1(x,src_key_padding_mask=mask)
        x=self.multiheadattention2(x,src_key_padding_mask=mask)
        x=self.multiheadattention3(x,src_key_padding_mask=mask)
        x=torch.sum(x,dim=1)/torch.sum(mask,dim=1).unsqueeze(dim=1)
        x=self.fc(x)
        #x=self.softmax(x)
        return x
model=emotion(len(vocab),embed_size,len(emotionlist),dropout=dropout)
model.to(device=device)
a=torch.ones(1,5,device=device).long()
print(model(a))

tensor([[ 0.2175,  0.1191,  0.3686, -0.4585,  0.6218,  0.1661]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


In [212]:
def predict1sentence(model,sentence,vocab):
    model.eval()
    c=vocab.tokens2idx(sentence.lower().split())
    if not isinstance(c,list):
        c=[c]
    if len(c)>=max_len:
        c=c[:max_len]
    else:
        c=c+[0]*(max_len-len(c))
    c=torch.tensor(c)
    c=c.to(device=device)
    # print(c.unsqueeze(dim=0).shape)
    prediction=model(c.unsqueeze(dim=0))
    # print(c)
    # print(prediction)
    # print(torch.argmax(prediction,dim=1))
    return idx2emotion(torch.argmax(prediction,dim=1))
print(predict1sentence(model,'For all the hours I spent in the gym working and training and Vanessa you holding down the family the way that you have I cant theres no way that I can thank you enough so from the bottom of my heart thank you And what can I say Mamba out',vocab))

fear


In [225]:
def get_acc(model,load_testset):
    model.eval()
    correct=0
    total=0
    countclass=torch.zeros(6)
    totalclass=torch.zeros(6)
    for datapair in load_testset:
        text,label=datapair
        text=text.to(device)
        label=torch.tensor(emotion2idx(label))
        label=label.to(device)
        answer=model(text)
        answer=torch.argmax(answer,dim=1)
        for i in range(6):
            countclass[i]+=torch.count_nonzero(answer==torch.tensor([i],device=device)).item()
            totalclass[i]+=torch.count_nonzero(label==torch.tensor([i],device=device)).item()
        correct+=torch.count_nonzero(answer==label)
        total+=batch_size
        
    return correct,correct/total,countclass,totalclass,countclass/totalclass
print(get_acc(model,load_testset))

(tensor(60282, device='cuda:0'), tensor(0.9024, device='cuda:0'), tensor([19096., 22501.,  6611.,  8324.,  8800.,  1468.]), tensor([19277., 22767.,  5536.,  9164.,  7701.,  2355.]), tensor([0.9906, 0.9883, 1.1942, 0.9083, 1.1427, 0.6234]))


In [224]:
#train
optimizer=torch.optim.Adam(model.parameters(),lr=learningrate)
for epoch in range(1):
    model.eval()
    with torch.no_grad():
        acc=get_acc(model,load_testset=load_testset)
        print(acc)
        acc=get_acc(model,load_trainset)
        print(acc)
        freq=acc[-1]
        weight=nn.functional.softmax(-freq)
    loss=nn.CrossEntropyLoss(weight=weight*6)
    loss.to(device=device)
    model.train()
    totalloss=0
    for number,datapair in enumerate(load_trainset):
        # print(number)
        text,label=datapair
        text=text.to(device)
        label=torch.tensor(emotion2idx(label))
        # print(label)
        label=label.to(device)
        optimizer.zero_grad()
        result=model(text)
        # print(result)
        trainloss=loss(result,label)
        trainloss.backward()
        optimizer.step()
        totalloss+=trainloss
        if number%1000==0:
            print(number)
            print(totalloss/number)
    


(tensor(60279, device='cuda:0'), tensor(0.9024, device='cuda:0'), tensor([18438., 22071.,  6945.,  9840.,  7099.,  2407.]), tensor([19277., 22765.,  5535.,  9167.,  7701.,  2355.]), tensor([0.9565, 0.9695, 1.2547, 1.0734, 0.9218, 1.0221]))
(tensor(501208, device='cuda:0'), tensor(0.9013, device='cuda:0'), tensor([108548., 121418., 100637., 115961.,  93027.,  16509.]), tensor([108886., 118279.,  97036., 112280., 107004.,  12615.]), tensor([0.9969, 1.0265, 1.0371, 1.0328, 0.8694, 1.3087]))


  weight=nn.functional.softmax(-freq)


0
tensor(inf, device='cuda:0', grad_fn=<DivBackward0>)
1000
tensor(0.2628, device='cuda:0', grad_fn=<DivBackward0>)
2000
tensor(0.2623, device='cuda:0', grad_fn=<DivBackward0>)
3000
tensor(0.2612, device='cuda:0', grad_fn=<DivBackward0>)
4000
tensor(0.2600, device='cuda:0', grad_fn=<DivBackward0>)
5000
tensor(0.2588, device='cuda:0', grad_fn=<DivBackward0>)


In [223]:
torch.save(model,'6classes,emotion,withposinfo,acc=9024')

In [None]:
model=torch.load('5classes,emotion',weights_only=False)
print(model)

In [None]:
model.eval()
import random
start=random.randint(1,50000)
for i in range(start,start+20):
    p=testset[i][0]
    p=p.to(device=device)
    predict=model(p.unsqueeze(dim=0))
    predictemotion=idx2emotion(torch.argmax(predict,dim=1))
    sentence=vocab[testset[i][0].tolist()]
    try:
        sentence.remove('')
    except:
        pass
    sentence=' '.join(sentence)
    print(f'原句：{sentence}')
    print(f'预测感情：{predictemotion},实际感情：{testset[i][1]}')
