In [None]:
from transformers import BertModel, BertTokenizer
# Sanity check: encode a short sentence with the Chinese BERT tokenizer and decode it back
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
example_text = '我喜欢自然语言处理'
bert_input = tokenizer(example_text, return_tensors='pt', truncation=True)

# Show the token ids, then the text recovered from the encoded tensor
token_ids = tokenizer.encode(example_text)
print(token_ids)
decoded_text = tokenizer.decode(bert_input['input_ids'][0])
print(decoded_text)


In [None]:
import pandas as pd
# Load the ChnSentiCorp CSV and discard every row that has a missing value
pd_all = pd.read_csv('ChnSentiCorp_htl_all.csv').dropna()
len(pd_all)

In [None]:
# Build a balanced corpus: first separate the rows by label (1 vs. 0)
is_positive = pd_all['label'] == 1
is_negative = pd_all['label'] == 0
pd_positive = pd_all[is_positive]
pd_negative = pd_all[is_negative]

def get_balance_corpus(corpus_size, corpus_pos, corpus_neg, random_state=None):
    """Draw a class-balanced corpus from a positive and a negative frame.

    ``corpus_size // 2`` rows are sampled from each frame and concatenated
    (positive rows first), so an odd ``corpus_size`` yields one row fewer
    than requested. A frame with fewer rows than needed is sampled with
    replacement; otherwise sampling is without replacement.

    Args:
        corpus_size: Target number of rows in the balanced corpus.
        corpus_pos: DataFrame of positive examples (``label == 1``).
        corpus_neg: DataFrame of negative examples (``label == 0``).
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` so the draw is reproducible. ``None``
            (the default) keeps the original unseeded behaviour.

    Returns:
        The concatenated DataFrame; the original row indices are kept.

    Raises:
        ValueError: If rows are requested from an empty frame.
    """
    sample_size = corpus_size // 2

    def _draw(corpus):
        # Fail early with a readable message instead of an opaque sampling error.
        if sample_size > 0 and corpus.shape[0] == 0:
            raise ValueError('cannot sample %d rows from an empty corpus' % sample_size)
        # Only oversample (replace=True) when the class is too small.
        return corpus.sample(
            sample_size,
            replace=corpus.shape[0] < sample_size,
            random_state=random_state,
        )

    pd_corpus_balance = pd.concat([_draw(corpus_pos), _draw(corpus_neg)])

    # Report the total size and the per-class counts.
    print('评论数目（总体）：%d' % pd_corpus_balance.shape[0])
    print('评论数目（正向）：%d' % pd_corpus_balance[pd_corpus_balance.label==1].shape[0])
    print('评论数目（负向）：%d' % pd_corpus_balance[pd_corpus_balance.label==0].shape[0])

    return pd_corpus_balance

In [None]:
# Draw a balanced corpus with a target size of 4800, then preview ten random rows
ChnSentiCorp_htl_ba_4800 = get_balance_corpus(
    corpus_size=4800,
    corpus_pos=pd_positive,
    corpus_neg=pd_negative,
)

ChnSentiCorp_htl_ba_4800.sample(n=10)

In [None]:
import numpy as np
# Shuffle once with a fixed seed, then cut 80% / 20% into train / validation.
# Positional slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
_shuffled_corpus = ChnSentiCorp_htl_ba_4800.sample(frac=1, random_state=42)
_split_at = int(0.8 * len(ChnSentiCorp_htl_ba_4800))
data_train, data_valid = _shuffled_corpus.iloc[:_split_at], _shuffled_corpus.iloc[_split_at:]

In [None]:
import torch
import numpy as np
from torch import nn
from transformers import BertTokenizer, BertModel
# Module-level tokenizer loaded from the 'hfl/chinese-bert-wwm-ext' checkpoint;
# the Dataset class reads this global when it tokenizes reviews.
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each pre-tokenized review with its label.

    All reviews are tokenized eagerly in the constructor using the
    module-level ``tokenizer``.
    """

    def __init__(self, df):
        """Store the labels and tokenize every entry of ``df['review']``."""
        self.labels = df['label'].values

        # Every review is padded / truncated to exactly 512 tokens and
        # returned as PyTorch tensors.
        encode_options = dict(
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        encoded_reviews = []
        for text in df['review']:
            encoded_reviews.append(tokenizer(text, **encode_options))
        self.reviews = encoded_reviews

    def classes(self):
        """Return the stored label array."""
        return self.labels

    def __len__(self):
        """Number of examples, taken from the label array."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_reviews(self, idx):
        """Return the tokenized review(s) at ``idx``."""
        return self.reviews[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_review, label)`` pair for ``idx``."""
        return self.get_batch_reviews(idx), self.get_batch_labels(idx)


class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, dropout=0.4, pretrained_name='hfl/chinese-bert-wwm-ext', num_classes=2):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        # Kept only so code that references `model.relu` keeps working; it is
        # intentionally no longer applied to the logits (see forward).
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return raw class logits for a batch.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.

        Returns:
            The output of the linear head, one score per class. The scores are
            left unnormalized because ``nn.CrossEntropyLoss`` expects raw
            logits; clamping them with ReLU removes negative scores and can
            zero the gradient when every logit is negative.
        """
        # With return_dict=False BERT returns a tuple; the second element is
        # the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

In [None]:
from torch.optim import Adam,AdamW
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print per-epoch train/validation metrics.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one
            score per class for each example.
        train_data: Training data, wrapped with ``Dataset(train_data)``.
        val_data: Validation data, wrapped with ``Dataset(val_data)``.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for both loaders (default 2, the value
            that used to be hard-coded).

    Returns:
        None. Metrics are printed once per epoch.
    """
    # Wrap both splits with the Dataset class.
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    # Shuffle only the training loader.
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    # Use the GPU when one is available.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Loss function and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        # A single call is enough; repeating it has no additional effect.
        torch.cuda.empty_cache()
    model = model.to(device)
    criterion = criterion.to(device)

    # Guard the averages against empty inputs.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    for epoch_num in range(epochs):
        # Re-enable dropout for training (validation below switches it off).
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.long().to(device)
            # Each item is tokenized with return_tensors="pt", so the batched
            # tensors carry an extra dimension at position 1; drop it from
            # both the ids and the mask.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # criterion returns the batch mean; weight it by the batch size so
            # dividing by the sample count below yields a per-sample average.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            batch_loss.backward()
            optimizer.step()

        # ------ validation -----------
        total_acc_val = 0
        total_loss_val = 0
        # Disable dropout so validation metrics are deterministic.
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.long().to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        print(
            f'''Epochs: {epoch_num + 1} 
              | Train Loss: {total_loss_train / n_train: .3f} 
              | Train Accuracy: {total_acc_train / n_train: .3f} 
              | Val Loss: {total_loss_val / n_val: .3f} 
              | Val Accuracy: {total_acc_val / n_val: .3f}''')

In [None]:
# Hyper-parameters for the fine-tuning run
EPOCHS = 5
LR = 2e-6

# Build a fresh classifier and train it on the train/validation split
model = BertClassifier()
train(model=model, train_data=data_train, val_data=data_valid, learning_rate=LR, epochs=EPOCHS)

In [2]:
import pandas as pd

# Load the training CSV
df = pd.read_csv('train.csv')

# Strip every space character from the review text
reviews_without_spaces = df['review'].str.replace(' ', '')
df['review'] = reviews_without_spaces

# Write the cleaned frame to a new CSV, omitting the index column
df.to_csv('new_train.csv', index=False)


In [6]:
import json
import pandas as pd

# Load the JSON payload
with open('train.json', mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Build a DataFrame with the review / label columns
df = pd.DataFrame(data, columns=['review', 'label'])

# Export as CSV without the index column
df.to_csv('train_1.csv', index=False)


In [24]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file and take its root element
tree = ET.parse('dataset/Sina/Training data for Emotion Classification.xml')
root = tree.getroot()

# Collect (text, emotion) pairs; sentences without an 'emotion-1-type'
# attribute are skipped
data = []
for sentence in root.iter('sentence'):
    label = sentence.get('emotion-1-type')
    if label is None:
        continue
    data.append((sentence.text, label))

# Assemble the pairs into a DataFrame and save it without the index column
df = pd.DataFrame(data, columns=['review', 'label'])
df.to_csv('Sina1.csv', index=False)

In [22]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file and take its root element
tree = ET.parse('dataset/Sina/EmotionClassficationTest.xml')
root = tree.getroot()

# Collect (text, emotion) pairs; sentences without an 'emotion-1-type'
# attribute are skipped
data = []
for sentence in root.iter('sentence'):
    label = sentence.get('emotion-1-type')
    if label is None:
        continue
    data.append((sentence.text, label))

# Assemble the pairs into a DataFrame and save it without the index column
df = pd.DataFrame(data, columns=['review', 'label'])
df.to_csv('Sina2.csv', index=False)

In [25]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file and take its root element
tree = ET.parse('dataset/Sina/ExpressionTest.xml')
root = tree.getroot()

# Collect (text, emotion) pairs; sentences without an 'emotion-1-type'
# attribute are skipped
data = []
for sentence in root.iter('sentence'):
    label = sentence.get('emotion-1-type')
    if label is None:
        continue
    data.append((sentence.text, label))

# Assemble the pairs into a DataFrame and save it without the index column
df = pd.DataFrame(data, columns=['review', 'label'])
df.to_csv('Sina3.csv', index=False)

In [26]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file and take its root element
tree = ET.parse('dataset/Sina/NLPCC2014微博情绪分析样例数据.xml')
root = tree.getroot()

# Collect (text, emotion) pairs; sentences without an 'emotion-1-type'
# attribute are skipped
data = []
for sentence in root.iter('sentence'):
    label = sentence.get('emotion-1-type')
    if label is None:
        continue
    data.append((sentence.text, label))

# Assemble the pairs into a DataFrame and save it without the index column
df = pd.DataFrame(data, columns=['review', 'label'])
df.to_csv('Sina4.csv', index=False)

In [13]:
# Load train_cat6.csv and show how many rows carry each label
df = pd.read_csv('train_cat6.csv')
label_counts = df['label'].value_counts()
label_counts

label
0    13993
1     6697
3     5978
2     5348
5     4950
4     3167
Name: count, dtype: int64

In [4]:
import pandas as pd

# Read the four CSV files produced from the Sina XML sources
df1 = pd.read_csv('Sina1.csv')
df2 = pd.read_csv('Sina2.csv')
df3 = pd.read_csv('Sina3.csv')
df4 = pd.read_csv('Sina4.csv')

# Stack them into a single frame with a fresh 0..n-1 index
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Keep only the five emotions that get a numeric code. The explicit .copy()
# makes the filtered result an independent frame, so the label assignment
# below does not trigger pandas' SettingWithCopyWarning.
df = df[df['label'].isin(['like', 'sadness', 'happiness', 'disgust','anger'])].copy()
df['label'] = df['label'].map({'like':'1','sadness':'2','disgust':'3','anger':'4','happiness':'5'})

# Save as CSV without the index column
df.to_csv('Sina.csv', index=False)


In [50]:
# Count how many rows of the current `df` carry each label
label_counts = df['label'].value_counts()
label_counts

label
1    7781
3    4761
5    4720
2    3776
4    2895
Name: count, dtype: int64

In [7]:
# Stack Sina.csv on top of train_cat6.csv and save the result under dataset/
df1 = pd.read_csv('Sina.csv')
df2 = pd.read_csv('train_cat6.csv')
frames_to_merge = [df1, df2]
final_df = pd.concat(frames_to_merge, ignore_index=True)
final_df.to_csv('dataset/Sina.csv', index=False)

In [11]:
# Count how many rows of `final_df` carry each label
final_label_counts = final_df['label'].value_counts()
final_label_counts

label
1    14478
0    13993
3    10739
5     9670
2     9124
4     6062
Name: count, dtype: int64

In [22]:
import pandas as pd

def _collapse_sina_label(value):
    """Map label 5 to 1 and labels 2/3/4 to -1; leave any other value untouched."""
    if value == 5:
        return 1
    if value in [2, 3, 4]:
        return -1
    return value


# Load ChnSentiCorp_htl_all.csv and recode label 0 as -1 (other labels unchanged)
df1 = pd.read_csv('dataset/ChnSentiCorp_htl_all.csv')
df1['label'] = df1['label'].map(lambda value: value if value != 0 else -1)

# Load Sina.csv and collapse its labels onto the -1 / 0 / 1 scale
df2 = pd.read_csv('dataset/Sina.csv')
df2['label'] = df2['label'].map(_collapse_sina_label)

# Stack the two frames with a fresh index
df = pd.concat([df1, df2], ignore_index=True)

# Shift every label up by one (-1 -> 0, 0 -> 1, 1 -> 2)
df['label'] += 1

# Save as CSV without the index column
df.to_csv('dataset/Positive_null_negative.csv', index=False)

In [24]:
# Reload the saved three-label dataset and show how many rows carry each label
df = pd.read_csv('dataset/Positive_null_negative.csv')

label_counts = df['label'].value_counts()
label_counts

label
2    29470
0    28369
1    13993
Name: count, dtype: int64

In [26]:
# Load Weibo.csv and report its number of rows
df = pd.read_csv('dataset/Weibo.csv')
row_count = len(df)
row_count

40133