In [0]:
import os
# Make the project folder the working directory; later cells import local
# modules and open data files through relative paths.
# NOTE(review): absolute Colab/Drive path - presumably requires Google Drive to
# be mounted first; no mount call is visible in this notebook.
path = "/content/drive/My Drive/NLP/sentiment_compete"
os.chdir(path)

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 4.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 8.9MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 21.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |█████

In [0]:
# Reload edited project modules automatically before each cell execution, so
# changes to the local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
!export CUDA_LAUNCH_BLOCKING=1 

In [0]:
from processData import DataProcessor
from transformers import BertTokenizer,BertConfig
from transformers import BertModel
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
from config import Config
import torch
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
from executor import ModelExcuter
from model.bert_model_base import BertModel_Base
from model.robert_model_base import RoBertModel_Base
from model.ernie_model_base import ErnieModel_Base
from model.ernie_model_pool_last3 import Ernie_PoolLast3_Model
from model.ernie_poollast3_multidp import Ernie_poollast3_multidp
from model.ernie_model_outpool import Ernie_outpool
from model.bert_outpool import Bert_outpool
from dataSet import BertDataSet


# One seed shared by every random number generator used in the notebook.
SEED = 6666

# Ask cuDNN for deterministic algorithm choices.
torch.backends.cudnn.deterministic = True

# Seed PyTorch, NumPy and the Python standard library with the same value.
torch.manual_seed(seed=SEED)
np.random.seed(seed=SEED)
random.seed(a=SEED)

In [0]:
def _build_fold_loader(features, labels, row_index, config, shuffle):
    """Wrap the rows selected by ``row_index`` in a ``BertDataSet`` and a ``DataLoader``."""
    fold_dataset = BertDataSet(features[0][row_index],
                               features[1][row_index],
                               features[2][row_index],
                               labels=labels[row_index],
                               device=config.device)
    return DataLoader(fold_dataset,
                      batch_size=config.batch_size,
                      shuffle=shuffle)


def kfold_train(modelExcuter, train_feature, train_labels, config, n_splits=5):
    """Train one freshly created model per stratified cross-validation fold.

    Args:
        modelExcuter: Object whose ``train(model, train_loader, dev_loader,
            fold_index)`` method is called once per fold.
        train_feature: Indexable holding three arrays (positions 0, 1, 2) that
            are passed to ``BertDataSet`` as input ids, attention mask and
            token type ids. Each must support indexing with the integer index
            arrays produced by ``StratifiedKFold.split``.
        train_labels: Labels aligned with the rows of ``train_feature``; used
            for stratification and as the dataset targets.
        config: Supplies ``device``, ``batch_size``, ``bert_model_path`` and
            ``max_seq_len``.
        n_splits: Number of folds. Defaults to 5, the value that used to be
            hard-coded.

    Note:
        The model is built from the notebook-level global ``bert_config``,
        which must be defined before this function is called.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    for fold_index, (train_index, dev_index) in enumerate(skf.split(train_feature[0], train_labels)):
        train_loader = _build_fold_loader(train_feature, train_labels, train_index, config, shuffle=True)
        # NOTE(review): the dev loader is shuffled as well, exactly as before;
        # confirm whether the executor needs that for evaluation.
        dev_loader = _build_fold_loader(train_feature, train_labels, dev_index, config, shuffle=True)

        print("fold-",fold_index)
        model = Ernie_outpool(config.bert_model_path, bert_config, config.max_seq_len).to(config.device)
        try:
            modelExcuter.train(model, train_loader, dev_loader, fold_index)
        finally:
            # Drop this fold's model even when training is interrupted, so the
            # next fold (or a re-run) does not start with its memory still held.
            del model
            torch.cuda.empty_cache()


In [0]:
def k_fold_predict(modelExcuter, test_feature, test_df, fold_num=5):
    """Run test-set prediction once per fold via ``modelExcuter.predict_k_fold``.

    Args:
        modelExcuter: Object whose ``predict_k_fold(model, test_loader, ids,
            fold_index)`` method is called for every fold.
        test_feature: Indexable holding three arrays (positions 0, 1, 2) that
            are passed to ``BertDataSet`` as input ids, attention mask and
            token type ids.
        test_df: DataFrame with a ``'微博id'`` column; that column is handed to
            the executor together with the loader.
        fold_num: Number of folds to predict with (default 5).

    Note:
        Reads the notebook-level globals ``config`` and ``bert_config``; both
        must be defined before this function is called.
    """
    # The test data is the same for every fold, so the dataset and the
    # (unshuffled) loader are built once instead of once per fold.
    test_dataSet = BertDataSet(test_feature[0],
                               test_feature[1],
                               test_feature[2],
                               labels=None,
                               device=config.device)
    test_loader = DataLoader(test_dataSet,
                             batch_size=config.batch_size,
                             shuffle=False)
    weibo_ids = test_df['微博id']

    for fold_index in range(fold_num):
        print("fold-",fold_index)
        model = Ernie_outpool(config.bert_model_path, bert_config, config.max_seq_len).to(config.device)
        try:
            modelExcuter.predict_k_fold(model, test_loader, weibo_ids, fold_index)
        finally:
            # Release the fold's model even if prediction fails part-way.
            del model
            torch.cuda.empty_cache()
        print("fold{}precit over".format(fold_index))


def combine_fold_anx(config, fold_num=5, output_path='predict_ans.csv'):
    """Sum the per-fold prediction files and write the winning label per row.

    For each fold ``i`` the file ``config.predict_save_path + "-_fold<i>.csv"``
    is read, indexed by its ``'微博id'`` column, and its remaining columns are
    added element-wise to a running total. The label written for a row is the
    position of its largest total minus 1.

    Args:
        config: Object exposing ``predict_save_path`` (path prefix of the
            per-fold CSV files).
        fold_num: Number of fold files to combine; must be at least 1.
        output_path: Destination CSV with columns ``id`` and ``y``. Defaults
            to ``'predict_ans.csv'``, the previously hard-coded name.

    Raises:
        ValueError: If ``fold_num`` is smaller than 1, or if a fold file does
            not have the same ids (in the same order) and the same number of
            score columns as the first fold file.
    """
    if fold_num < 1:
        raise ValueError("fold_num must be at least 1, got {}".format(fold_num))

    ids = None
    predicts = None
    for i in range(fold_num):
        fold_path = config.predict_save_path + "-_fold" + str(i) + ".csv"
        fold_df = pd.read_csv(fold_path).set_index('微博id')
        if ids is None:
            # The first file fixes the row order and the number of classes.
            ids = fold_df.index
            predicts = np.zeros(fold_df.shape, dtype=float)
        elif fold_df.shape != predicts.shape or not fold_df.index.equals(ids):
            # Adding misaligned rows would silently mix scores of different posts.
            raise ValueError(
                "fold file {} is not aligned with the first fold file".format(fold_path))
        predicts = predicts + fold_df.values

    # Column positions 0, 1, 2 map to the labels -1, 0, 1.
    predicts_ans = np.argmax(predicts, axis=1) - 1
    result_pd = pd.DataFrame(
            {
                'id': ids,
                'y': predicts_ans
            }
    )
    result_pd.to_csv(output_path, index=False)
    print("finish !")

In [13]:
# Create the notebook-level objects shared by the cells below: the project
# configuration, the BERT configuration, the tokenizer and the executor.
config = Config()
# `bert_config` is read as a global by kfold_train / k_fold_predict.
bert_config = BertConfig.from_pretrained(config.bert_config_path, output_hidden_states=True)
# NOTE: the recorded run printed a deprecation warning for this call because
# it is given the path to a single file.
tokenizer = BertTokenizer.from_pretrained(config.bert_vocab_path)
modelExcuter = ModelExcuter(config)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [0]:
# Feature extractor built from the tokenizer and the configured maximum sequence length.
dataProcessor = DataProcessor(tokenizer, config.max_seq_len)

In [11]:
# Build the training features and labels from the training file
# (the recorded run processed 99913 rows).
train_feature, train_labels = dataProcessor.get_train_feature(config.train_data_path)
#dev_feature, dev_labels = dataProcessor.get_dev_feature(config.dev_data_path)

100%|██████████| 99913/99913 [00:55<00:00, 1808.26it/s]


In [16]:
# Build the test features plus the test DataFrame (its '微博id' column is used
# when predicting); the recorded run processed 10000 rows.
test_dataset, test_df = dataProcessor.get_test_feature(config.test_data_path)

100%|██████████| 10000/10000 [00:05<00:00, 1774.12it/s]


In [14]:
# Cross-validated training. NOTE: the recorded output stops in epoch 2 of
# fold 0 with a KeyboardInterrupt, i.e. this run was interrupted manually.
kfold_train(modelExcuter, train_feature, train_labels, config)

fold- 0
epoch [1/3]
Epoch: 1 Iter:    500, Train Loss:  0.79, Train Acc: 68.750%, Dev Loss:   0.6, Dev Acc: 73.673%, f1_score: 0.705875, Time: 0:06:46 *
Epoch: 1 Iter:   1000, Train Loss:   0.7, Train Acc: 71.875%, Dev Loss:  0.58, Dev Acc: 74.779%, f1_score: 0.7189451, Time: 0:15:02 *
Epoch: 1 Iter:   1500, Train Loss:  0.38, Train Acc: 81.250%, Dev Loss:  0.57, Dev Acc: 75.764%, f1_score: 0.7201647, Time: 0:23:11 *
Epoch: 1 Iter:   2000, Train Loss:  0.51, Train Acc: 78.125%, Dev Loss:  0.58, Dev Acc: 74.423%, f1_score: 0.7202258, Time: 0:31:21 *
epoch [2/3]
Epoch: 2 Iter:   2500, Train Loss:   0.4, Train Acc: 87.500%, Dev Loss:  0.61, Dev Acc: 72.221%, f1_score:  0.70502, Time: 0:39:30 -
Epoch: 2 Iter:   3000, Train Loss:  0.46, Train Acc: 81.250%, Dev Loss:  0.59, Dev Acc: 74.498%, f1_score: 0.7181823, Time: 0:47:37  
Epoch: 2 Iter:   3500, Train Loss:  0.62, Train Acc: 68.750%, Dev Loss:  0.59, Dev Acc: 73.653%, f1_score: 0.7178405, Time: 0:55:45  
Epoch: 2 Iter:   4000, Train Los

KeyboardInterrupt: ignored

In [17]:
# Predict the test set once per fold (five folds by default).
k_fold_predict(modelExcuter, test_dataset, test_df)

fold- 0


100%|██████████| 313/313 [00:41<00:00,  7.59it/s]


Time usage: 0:00:41
fold0precit over
fold- 1


100%|██████████| 313/313 [00:41<00:00,  7.59it/s]


Time usage: 0:00:41
fold1precit over
fold- 2


100%|██████████| 313/313 [00:41<00:00,  7.59it/s]


Time usage: 0:00:41
fold2precit over
fold- 3


100%|██████████| 313/313 [00:41<00:00,  7.60it/s]


Time usage: 0:00:41
fold3precit over
fold- 4


100%|██████████| 313/313 [00:41<00:00,  7.62it/s]


Time usage: 0:00:41
fold4precit over


In [18]:
# Merge the per-fold prediction files into 'predict_ans.csv'.
combine_fold_anx(config)

finish !


In [0]:
# Build a single model on the configured device; the next cells inspect it.
model = Ernie_outpool(config.bert_model_path, bert_config, config.max_seq_len).to(config.device)

Linear
Linear


In [0]:
# List every parameter tensor of the model together with its size.
for name, params in model.named_parameters():
    print(name, ":", params.size())

In [0]:
# NOTE(review): `train_dataset` and `dev_dataset` are not defined anywhere in
# the visible notebook, and the executor is created as `ModelExcuter(config)`
# in the setup cell. This looks like a leftover from an earlier executor
# interface - confirm before running on a fresh kernel.
modelExcuter = ModelExcuter(train_dataset, dev_dataset, config)

In [0]:
# NOTE(review): kfold_train calls `train(model, train_loader, dev_loader,
# fold_index)`; this call passes different arguments and presumably targets an
# older executor interface - confirm against executor.py.
modelExcuter.train(model, use_weight=False)

In [0]:
# Show the current GPU status and memory usage.
!nvidia-smi

Sun Apr 19 14:45:28 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    32W / 250W |   3233MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
# Release cached GPU memory that PyTorch is holding but no longer using.
torch.cuda.empty_cache()

In [0]:
# List the processes running on the runtime.
!ps -aux

In [0]:
# Force-kill process 120.
# NOTE(review): the PID is hard-coded (presumably read off the `ps` listing of
# one particular session); check the current listing before running this.
!kill -9 120

In [0]:
// Colab keep-alive helpers.
// This is browser-side JavaScript, not Python: it touches `document`, so it is
// meant to be pasted into the DevTools console of the Colab tab rather than
// executed by the notebook kernel.

// Click the element with id "connect" once a minute (if it exists).
function ConnectButton(){
    console.log("Connect pushed");
    const button = document.querySelector("#connect");
    if (button) {
        button.click();
    }
}
setInterval(ConnectButton,60000);

// Every 20 s: when the label inside #connect reads '重新连接' at characters 3-6,
// click it. Each lookup is guarded so a changed page layout cannot throw.
setInterval(()=>{
    const connect = document.getElementById("connect");
    const label = connect && connect.children[0] && connect.children[0].children[2];
    if (label && Array.from(label.innerHTML).splice(3,4).toString() === '重,新,连,接'){
        label.click()
    }
},20000)

// Every 10 s: click the <colab-connect-button> element, logging any failure.
// The script used to declare ClickConnect twice; function declarations are
// hoisted, so only the later (try/catch) version ever ran, on two identical
// timers. One definition and one timer are kept.
function ClickConnect(){
    try{
        document.querySelector("colab-connect-button").click()
        console.log("working..........");
    }
    catch(err) {
        console.log(err)
        console.log("Error.......")
    }
}
setInterval(ClickConnect,10000)


In [0]:
import pandas as pd
# Load the model predictions written by combine_fold_anx and the separately
# stored labelled test rows.
ans_predict = pd.read_csv('predict_ans.csv')
ans_labled = pd.read_csv('data/test_labled.csv')

In [0]:
# Show the labelled 'y' column as integers.
# NOTE(review): in file order `ans_labled` only gets a 'y' column in the later
# rename cell, so this cell appears to depend on out-of-order execution.
ans_labled['y'].astype(int)

0       0
1       0
2      -1
3       1
4       0
       ..
1739    0
1740    1
1741    1
1742    0
1743    0
Name: y, Length: 1744, dtype: int64

In [0]:
# Show the predicted 'y' column as integers.
ans_predict['y'].astype(int)

0       0
1      -1
2       0
3      -1
4      -1
       ..
8251    0
8252    0
8253    0
8254    0
8255    1
Name: y, Length: 8256, dtype: int64

In [0]:
# Keep only the post id and the sentiment label. This rebinds `ans_labled`, so
# the cell is not idempotent: re-running it after the rename below fails.
ans_labled = ans_labled[['微博id','情感倾向']]

In [0]:
# Rename the two remaining columns to match the prediction file ('id', 'y').
ans_labled.columns = ['id','y']

In [0]:
# Class distribution of the labelled rows.
ans_labled['y'].value_counts()

 0    1111
 1     463
-1     170
Name: y, dtype: int64

In [0]:
# Show labelled rows whose 'y' is missing (the recorded output lists one such row).
ans_labled[ans_labled['y'].isna()]

Unnamed: 0,id,y
480,4460767862121080,


In [0]:
# Stack the predictions and the labelled rows into one frame (original row
# indices are kept, so index values repeat).
# NOTE(review): the isna() cell's recorded output shows a labelled row without
# 'y'; unless it is removed elsewhere it is carried into `ans` - confirm.
ans = pd.concat([ans_predict,ans_labled], axis=0)

In [0]:
# Display the combined frame.
ans

Unnamed: 0,id,y
0,4456068992182160,0
1,4456424178427250,-1
2,4456797466940200,0
3,4456791021108920,-1
4,4457086404997440,-1
...,...,...
1739,4467659929973460,0
1740,4464766938724350,1
1741,4465008438487000,1
1742,4464765668060630,0


In [0]:
# Write the combined answers to 'ans.csv' without the index column.
ans.to_csv('ans.csv',index=False)

In [0]:
# Show up to 100 labelled posts whose sentiment is -1.
# NOTE(review): this needs the original column names ('微博中文内容', '情感倾向'),
# which the select/rename cells above remove - it only works on a freshly
# loaded `ans_labled`.
ans_labled[ans_labled['情感倾向'] == -1][['微博中文内容','情感倾向']].head(100)

Unnamed: 0,微博中文内容,情感倾向
2,微博武汉肺炎看多了本来经常喉咙发炎引起发烧的我已经开始心慌慌2020愿全世界的人都健健康康2...,-1
33,#海南防控新型冠状病毒#【东方市人民医院急诊科两名医护人员被感染】东方市第2例新型冠状病毒感...,-1
41,【武汉卫健委：#武汉发现不明原因病毒性肺炎重症11例#】3日，#武汉卫健委通报不明原因肺炎#...,-1
59,#重症隔离病房中的除夕#md这春晚看不下去了，zf能不能干点事，不是说物资充足吗，这就是所谓...,-1
64,国家卫健委：全国新增确诊病例3887例累计确诊24324例185555人正在医学观察.2月4...,-1
...,...,...
1083,//@知识分子:强烈建议问责与“双黄连事件”相关的科研机构以及媒体，该事件对于防疫工作造成了...,-1
1084,//@有个梨UGlee:有意思，但房间保持正压可能是更一劳永逸的办法。//@Libre盖子:...,-1
1085,几大指数仍在涨，创业板指甚至突破了反弹新高，但两市总成交金额，却缩量了，今天勉强凑够了700...,-1
1106,//@酷盖太上头:求官方辟谣口罩不可以剪碎??剪碎会让过滤层里的病菌跑出来??更容易感染??...,-1


In [0]:
# Load 'data/train_labled_yuanshi.csv' into `df` (the name `df` is rebound
# several times further down).
df = pd.read_csv('data/train_labled_yuanshi.csv')

In [0]:
# Keep only the post text and the sentiment label.
df = df[['微博中文内容', '情感倾向']]

In [0]:
# Export the two columns to 'view.csv' without the index column.
df.to_csv('view.csv',index=False)

In [0]:
# Sample post with a chain of '//@user:' repost prefixes, used to try out the
# cleaning regex below. The doubled quotes ("") end one string literal and
# start the next; Python joins adjacent literals, so no quote characters end
# up in `text`.
text = "//@杨长雍://@但斌://@晴天蓝天空气好://@吴春芳--中国定位研究:高福领衔团队在《柳叶刀》杂志发表的9篇臭文，已被确凿的证据证实：""臭不可闻""！牢牢的钉在历史的耻辱柱上，永远翻不了案！科技部的发文，为何强调“把论文写在祖国大地上”？"

In [0]:
import re

In [0]:
# Remove every '//@...:' prefix; the non-greedy '.*?' stops at the first colon
# after each '//@', so the prefixes are removed one by one.
re.sub(r'//@.*?:',"",text)

'高福领衔团队在《柳叶刀》杂志发表的9篇臭文，已被确凿的证据证实：臭不可闻！牢牢的钉在历史的耻辱柱上，永远翻不了案！科技部的发文，为何强调“把论文写在祖国大地上”？'

In [19]:
# Show which training file the configuration points to.
config.train_data_path

'data/train_clean.csv'

In [0]:
import pandas as pd
# Load the configured training file for inspection (rebinds `df`).
df = pd.read_csv(config.train_data_path)

In [21]:
# Number of rows and columns in the training file.
df.shape

(99913, 7)

In [22]:
# Class distribution of the sentiment label in the training file.
df['情感倾向'].value_counts()

 0    57619
 1    25392
-1    16902
Name: 情感倾向, dtype: int64

In [31]:
# Display the rows from position 100 onwards.
df.iloc[100:]

Unnamed: 0,微博id,微博发布时间,发布人账号,微博中文内容,微博图片,微博视频,情感倾向
100,4457139508328450,01月04日 22:32,跳蛙女孩,考试周+发烧=瘦了四斤?,[],[],-1
101,4457149385761270,01月04日 23:11,傲娇水是会夏眠的,揽镜自照，觉得自己怎生的这样好大概是昨晚发烧烧坏了脑子?,[],[],1
102,4457061863582050,01月04日 17:23,马铃薯研磨液,吗的昨天嗓子痒我就有不好感觉今天果然发烧了吗的2020能不能对我好点我就想身体健康每天都能笑...,[],[],-1
103,4457047124498050,01月04日 16:25,小Ni店長,旦增尼玛#向世界安利旦增尼玛#原本想连夜剪辑一版彩排+现场合并版，但大概?展开全文c,['https://wx2.sinaimg.cn/orj480/b640b99dly1gak...,['https://f.video.weibocdn.com/004oCFjolx07zTq...,0
104,4457028250571710,01月04日 15:10,人萌心善人間天使矢吹奈子,我看硬了 这个我懂，ta发烧了给你看在吃药。,['https://ww1.sinaimg.cn/orj360/007cdrMjgy1gaj...,[],0
...,...,...,...,...,...,...,...
99908,4473033438259880,02月17日 19:08,中国教育新闻网,#抗击新型肺炎第一线#【,['https://ww1.sinaimg.cn/orj360/682cebefly1gbz...,[],0
99909,4472969222714290,02月17日 14:53,fuzhuoting,1、类RaTG13病毒（一种从云南蝙蝠身上分离出来的冠状病毒）可能是2019-nCoV的源头...,[],[],0
99910,4473035904435920,02月17日 19:18,蝌蚪五线谱,#微博辟谣#没有证据表明，吃大蒜、漱口水、涂抹芝麻油、生理盐水洗鼻子等手段可以防止感染新型冠...,['https://ww4.sinaimg.cn/orj360/6d2cc4e6ly1gbz...,[],0
99911,4472950743017610,02月17日 13:40,医库,【新冠疫情最受关注的十一篇英文核心期刊论文全解析】本文整理了关于新型冠状病毒最受关注的十一篇...,[],[],1


In [0]:
# Load the configured test file; `df` no longer refers to the training data after this.
df = pd.read_csv(config.test_data_path)

In [35]:
# Display the post text column of the test file.
df['微博中文内容']

0       #你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧（大概...
1       大宝又感冒鼻塞咳嗽了，还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情...
2                           还要去输两天液，这天也太容易感冒发烧了，一定要多喝热水啊?
3                                 我太难了别人怎么发烧都没事就我一检查甲型流感?
4       果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设...
                              ...                        
9995                              「2020的黑天鹅事件」>2019-nCov?
9996    心灵鸡汤#武汉加油#我们所有人，和我们这个国家一起，正在经历着一场这个星球上史无前例的考验...
9997                          武大人民医院：发热咳嗽并非新冠肺炎的唯一首发症状(来自
9998                                闭关第二天发现一根白发2019-nCoV?
9999                    昨天还在想如果有动画短片就好了，今天就有了，视频果然更直接有效吧。
Name: 微博中文内容, Length: 10000, dtype: object