In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [2]:
model_name = "liam168/c2-roberta-base-finetuned-dianping-chinese"
class_num = 2 # positive or negative
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=class_num)
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
!nvidia-smi

Fri Dec 10 16:54:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN RTX           Off  | 00000000:3B:00.0 Off |                  N/A |
| 41%   29C    P8     7W / 280W |    195MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  TITAN RTX           Off  | 00000000:86:00.0 Off |                  N/A |
| 41%   29C    P8     6W / 280W |      4MiB / 24220MiB |      0%      Default |
|       

In [4]:
device

device(type='cuda')

In [5]:
df = pd.read_pickle("danmukus_all_min.pkl")

In [6]:
filenames = df.filename.unique().astype(int)

In [7]:
for _i, filename in enumerate(filenames):
  if _i > 0:
    break
  print(_i, filename)
  curr_video = df[df.filename == filename]
  

0 10582565


In [8]:
curr_video.head()

Unnamed: 0,time,text,filename
0,0,这是送给中国粉丝的歌哦！！！！！！！！,10582565
1,0,傻孩子们，快跑啊！,10582565
2,0,最后亿遍,10582565
3,0,咳咳这里日斜帮你学唱(并不太赞同看个人),10582565
4,0,戒毒所见,10582565


In [9]:
by_seconds = curr_video.groupby(['time'], as_index = False).agg({'text': ' '.join})

In [10]:
by_seconds

Unnamed: 0,time,text
0,0,这是送给中国粉丝的歌哦！！！！！！！！ 傻孩子们，快跑啊！ 最后亿遍 咳咳这里日斜帮你学唱(...
1,1,没事回来中中毒 回来中毒 大家好我叫大力！ 500万助攻 500万助攻！！！ 每天一遍
2,10,， 我回来了 是你來了 各位又见面了 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊...
3,100,❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤...
4,101,欢迎收看随时有人系列视频 西内 颜表立
...,...,...
219,95,217好好看 就两年过去了呢 模摸no西子go摸你好谁吧 桃の雫飲み干せば 共同饮尽桃之甘露...
220,96,恐龙妈妈 小姨妈 …… 217中毒了？ awsl
221,97,这毒我不戒了。 哥三不火 天理难容 银发娘我抱走了！！！
222,98,恐龙妈妈


In [11]:
our_text = by_seconds.text.values

In [13]:
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [16]:
model_output = []
for one_text in our_text:
  one_output = classifier(one_text)[0]
  one_output = 1 if one_output['label'] == 'positive' else 0
  model_output.append(one_output)
model_output = np.array(model_output)

In [14]:
model_output

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0])

In [17]:
by_seconds["label"] = model_output

In [19]:
by_seconds["filename"] = filename

In [20]:
by_seconds

Unnamed: 0,time,text,label,filename
0,0,这是送给中国粉丝的歌哦！！！！！！！！ 傻孩子们，快跑啊！ 最后亿遍 咳咳这里日斜帮你学唱(...,0,49432951
1,1,没事回来中中毒 回来中毒 大家好我叫大力！ 500万助攻 500万助攻！！！ 每天一遍,0,49432951
2,10,， 我回来了 是你來了 各位又见面了 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊...,0,49432951
3,100,❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤...,1,49432951
4,101,欢迎收看随时有人系列视频 西内 颜表立,1,49432951
...,...,...,...,...
219,95,217好好看 就两年过去了呢 模摸no西子go摸你好谁吧 桃の雫飲み干せば 共同饮尽桃之甘露...,1,49432951
220,96,恐龙妈妈 小姨妈 …… 217中毒了？ awsl,0,49432951
221,97,这毒我不戒了。 哥三不火 天理难容 银发娘我抱走了！！！,0,49432951
222,98,恐龙妈妈,0,49432951


## pipeline

In [22]:
from os import error
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_name = "liam168/c2-roberta-base-finetuned-dianping-chinese"
class_num = 2 # positive or negative
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=class_num)
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

df = pd.read_pickle("danmukus_all_min.pkl")
filenames = df.filename.unique().astype(int)

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
max_text_len = 512 - 2
for _i, filename in enumerate(filenames):
  # if _i > 2:
  #   break
  print(_i, filename)
  try:
    curr_video = df[df.filename == filename]
    by_seconds = curr_video.groupby(['time'], as_index = False).agg({'text': ' '.join})
    our_text = by_seconds.text.values
    by_seconds["filename"] = filename

    model_output = []
    for one_text in our_text:
      if len(one_text) > max_text_len:
        one_text = one_text[:max_text_len]
      one_output = classifier(one_text)[0]
      one_output = 1 if one_output['label'] == 'positive' else 0
      model_output.append(one_output)
    model_output = np.array(model_output)
    by_seconds["label"] = model_output

    by_seconds.to_pickle('output/'+str(filename)+'.pkl')
  except Exception as e:
    print(e)

  # print(by_seconds.head())


0 10582565
  time                                               text  filename  label
0    0  这是送给中国粉丝的歌哦！！！！！！！！ 傻孩子们，快跑啊！ 最后亿遍 咳咳这里日斜帮你学唱(...  10582565      0
1    1        没事回来中中毒 回来中毒 大家好我叫大力！ 500万助攻 500万助攻！！！ 每天一遍  10582565      0
2   10  ， 我回来了 是你來了 各位又见面了 啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊啊...  10582565      0
3  100  ❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤...  10582565      1
4  101                                欢迎收看随时有人系列视频 西内 颜表立  10582565      1
1 49432951
  time                  text  filename  label
0  101     饭圈女孩怎么了，前面自己要走bb啥  49432951      0
1  103                这就是双标吧  49432951      1
2  105  说实话我不是粉也觉得b站很多up主做过了  49432951      0
3  106    欺凌者都是抱着开玩笑的心态欺负别人啊  49432951      0
4  109                    嗯！  49432951      1
2 58999505
  time                                               text  filename  label
0    0                                               jiji  58999505      1
1   10                                            哈哈哈哈哈挠腿  58999505      1
2  100  哈哈哈哈哈哈哈哈