### Setup

In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): versions are unpinned, so a fresh runtime may pull different releases — consider pinning.
!pip install transformers
!pip install sentencepiece
!pip install emoji
!pip install vncorenlp



In [None]:
import torch
import emoji
import re
import gc
import os
import json
import csv
import logging as lg
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from torch import nn
from torch.nn import LSTM
from transformers import *
from vncorenlp import VnCoreNLP
from nltk.tokenize import TweetTokenizer
from pandas import DataFrame

In [None]:
from torch import cuda
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# Print the GPU status table for this runtime.
!nvidia-smi

Tue Oct  5 10:33:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    26W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive (Colab only); the data, config and checkpoint paths used in this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Utils.py

In [None]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The naive formula overflows in ``exp(-x)`` for large negative inputs
    (emitting a RuntimeWarning). This version only ever exponentiates a
    non-positive number, so it is stable for logits of any magnitude.

    Args:
        x: a scalar or array-like of logits.

    Returns:
        A value in [0, 1] with the same shape as ``x``; a NumPy scalar for
        scalar input, an array otherwise. Floating dtypes are preserved.
    """
    z = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — algebraically identical.
    stable = np.where(np.asarray(x) >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves real arrays untouched.
    return stable[()]

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may select different kernels between runs.
    torch.backends.cudnn.benchmark = False

In [None]:
def save_checkpoint(model, tokenizer, checkpoint_path, epoch='best'):
    """Write model weights, model config and tokenizer vocabulary to a directory.

    Args:
        model: object exposing ``state_dict()`` and a ``config`` with
            ``to_json_file(path)``.
        tokenizer: object exposing ``save_vocabulary(directory)``.
        checkpoint_path: target directory; created if it does not exist.
        epoch: tag embedded in the weight file name (``model_<epoch>.bin``).
    """
    # torch.save does not create missing parent directories.
    os.makedirs(checkpoint_path, exist_ok=True)
    weights_file = os.path.join(checkpoint_path, f'model_{epoch}.bin')
    torch.save(model.state_dict(), weights_file)
    model.config.to_json_file(os.path.join(checkpoint_path, 'config.json'))
    tokenizer.save_vocabulary(checkpoint_path)

### Models

In [None]:
class RobertaReINTELClassification(BertPreTrainedModel):
    """RoBERTa encoder followed by a single linear layer.

    The linear layer receives the first-token vectors of the last four entries
    of the encoder's third output, concatenated along the feature axis.
    Presumably that third output is the per-layer hidden states, which needs
    ``output_hidden_states=True`` in the config — TODO confirm.
    """

    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.num_labels = config.num_labels
        # Four hidden-size vectors are concatenated in forward().
        self.outputs = torch.nn.Linear(4 * config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):
        """Return raw logits; only ``input_ids`` and ``attention_mask`` are used."""
        encoder_out = self.roberta(input_ids, attention_mask=attention_mask)
        layer_states = encoder_out[2]
        # First-token slice of the last, 2nd-, 3rd- and 4th-to-last entries, in that order.
        first_token_states = [layer_states[-depth][:, 0, ...] for depth in (1, 2, 3, 4)]
        return self.outputs(torch.cat(first_token_states, -1))

In [None]:
# class RobertaLargeReINTELClassification(BertPreTrainedModel):
#     def __init__(self, config):
#         super(RobertaLargeReINTELClassification, self).__init__(config)
#         self.roberta = RobertaModel(config)
#         self.num_labels = config.num_labels
#         self.outputs = torch.nn.Linear(config.hidden_size, self.num_labels)

#     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
#                 start_positions=None, end_positions=None):
#         outputs = self.roberta(input_ids, attention_mask=attention_mask)[1]
#         logits = self.outputs(outputs)
#         return logits

In [None]:
class ElectraClassificationHead(nn.Module):
    """Sentence-level classification head: dropout -> dense -> dropout -> projection."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Prefer the dedicated classifier dropout; fall back to the hidden dropout.
        drop_prob = config.classifier_dropout
        if drop_prob is None:
            drop_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(drop_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Project the first-position vector of ``features`` to ``num_labels`` logits."""
        first_token = features[:, 0, :]  # <s> token (equiv. to [CLS])
        hidden = self.dense(self.dropout(first_token))
        # No activation is applied between the two linear layers.
        return self.out_proj(self.dropout(hidden))

In [None]:
class ElectraReINTELClassification(ElectraPreTrainedModel):
    """ELECTRA encoder followed by a single linear layer.

    The linear layer receives the first-token vectors of the last four entries
    of the encoder's second output, concatenated along the feature axis.
    Presumably that second output is the per-layer hidden states, which needs
    ``output_hidden_states=True`` in the config — TODO confirm.
    """

    def __init__(self, config):
        super(ElectraReINTELClassification, self).__init__(config=config)
        self.electra = ElectraModel(config)
        self.num_labels = config.num_labels
        # NOTE(review): init_weights() runs before `ln` is created — confirm
        # whether the head is meant to be covered by it.
        self.init_weights()
        # forward() concatenates four hidden-size vectors, so the head must
        # accept 4 * hidden_size features (same as the BERT/RoBERTa variants).
        self.ln = torch.nn.Linear(config.hidden_size * 4, self.num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):
        """Return raw logits; ``token_type_ids`` and the span arguments are ignored."""
        outputs = self.electra(input_ids, attention_mask=attention_mask, position_ids=position_ids, head_mask=head_mask)[1]
        # First-token slice of the last four entries -> (batch, 4 * hidden_size).
        cls_output = torch.cat((outputs[-1][:, 0, ...], outputs[-2][:, 0, ...], outputs[-3][:, 0, ...], outputs[-4][:, 0, ...]), -1)
        logits = self.ln(cls_output)
        return logits

In [None]:
class BertReINTELClassification(BertPreTrainedModel):
    """BERT encoder followed by a single linear layer.

    The linear layer receives the first-token vectors of the last four entries
    of the encoder's third output, concatenated along the feature axis.
    Presumably that third output is the per-layer hidden states, which needs
    ``output_hidden_states=True`` in the config — TODO confirm.
    """

    def __init__(self, config):
        super().__init__(config=config)
        self.bert = BertModel(config)
        self.num_labels = config.num_labels
        # Four hidden-size vectors are concatenated in forward().
        self.ln = torch.nn.Linear(4 * config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):
        """Return raw logits; ``token_type_ids`` and the span arguments are ignored."""
        layer_states = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
        )[2]
        # First-token slice of the last, 2nd-, 3rd- and 4th-to-last entries, in that order.
        first_token_states = [layer_states[-depth][:, 0, ...] for depth in (1, 2, 3, 4)]
        return self.ln(torch.cat(first_token_states, -1))

### Preprocess

In [None]:
def normalizeToken(token):
    """Normalise a single token.

    * the typographic apostrophe "’" becomes "'"
    * the ellipsis character "…" becomes "..."
    * any other single-character token is passed through ``emoji.demojize``
    * everything else is returned unchanged

    Args:
        token: the token string.

    Returns:
        The normalised token string.
    """
    # Both punctuation marks are one character long, so they must be tested
    # before the length-1 branch; otherwise these replacements never run.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return emoji.demojize(token)
    return token

In [None]:
# Load the Vietnamese stop-word list: one entry per line, newline removed.
vnmese_stopwords = []
# `with` closes the handle; the explicit encoding avoids depending on the
# platform default when reading Vietnamese text.
with open("/content/drive/MyDrive/VLSP-Fake-News-Detection/vietnamese-stopwords.txt", "r", encoding="utf-8") as f:
    for word in f:
        vnmese_stopwords.append(word.replace("\n", ""))

In [None]:
def isnan(s):
    """Return the result of comparing ``s`` for inequality with itself.

    A float NaN is unequal to itself, so this is truthy for NaN and falsy for
    ordinary strings and numbers.
    """
    differs_from_itself = s != s
    return differs_from_itself


def normalizePost(post, tweet_tokenizer, vncorenlp, use_segment=False, remove_punc_stopword=False, lowercase_opt=False, truncation_method="head_only", length=512, stopwords=None):
    """Clean, tokenize and truncate one post.

    Steps: replace URLs with ``link`` and hashtags with ``hashtag``, collapse
    runs of dots, optionally lowercase, tokenize, optionally word-segment,
    optionally drop stop words, re-join a few number patterns split by the
    tokenizer, then truncate to ``length`` space-separated tokens.

    Args:
        post: raw post text.
        tweet_tokenizer: object with ``tokenize(str) -> list[str]``.
        vncorenlp: object with ``tokenize(str) -> list[list[str]]``; only used
            when ``use_segment`` is true.
        use_segment: run the word segmenter on the tokenized text.
        remove_punc_stopword: drop tokens found in the stop-word list.
        lowercase_opt: lowercase the text before tokenizing.
        truncation_method: ``"head_only"``, ``"tail_only"`` or ``"head_tail"``
            (first 25% + last 75% of ``length``); any other value disables
            truncation.
        length: maximum number of tokens kept.
        stopwords: optional iterable of stop words; defaults to the
            module-level ``vnmese_stopwords``.

    Returns:
        The normalised post as one space-joined string.

    NOTE(review): the URL and stop-word fixes change the text the model sees,
    so checkpoints trained with the previous preprocessing should be retrained.
    """
    post = post.strip()
    # `https?` matches both http and https (the old `http?s` missed http://);
    # `[^\s\"]+` stops at whitespace so the rest of the post is not swallowed.
    url_pattern = r"(?:https?://|www\.)[^\s\"]+"
    hashtag_pattern = r"#\s?\w+"

    post = re.sub(url_pattern, "link", post)
    post = re.sub(hashtag_pattern, "hashtag", post)
    post = re.sub(r"\.+", ".", post)
    if lowercase_opt:
        post = post.lower()
    tokens = tweet_tokenizer.tokenize(post.replace("’", "'").replace("…", "..."))

    if use_segment:
        post = " ".join(tokens)
        sentences = vncorenlp.tokenize(post.replace("’", "'").replace("…", "..."))
        # The segmenter returns one token list per sentence; flatten them.
        tokens = [t for sentence in sentences for t in sentence]

    if remove_punc_stopword:
        if stopwords is None:
            stopwords = vnmese_stopwords
        stopword_set = set(stopwords)
        # Filter whole tokens (the old code iterated over characters of the
        # joined string, which spaced out every character).
        tokens = [t for t in tokens if t not in stopword_set]
    normPost = " ".join(tokens)

    # Re-join number groups, fractions/dates and ranges split by the tokenizer.
    normPost = re.sub(r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2", normPost)
    normPost = re.sub(r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2", normPost)
    normPost = re.sub(r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2", normPost)
    if use_segment:
        # NOTE(review): URLs are replaced with "link" above, so this only
        # affects a literal "< url >" in the text — confirm it is still needed.
        normPost = normPost.replace('< url >', '<url>')
        normPost = re.sub(r"# (\w+)", r'#\1', normPost)

    words = normPost.split(" ")
    if truncation_method == "head_only":
        words = words[:length]
    elif truncation_method == "tail_only":
        words = words[-length:]
    elif truncation_method == "head_tail":
        head_len = int(length * 0.25)
        tail_len = int(length * 0.75)
        # Only cut when the post is actually too long; otherwise head and tail
        # overlap and the text would be duplicated.
        if len(words) > head_len + tail_len:
            tail = words[-tail_len:] if tail_len else []
            words = words[:head_len] + tail
    return " ".join(words)

In [None]:
def convert_tokens_to_ids(texts, tokenizer, max_seq_length=256, labels=None):
    """Encode texts into fixed-length id and attention-mask tensors.

    Every text is encoded with ``tokenizer.encode_plus`` using max-length
    padding and truncation to ``max_seq_length``.

    Returns:
        ``(input_ids, attention_masks)`` as ``torch.long`` tensors, plus a
        ``torch.long`` label tensor as third element when ``labels`` is given.
    """
    encodings = [
        tokenizer.encode_plus(text, padding='max_length', max_length=max_seq_length, truncation=True)
        for text in texts
    ]
    ids_tensor = torch.tensor([enc['input_ids'] for enc in encodings], dtype=torch.long)
    mask_tensor = torch.tensor([enc['attention_mask'] for enc in encodings], dtype=torch.long)

    if labels is None:
        return ids_tensor, mask_tensor
    return ids_tensor, mask_tensor, torch.tensor(labels, dtype=torch.long)

In [None]:
def get_max_seq(texts, tokenizer):
    """Return the token count of every text, in input order.

    Despite the name, the result is the full list of lengths, not its maximum.
    """
    return [len(tokenizer.tokenize(text)) for text in texts]

In [None]:
# All dataset CSVs live in the same Drive folder.
_DATASET_DIR = "/content/drive/MyDrive/VLSP-Fake-News-Detection"
train_path = f"{_DATASET_DIR}/public_train.csv"
val_path = f"{_DATASET_DIR}/val.csv"
test_path = f"{_DATASET_DIR}/final_private_test.csv"

In [None]:
# Pick the GPU when one is available and report its name; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

Tesla P100-PCIE-16GB


In [None]:
# Read the training and private-test CSVs into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# NLTK tokenizer applied to every post before optional word segmentation.
tweet_tokenizer = TweetTokenizer()
# VnCoreNLP restricted to the word-segmentation ('wseg') annotator.
vncorenlp = VnCoreNLP(
    '/content/drive/MyDrive/VLSP-Fake-News-Detection/vncorenlp/VnCoreNLP-1.1.1.jar',
    annotators='wseg',
)

### Eval

In [None]:
def eval(val_loader, model, epoch, device):
    """Evaluate ``model`` on ``val_loader`` and return the ROC-AUC.

    Prints the F1 score at a 0.5 threshold (with label 0 as the positive
    class), the ROC-AUC, and the predicted/actual counts of label-0 samples.

    Args:
        val_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        model: callable returning one logit per sample.
        epoch: zero-based epoch index, used only for the log line.
        device: device the input tensors are moved to.

    Returns:
        The ROC-AUC of the sigmoid probabilities against the labels.

    NOTE(review): this function shadows the built-in ``eval``.
    """
    model.eval()
    y_val = []
    batch_logits = []
    print(f"EPOCH {epoch + 1}: ===EVALUATION===")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for (input_ids, attention_mask, y_batch) in val_loader:
            y_pred = model(input_ids.to(device),
                           attention_mask=attention_mask.to(device))
            y_pred = y_pred.squeeze().detach().cpu().numpy()
            # atleast_1d keeps a final batch of size one concatenable.
            batch_logits.append(np.atleast_1d(y_pred))
            y_val.extend(y_batch.tolist())

    # Concatenate once instead of re-copying the growing array every batch.
    val_preds = sigmoid(np.concatenate(batch_logits))
    score = f1_score(y_val, val_preds > 0.5, pos_label=0)
    roc_score = roc_auc_score(y_val, val_preds)
    print(f"PREDICT {sum(val_preds <= 0.5)} INFORMATIVES")
    print(f"ACTUALY {len(y_val) - sum(y_val)} INFORMATIVES")

    print(
        f"\n----- F1 score @0.5 = {score:.4f}\nROC-AUC Score = {roc_score:.4f}")
    return roc_score

In [None]:
def predict(test_df, model, config, tweet_tokenizer, vncorenlp, model_tokenizer,
            output_path='/content/drive/MyDrive/VLSP-Fake-News-Detection/results.csv', batch_size=8):
    """Score every non-missing post in ``test_df`` and write the result to CSV.

    Args:
        test_df: DataFrame with ``post_message`` and ``id`` columns.
        model: ``torch.nn.Module`` returning one logit per sample.
        config: dict providing ``use_wordsegment`` and ``remove_punc_stopword``.
        tweet_tokenizer: tokenizer passed through to ``normalizePost``.
        vncorenlp: word segmenter passed through to ``normalizePost``.
        model_tokenizer: tokenizer used to encode the normalised text.
        output_path: CSV file receiving ``[post id, probability]`` rows.
        batch_size: inference batch size.

    Rows whose ``post_message`` is NaN are skipped and do not appear in the
    output file.
    """
    test_normalized_texts = []
    test_post_ids = []
    for _, row in test_df.iterrows():
        if isnan(row['post_message']):
            continue
        test_normalized_texts.append(
            normalizePost(row['post_message'], tweet_tokenizer, vncorenlp, use_segment=config['use_wordsegment'],
                          remove_punc_stopword=config['remove_punc_stopword'], lowercase_opt=False, truncation_method="head_only"))
        test_post_ids.append(row['id'])

    test_ids, test_masks = convert_tokens_to_ids(test_normalized_texts, model_tokenizer)

    test_dataset = torch.utils.data.TensorDataset(test_ids, test_masks)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False)

    # Run on whatever device the model lives on instead of assuming CUDA.
    model_device = next(model.parameters()).device
    model.eval()
    batch_logits = []
    with torch.no_grad():
        for i, (input_ids, masks) in enumerate(test_dataloader):
            # `i` is a batch index, so the number printed counts batches.
            if i % 20 == 0:
                print(f"Predicted {i} posts.")
            y_pred = model(input_ids.to(model_device), attention_mask=masks.to(model_device))
            y_pred = y_pred.squeeze().detach().cpu().numpy()
            # atleast_1d keeps a final batch of size one concatenable.
            batch_logits.append(np.atleast_1d(y_pred))

    test_preds = sigmoid(np.concatenate(batch_logits)).tolist()
    final_result = [[post_id, test_pred] for post_id, test_pred in zip(test_post_ids, test_preds)]

    result_df = DataFrame(final_result)
    result_df.to_csv(output_path, index=False)

### Train

In [None]:
# Training hyper-parameters.
EPOCHS = 6  # number of passes over the training set
BATCH_SIZE = 32  # samples per DataLoader batch (training and validation)
ACCUMULATION_STEPS = 6  # divisor used when computing the number of optimization steps
LEARNING_RATE = 2e-5  # learning rate passed to the optimizer

In [None]:
# Read the training and validation CSVs into DataFrames.
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

# Pick the GPU when one is available and report its name; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

Tesla P100-PCIE-16GB


In [None]:
# JSON run configuration; later cells read 'model_type', 'model_name',
# 'use_wordsegment' and 'remove_punc_stopword' from it.
config_path = '/content/drive/MyDrive/VLSP-Fake-News-Detection/config/phobert_base_1.json'
# `with` closes the file handle that a bare json.load(open(...)) would leak.
with open(config_path, 'r', encoding='utf-8') as config_file:
    single_model_config = json.load(config_file)

In [None]:
# Fix the RNG seed; the same value is also used to name the saved checkpoint file.
seed = 9627
seed_everything(seed)

In [None]:
# Build tokenizer, config and model for the architecture named in the run
# config. Every branch defines: checkpoint_dir, tokenizer, config, model and
# tsfm (the encoder sub-module that the training cell freezes/unfreezes).
if single_model_config['model_type'] == 'BERT':
  print("===Use BERT model===")
  checkpoint_dir = '/content/drive/MyDrive/VLSP-Fake-News-Detection/trained_models/vbert_base/'
  tokenizer = BertTokenizer.from_pretrained(
      single_model_config['model_name'], do_lower_case=False)
  # NOTE(review): '<url>' is added to the tokenizer but the embedding matrix is
  # not resized in this branch — confirm the token never reaches the model.
  tokenizer.add_tokens(['<url>'])
  config = BertConfig.from_pretrained(single_model_config['model_name'], num_labels=1,
                                      output_hidden_states=True)
  model = BertReINTELClassification.from_pretrained(
      single_model_config['model_name'], config=config)
  model.to(device)
  tsfm = model.bert
elif single_model_config['model_type'] == 'ROBERTA':
  print("===Use PhoBERT model===")
  checkpoint_dir = '/content/drive/MyDrive/VLSP-Fake-News-Detection/trained_models/phobert_base/'
  tokenizer = AutoTokenizer.from_pretrained(
      single_model_config['model_name'])
  # NOTE(review): '<url>' is added to the tokenizer but the resize call below is
  # commented out — confirm the token never reaches the model.
  tokenizer.add_tokens(['<url>'])
  config = RobertaConfig.from_pretrained(single_model_config['model_name'], num_labels=1,
                                          output_hidden_states=True)
  model = RobertaReINTELClassification.from_pretrained(
      single_model_config['model_name'], config=config)
  # model.resize_token_embeddings(len(tokenizer))
  model.to(device)
  tsfm = model.roberta
elif single_model_config['model_type'] == 'ELECTRA':
  print("===Use ELECTRA model===")
  checkpoint_dir = '/content/drive/MyDrive/VLSP-Fake-News-Detection/trained_models/electra/'
  tokenizer = ElectraTokenizer.from_pretrained(
      single_model_config['model_name'], do_lower_case=False)
  tokenizer.add_tokens(['<url>'])
  config = ElectraConfig.from_pretrained(single_model_config['model_name'], num_labels=1,
                                          output_hidden_states=True, output_attentions=False)
  model = ElectraReINTELClassification.from_pretrained(
      single_model_config['model_name'], config=config)
  model.resize_token_embeddings(len(tokenizer))
  model.to(device)
  tsfm = model.electra
else:
  # Fail here rather than later with a NameError on model/tokenizer/tsfm.
  raise ValueError(
      f"Model type invalid!!! Got {single_model_config['model_type']!r}; "
      "expected 'BERT', 'ROBERTA' or 'ELECTRA'.")

print(f"Seed number: {seed}")

===Use PhoBERT model===


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaReINTELClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaReINTELClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaReINTELClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaReINTELClassification were not initialized from the model checkpoint at vinai/

Seed number: 9627


In [None]:
# Normalise every non-missing training post and remember the positions of the
# missing (NaN) ones so their labels can be dropped as well.
error_label_idx = []
tr_texts = []
for row_pos, raw_post in enumerate(train_df.post_message):
    if isnan(raw_post):
        error_label_idx.append(row_pos)
        continue
    tr_texts.append(normalizePost(
        raw_post, tweet_tokenizer, vncorenlp,
        use_segment=single_model_config['use_wordsegment'],
        remove_punc_stopword=single_model_config['remove_punc_stopword'],
        lowercase_opt=False, truncation_method="head_only"))

# Keep only the labels of rows whose text was kept.
kept_rows = ~train_df.index.isin(error_label_idx)
tr_labels = train_df.iloc[kept_rows].label.to_list()

# Encode to fixed-length (256) tensors and wrap them in a shuffling loader.
train_ids, train_masks, train_labels = convert_tokens_to_ids(tr_texts, tokenizer, 256, tr_labels)
train_dataset = torch.utils.data.TensorDataset(train_ids, train_masks, train_labels)
train_sampler = torch.utils.data.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)

In [None]:
# Inspect the stop-word removal flag from the loaded run config.
single_model_config['remove_punc_stopword']

False

In [None]:
# Sanity check: normalise the post at index label 0 with word segmentation on
# and print the result.
sample_post = train_df['post_message'][0]
processed_sample = normalizePost(sample_post, tweet_tokenizer, vncorenlp, use_segment=True,
                                  remove_punc_stopword=False, lowercase_opt=False, truncation_method="head_only")
print(processed_sample)

THĂNG CẤP_BẬC HÀM ĐỐI_VỚI 2 CÁN_BỘ , CHIẾN_SỸ HY_SINH Ở ĐÀ_NẴNG Ngày 3/4 , Đại_tướng Tô_Lâm , Bộ_trưởng Bộ Công_an đã ký quyết_định số 2398 / QĐ-BCA-X 01 thăng cấp_bậc hàm từ Đại_uý lên Thiếu_tá đối_với đồng_chí Đặng_Thanh_Tuấn . Cùng ngày , Thiếu_tướng Vũ_Xuân_Viên , Giám_đốc Công_an thành_phố Đà_Nẵng ký Quyết_định số 479 / QĐ-CATP thăng cấp_bậc hàm từ Trung_sĩ lên thượng_sỹ đối_với đồng_chí Võ_Văn_Toàn . Đây là 2 cán_bộ , chiến_sỹ đã hy_sinh trong quá_trình thực_hiện nhiệm_vụ đảm_bảo an_ninh , trật_tự , phòng , chống tội_phạm trên địa_bàn thành_phố Đà_Nẵng . Trước đó , vào lúc 20h40 ngày 2/4/2020 , Công_an TP . Đà_Nẵng nhận được tin báo của nhân_dân có nhóm đối_tượng đua xe và cướp_giật người đi đường tại khu_vực quận Sơn_Trà , trong bối_cảnh toàn_quốc thực_hiện cách_ly xã_hội theo Chỉ_thị số 16 của Thủ_tướng Chính_phủ . Công_an TP . Đà_Nẵng đã chỉ_đạo cho Công_an quận Sơn_Trà triển_khai lực_lượng truy bắt nhóm đối_tượng . Trong quá_trình truy_đuổi các đối_tượng trên , Đại_uý Đặng_Th

In [None]:
# Validation set: same preprocessing as the training set, but with a
# sequential (non-shuffling) loader.
error_label_idx = []
vl_texts = []
for row_pos, raw_post in enumerate(val_df.post_message):
    if isnan(raw_post):
        error_label_idx.append(row_pos)
        continue
    vl_texts.append(normalizePost(
        raw_post, tweet_tokenizer, vncorenlp,
        use_segment=single_model_config['use_wordsegment'],
        remove_punc_stopword=single_model_config['remove_punc_stopword'],
        lowercase_opt=False, truncation_method="head_only"))

# Keep only the labels of rows whose text was kept.
kept_rows = ~val_df.index.isin(error_label_idx)
vl_labels = val_df.iloc[kept_rows].label.to_list()

val_ids, val_masks, val_labels = convert_tokens_to_ids(vl_texts, tokenizer, 256, vl_labels)
val_dataset = torch.utils.data.TensorDataset(val_ids, val_masks, val_labels)
val_sampler = torch.utils.data.SequentialSampler(val_dataset)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    sampler=val_sampler,
)

In [None]:
# ---------------------------------------------------------------------------
# Fine-tuning loop.
#
# Epoch 1 trains with the encoder (`tsfm`) frozen at a constant learning rate;
# from epoch 2 on the encoder is unfrozen and a linear warm-up/decay schedule
# is used. Gradients are accumulated over ACCUMULATION_STEPS batches, and the
# schedule advances once per optimizer update.
#
# NOTE(review): this fixes several defects of the previous version (encoder
# never frozen because of a `require_grad` typo, weight decay applied to
# bias/LayerNorm, scheduler stepped once per epoch, LR forced to 0 during the
# first epoch, accumulation constant unused), so metrics will differ from
# earlier runs.
# ---------------------------------------------------------------------------

# --- Optimizer: no weight decay on biases and LayerNorm parameters ---------
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)

# --- Schedule bookkeeping, counted in optimizer updates --------------------
updates_per_epoch = math.ceil(len(train_loader) / ACCUMULATION_STEPS)
num_train_optimization_steps = EPOCHS * updates_per_epoch
completed_updates = 0
# Constant LR while the encoder is frozen. The warm-up scheduler is created
# only when the encoder is unfrozen, because constructing it immediately sets
# the optimizer's learning rate to its step-0 value (zero).
scheduler0 = get_constant_schedule(optimizer)
active_scheduler = scheduler0

# --- Freeze the encoder for the first epoch --------------------------------
for param in tsfm.parameters():
    param.requires_grad = False

frozen = True
best_score = 0.0
for epoch in range(EPOCHS):
    if epoch > 0 and frozen:
        for param in tsfm.parameters():
            param.requires_grad = True
        frozen = False
        # NOTE(review): 100 warm-up steps is a large share of the remaining
        # updates for this dataset size — confirm the intended value.
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=100,
            num_training_steps=num_train_optimization_steps - completed_updates)
        active_scheduler = scheduler
        torch.cuda.empty_cache()
        gc.collect()

    print('\n------ Start training on Epoch: %d/%d' % (epoch + 1, EPOCHS))
    avg_loss = 0.0
    num_batches = len(train_loader)
    model.train()
    optimizer.zero_grad()
    for i, (input_ids, attention_mask, y_batch) in enumerate(train_loader):
        if i % 20 == 0 and i != 0:
            print(f"Training batch {i} of {num_batches}")
        y_pred = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            y_pred.view(-1), y_batch.float().to(device))
        # Scale so the accumulated gradient is the mean over the group.
        (loss / ACCUMULATION_STEPS).backward()
        avg_loss += loss.item() / num_batches

        # Update after every full accumulation group and at the end of the
        # epoch, so a trailing partial group is not dropped.
        if (i + 1) % ACCUMULATION_STEPS == 0 or (i + 1) == num_batches:
            optimizer.step()
            active_scheduler.step()
            optimizer.zero_grad()
            completed_updates += 1

    print(f"Average training loss: {avg_loss:.4f}")

    roc_score = eval(val_loader, model, epoch, device)
    if roc_score >= best_score:
        best_score = roc_score
        # The checkpoint file is tagged with the seed, not the epoch, so each
        # improvement overwrites the previous best.
        save_checkpoint(model, tokenizer, checkpoint_dir, epoch=seed)
        print(f"Updated best score model!!! -------<{best_score}>")
    print("==================Done=============")


------ Start training on Epoch: 1/6
Training batch 20 of 137
Training batch 40 of 137
Training batch 60 of 137
Training batch 80 of 137
Training batch 100 of 137
Training batch 120 of 137
EPOCH 1: ===EVALUATION===
PREDICT 441 INFORMATIVES
ACTUALY 730 INFORMATIVES

----- F1 score @0.5 = 0.6695
ROC-AUC Score = 0.6511
Updated best score model!!! -------<{}>

------ Start training on Epoch: 2/6
Training batch 20 of 137
Training batch 40 of 137
Training batch 60 of 137
Training batch 80 of 137
Training batch 100 of 137
Training batch 120 of 137
EPOCH 2: ===EVALUATION===
PREDICT 763 INFORMATIVES
ACTUALY 730 INFORMATIVES

----- F1 score @0.5 = 0.9538
ROC-AUC Score = 0.9432
Updated best score model!!! -------<{}>

------ Start training on Epoch: 3/6
Training batch 20 of 137
Training batch 40 of 137
Training batch 60 of 137
Training batch 80 of 137
Training batch 100 of 137
Training batch 120 of 137
EPOCH 3: ===EVALUATION===
PREDICT 775 INFORMATIVES
ACTUALY 730 INFORMATIVES

----- F1 score @0.

In [None]:
# Pick the GPU when one is available and report its name; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

# Despite its name, `test_config_path` points at the saved weight file.
test_config_path = "/content/drive/MyDrive/VLSP-Fake-News-Detection/trained_models/phobert_base/model_9627.bin"
test_config = RobertaConfig.from_pretrained(
    "/content/drive/MyDrive/VLSP-Fake-News-Detection/trained_models/phobert_base",
    num_labels=1,
    output_hidden_states=True,
)
test_model = RobertaReINTELClassification.from_pretrained(test_config_path, config=test_config)
test_model.to(device)
test_tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/VLSP-Fake-News-Detection/trained_models/phobert_base",
    do_lower_case=False,
)

Tesla P100-PCIE-16GB


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Score the private test set with the reloaded checkpoint; predict() writes the
# probabilities to results.csv on Drive.
predict(test_df, test_model, single_model_config, tweet_tokenizer, vncorenlp, test_tokenizer)

Predicted 0 posts.
Predicted 20 posts.
Predicted 40 posts.
Predicted 60 posts.
Predicted 80 posts.
Predicted 100 posts.
Predicted 120 posts.
Predicted 140 posts.
Predicted 160 posts.
Predicted 180 posts.
Predicted 200 posts.


In [None]:
# Smoke test: run the in-memory model on one hand-written sentence
# (underscore-joined words, as in the segmented training text).
sample_inputs = tokenizer("nộp hồ_sơ để trúng_tuyển vào các trường đại_học lớn", return_tensors="pt")
# Make sure dropout is disabled and no autograd graph is built for inference.
model.eval()
with torch.no_grad():
    sample_outputs = model(sample_inputs['input_ids'].to(device),
                           attention_mask=sample_inputs['attention_mask'].to(device))

In [None]:
# Move the raw model output to the CPU as a NumPy value (no sigmoid is applied here).
sample_pred = sample_outputs.squeeze().detach().cpu().numpy()

In [None]:
# Preview a few normalised training posts instead of dumping the whole corpus
# into the cell output.
tr_texts[:3]

['THĂNG CẤP_BẬC HÀM ĐỐI_VỚI 2 CÁN_BỘ , CHIẾN_SỸ HY_SINH Ở ĐÀ_NẴNG Ngày 3/4 , Đại_tướng Tô_Lâm , Bộ_trưởng Bộ Công_an đã ký quyết_định số 2398 / QĐ-BCA-X 01 thăng cấp_bậc hàm từ Đại_uý lên Thiếu_tá đối_với đồng_chí Đặng_Thanh_Tuấn . Cùng ngày , Thiếu_tướng Vũ_Xuân_Viên , Giám_đốc Công_an thành_phố Đà_Nẵng ký Quyết_định số 479 / QĐ-CATP thăng cấp_bậc hàm từ Trung_sĩ lên thượng_sỹ đối_với đồng_chí Võ_Văn_Toàn . Đây là 2 cán_bộ , chiến_sỹ đã hy_sinh trong quá_trình thực_hiện nhiệm_vụ đảm_bảo an_ninh , trật_tự , phòng , chống tội_phạm trên địa_bàn thành_phố Đà_Nẵng . Trước đó , vào lúc 20h40 ngày 2/4/2020 , Công_an TP . Đà_Nẵng nhận được tin báo của nhân_dân có nhóm đối_tượng đua xe và cướp_giật người đi đường tại khu_vực quận Sơn_Trà , trong bối_cảnh toàn_quốc thực_hiện cách_ly xã_hội theo Chỉ_thị số 16 của Thủ_tướng Chính_phủ . Công_an TP . Đà_Nẵng đã chỉ_đạo cho Công_an quận Sơn_Trà triển_khai lực_lượng truy bắt nhóm đối_tượng . Trong quá_trình truy_đuổi các đối_tượng trên , Đại_uý Đặng_