In [None]:
import csv
import os

import torch
import torch.nn as nn

# Notebook-wide settings shared by the data, training and prediction cells.
config = {
    'train_file_path': '/content/drive/MyDrive/data/train.csv',
    'test_file_path': '/content/drive/MyDrive/data/test.csv',
    'train_val_ratio': 0.1,  # fraction of the training rows held out for validation (read_data)
    'vocab_size': 30000,  # number of most frequent tokens kept by get_vocab; overwritten after get_embedding
    'batch_size': 64,
    'num_epoches': 2,
    'learning_rate': 1e-4,
    'logging_step': 300,  # run an evaluation every this many optimisation steps (train)
    'seed': 2021
}

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu' 

import random
import numpy as np

def seed_everything(seed):
  """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed`."""
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for apply_seed in seeders:
    apply_seed(seed)

# Seed every random number generator once, before any data or model code runs.
seed_everything(config['seed'])


In [None]:
from collections import Counter
from tqdm import tqdm
import jieba
def get_vocab(config):
  """Build the vocabulary from the last column of the training CSV.

  The file is parsed with the `csv` module so that it is read the same way
  `pd.read_csv` reads it in `read_data`: the header row is skipped and quoted
  sentences that contain commas are kept whole instead of being cut at the
  last comma.

  Args:
    config: dict providing 'train_file_path' (CSV path) and 'vocab_size'
      (how many of the most frequent tokens to keep).

  Returns:
    A set with the `vocab_size` most frequent tokens produced by `jieba.cut`.
  """
  token_counter = Counter()

  # newline='' is what the csv module expects so quoted line breaks survive.
  with open(config['train_file_path'], 'r', encoding='utf8', newline='') as f:
    rows = list(csv.reader(f))

  # rows[0] is the header line; counting it would add the column name as a token.
  data_rows = rows[1:]
  for row in tqdm(data_rows, desc='Counting tokens', total=len(data_rows)):
    if not row:  # blank line
      continue
    sent = row[-1].strip()
    token_counter.update(jieba.cut(sent))

  vocab = set(token for token, _ in token_counter.most_common(config['vocab_size']))
  return vocab


In [None]:
# Tokenise the training file once and keep the most frequent tokens.
vocab = get_vocab(config)

Counting tokens: 100%|██████████| 63361/63361 [00:19<00:00, 3281.45it/s]


In [None]:
import bz2
# Exploration: load the whole compressed vector file into memory.
# bz2.open defaults to binary mode, so every element of token_vector is a bytes line.
with bz2.open('/content/drive/MyDrive/sgns.weibo.word.bz2') as f:
  token_vector = f.readlines()

In [None]:
# Exploration: confirm that readlines() returned a list of lines.
type(token_vector)

list

In [None]:
# Exploration: show the first three raw lines (the header line, then token + vector lines).
for i, line in enumerate(token_vector):
  if i == 3:
    break
  print(line)

b'195202 300\n'
b'\xef\xbc\x8c 0.094386 -0.200944 -0.030828 0.277130 -0.074674 0.239691 0.345185 0.298053 -0.026679 -0.077352 0.245854 -0.075995 0.058849 -0.176755 -0.072721 -0.246469 -0.182233 0.267472 0.109897 -0.266215 0.176660 -0.101581 0.374443 0.304319 0.024169 -0.158494 -0.254765 -0.265324 0.225498 -0.126215 -0.207172 -0.334429 -0.003584 -0.065530 0.174206 -0.097252 0.114595 -0.158193 -0.099948 -0.145332 -0.076565 -0.119460 0.074718 -0.056694 0.394867 -0.127122 0.270542 0.087482 -0.114715 0.124381 -0.149974 0.192933 0.131023 0.099271 -0.062209 0.079382 0.000578 -0.166215 0.155815 -0.279675 -0.184237 0.123209 0.043216 -0.308681 0.036135 0.064121 0.037674 0.026703 0.307341 0.089780 -0.057681 -0.252376 -0.262095 -0.049573 -0.093128 -0.391340 0.025183 0.078807 0.232554 -0.058487 0.153081 -0.126429 -0.182596 0.114655 0.156500 -0.037163 -0.212255 0.021135 -0.203646 -0.277985 -0.043363 0.133399 -0.164617 -0.084675 -0.411904 -0.152554 0.224462 0.135164 0.139439 -0.167746 0.032351 0.0899

In [None]:
# Exploration: split one entry (index 11) into its token and its vector components.
for i, line in enumerate(token_vector):
  if i == 11:
    line = line.split()
    # The first field is the UTF-8 encoded token; the remaining fields are the vector values.
    print(line[0].decode('utf8'))
    print(line[1:])
    print(len(line[1:]))
    break

和
[b'-0.008708', b'0.051657', b'0.385103', b'0.226086', b'-0.006173', b'-0.074551', b'0.193000', b'-0.323188', b'0.102296', b'0.194834', b'0.459722', b'-0.092830', b'-0.051126', b'-0.391280', b'-0.004931', b'-0.443963', b'0.133001', b'0.294608', b'0.132881', b'-0.276948', b'0.335401', b'-0.227581', b'-0.156450', b'0.206760', b'-0.264158', b'-0.134839', b'0.151620', b'-0.187104', b'0.608057', b'-0.047043', b'-0.327332', b'-0.463169', b'-0.142783', b'-0.150008', b'-0.030358', b'0.219540', b'0.087702', b'-0.113188', b'-0.474015', b'-0.169074', b'-0.053091', b'0.095054', b'0.170131', b'0.168307', b'0.059609', b'0.001021', b'0.045364', b'0.005945', b'0.158983', b'0.301111', b'0.203937', b'0.048265', b'0.022152', b'-0.081286', b'0.043097', b'-0.164395', b'0.475869', b'0.339868', b'0.630579', b'-0.108776', b'-0.287151', b'0.083748', b'-0.180241', b'0.012283', b'0.286583', b'0.023274', b'0.089716', b'-0.092454', b'0.261729', b'-0.314279', b'-0.225247', b'-0.147503', b'0.040399', b'-0.017565', 

In [None]:
def get_embedding(vocab, vector_path='/content/drive/MyDrive/sgns.weibo.word.bz2'):
  """Load pretrained vectors for the tokens in `vocab` and build the id mapping.

  Ids 0-3 are reserved for '<pad>', '<unk>', '<bos>' and '<eos>'; every vocab
  token that has a pretrained vector gets the next free id in file order.
  Vocab tokens without a pretrained vector get no id of their own.

  Args:
    vocab: container of tokens to keep (membership is tested with `in`).
    vector_path: bz2-compressed word2vec text file; the first line holds
      "<token count> <vector size>".

  Returns:
    (emb_mat, token2id, vocab_size) where emb_mat is a float tensor with one
    row per id, token2id maps token -> id, and vocab_size is the number of
    rows of emb_mat. (Previously `len(vocab) + 4` was returned, which
    over-counted whenever a vocab token had no pretrained vector.)
  """
  PAD, UNK, BOS, EOS = '<pad>', '<unk>', '<bos>', '<eos>'
  token2embedding = {}

  # The file is read in binary mode and split as bytes (ASCII whitespace only)
  # so that tokens containing exotic Unicode whitespace are not split apart.
  with bz2.open(vector_path) as f:
    meta_info = f.readline().split()
    num_tokens, emb_size = int(meta_info[0]), int(meta_info[1])
    print(f'{num_tokens} tokens in vectors file in total, vector size is {emb_size}')

    # Stream the remaining lines instead of materialising the whole file.
    for line in tqdm(f, total=num_tokens):
      parts = line.split()
      # Skip blank or malformed lines; a wrong-length row would break the matrix.
      if len(parts) != emb_size + 1:
        continue

      token = parts[0].decode('utf8')
      if token in vocab:
        token2embedding[token] = [float(num) for num in parts[1:]]

  # Regular tokens start at 4; the special tokens are assigned afterwards so
  # they always own ids 0-3.
  token2id = {token: idx for idx, token in enumerate(token2embedding.keys(), 4)}
  token2id[PAD] = 0
  token2id[UNK] = 1
  token2id[BOS] = 2
  token2id[EOS] = 3

  special_rows = [
      [.0] * emb_size,                      # <pad>
      [.0] * emb_size,                      # <unk>
      np.random.random(emb_size).tolist(),  # <bos>
      np.random.random(emb_size).tolist(),  # <eos>
  ]
  # dict preserves insertion order, so row 4 + k belongs to the k-th kept token.
  emb_mat = special_rows + list(token2embedding.values())

  return torch.tensor(emb_mat, dtype=torch.float), token2id, len(emb_mat)

In [None]:
# Build the embedding matrix and the token -> id mapping; config['vocab_size'] is
# overwritten with the size returned by get_embedding.
emb_mat, token2id, config['vocab_size'] = get_embedding(vocab)

b'195202' tokens in vectors file in total, vector size is b'300'


100%|██████████| 195202/195202 [00:05<00:00, 38967.07it/s]


In [None]:
# Exploration: look at one embedding row (id 5) as a sanity check.
emb_mat[5]

tensor([-1.2404e-01, -5.3688e-02,  1.5796e-01, -4.7875e-02,  4.9531e-02,
         1.8500e-01,  2.0083e-01, -1.6088e-01,  1.1068e-01,  9.2775e-02,
         3.1865e-01, -3.2108e-01,  1.9237e-02, -3.6465e-01, -2.0035e-01,
        -1.8701e-01, -2.2670e-02,  1.1204e-01,  1.3903e-01, -1.0286e-01,
         1.0237e-01,  8.8739e-02,  1.4025e-02,  3.4365e-01, -7.0068e-02,
        -1.9792e-01,  2.0745e-02, -4.7428e-02,  3.4013e-01, -1.8864e-01,
        -2.5723e-01, -2.0975e-01,  1.8590e-01, -1.8940e-01,  1.7905e-01,
        -1.0504e-01,  1.7725e-01, -1.1841e-01, -2.7513e-01, -2.7510e-01,
        -2.3698e-01, -1.9499e-02,  1.2009e-01, -1.9623e-01,  3.3145e-01,
        -2.7213e-01,  1.7001e-02,  9.1802e-02, -5.0416e-02, -9.1448e-02,
        -6.5205e-02,  3.0300e-01,  2.3679e-01,  1.8119e-01,  4.8710e-03,
         2.5544e-02, -2.4659e-02, -9.3140e-03,  2.2470e-01, -1.7494e-01,
        -2.9566e-01,  2.6375e-01,  4.0185e-01, -1.5691e-01, -1.8787e-01,
        -4.7452e-02, -1.2100e-04,  2.2551e-01,  4.1

In [None]:
# Exploration: dump the token -> id mapping.
# NOTE(review): this prints the entire dictionary; consider showing only a small sample.
print(token2id)

{'，': 4, '的': 5, '。': 6, '@': 7, '！': 8, '了': 9, '、': 10, '：': 11, '】': 12, '是': 13, '和': 14, '有': 15, '在': 16, '一': 17, '？': 18, '“': 19, '不': 20, '个': 21, '我': 22, '”': 23, '就': 24, '人': 25, '都': 26, '#': 27, '也': 28, '一个': 29, '啊': 30, '你': 31, '；': 32, '》': 33, '这': 34, '要': 35, '.': 36, '-': 37, '好': 38, '去': 39, '我们': 40, '上': 41, '《': 42, ':': 43, '小': 44, '1': 45, '~': 46, '还': 47, '为': 48, '与': 49, '大': 50, '（': 51, '年': 52, '很': 53, '会': 54, '吧': 55, '中': 56, '到': 57, '被': 58, '来': 59, '说': 60, '中国': 61, '能': 62, '可以': 63, '将': 64, ']': 65, '吃': 66, '[': 67, '让': 68, '就是': 69, '把': 70, '2': 71, '月': 72, '看': 73, '最': 74, '天': 75, '之': 76, '多': 77, '给': 78, '北京': 79, '他': 80, '微博': 81, '这个': 82, '）': 83, '又': 84, '自己': 85, '对': 86, '里': 87, '着': 88, '旅游': 89, '没有': 90, '用': 91, '做': 92, '吗': 93, '后': 94, '3': 95, '等': 96, '【': 97, '酒店': 98, '啦': 99, '爱': 100, '大家': 101, '日': 102, '而': 103, '(': 104, '从': 105, '呢': 106, '不是': 107, '想': 108, '三': 109, '但': 110, '?': 111, '还是': 1

In [None]:
def tokenizer(sent, token2id):
  """Segment `sent` with jieba and map every token to its id.

  Args:
    sent: sentence to segment.
    token2id: mapping token -> id. Out-of-vocabulary tokens are mapped to the
      id stored under '<unk>', or to 1 when the mapping has no such entry.

  Returns:
    List of integer ids, one per token produced by `jieba.cut`.
  """
  # Look the unknown id up instead of hard-coding it, so the function stays
  # correct for mappings that place '<unk>' somewhere else.
  unk_id = token2id.get('<unk>', 1)
  # .get() avoids a KeyError on out-of-vocabulary tokens.
  return [token2id.get(token, unk_id) for token in jieba.cut(sent)]

In [None]:
# def tokenizer(sent, token2id):
#   ids = [token2id[token] for token in jieba.cut(sent)]
#   return ids


# '响个' in token2id.keys()

In [None]:
import pandas as pd
from collections import defaultdict
def read_data(config, token2id, mode='train'):
  """Read a CSV split and convert its sentences to token-id lists.

  The sentence is taken from the last column and, for the training file, the
  label from the second column. Columns are selected by position with `.iloc`
  rather than `row[1]` / `row[-1]`, whose positional fallback on a
  string-labelled Series is deprecated in pandas.

  Args:
    config: dict with '<mode>_file_path' and, for training, 'train_val_ratio'.
    token2id: token -> id mapping forwarded to `tokenizer`.
    mode: 'train' to get a train/validation split, anything else for test data.

  Returns:
    mode == 'train': (X_train, y_train, X_val, y_val, label2id, id2label).
      The first `int(train_val_ratio * n_rows)` rows form the validation split.
    otherwise: (X_test, y_test) where y_test holds placeholder zeros.
    Each X_* is a defaultdict(list) with the sequences under 'input_ids';
    each y_* is a long tensor.
  """
  data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')

  sentences = data_df.iloc[:, -1].tolist()
  encoded = [
      tokenizer(sentence, token2id)
      for sentence in tqdm(sentences, desc=f'Preprocesing {mode} data', total=len(sentences))
  ]

  if mode != 'train':
    X_test = defaultdict(list)
    X_test['input_ids'] = encoded
    # The test file has no usable labels; zeros keep the batch layout uniform.
    y_test = torch.zeros(len(encoded), dtype=torch.long)
    return X_test, y_test

  labels = data_df.iloc[:, 1].tolist()
  # Split by row position so a non-default index cannot distort the split.
  num_val = int(config['train_val_ratio'] * len(data_df))

  X_train, X_val = defaultdict(list), defaultdict(list)
  X_val['input_ids'] = encoded[:num_val]
  X_train['input_ids'] = encoded[num_val:]
  y_val, y_train = labels[:num_val], labels[num_val:]

  # Build the label map from ALL labels: a class that only occurs in the
  # validation rows must not raise a KeyError below.
  label2id = {label: i for i, label in enumerate(np.unique(labels))}
  id2label = {i: label for label, i in label2id.items()}

  y_train = torch.tensor([label2id[label] for label in y_train], dtype=torch.long)
  y_val = torch.tensor([label2id[label] for label in y_val], dtype=torch.long)

  return X_train, y_train, X_val, y_val, label2id, id2label

In [None]:
# Preprocess both splits once; label2id from this call is used later to size the classifier.
# NOTE(review): build_dataloader calls read_data again, so this work is repeated there.
X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token2id, mode='train')
X_test, y_test = read_data(config, token2id, mode='test')

Preprocesing train data: 100%|██████████| 63360/63360 [00:13<00:00, 4625.80it/s]
Preprocesing test data: 100%|██████████| 10000/10000 [00:02<00:00, 4615.75it/s]


In [None]:
from torch.utils.data import Dataset
class TNEWSDataset(Dataset):
  """Map-style dataset pairing token-id sequences with their labels."""

  def __init__(self, X, y):
    # X: mapping that stores the sequences under 'input_ids'; y: label tensor.
    self.x, self.y = X, y

  def __len__(self):
    # One example per entry along the first dimension of the label tensor.
    return self.y.size(0)

  def __getitem__(self, idx):
    """Return the idx-th example as a dict with 'input_ids' and 'label'."""
    sample = dict(input_ids=self.x['input_ids'][idx], label=self.y[idx])
    return sample


In [None]:
def collate_fn(examples):
  """Merge dataset examples into one batch, right-padding the sequences with 0.

  Args:
    examples: list of dicts with 'input_ids' (list of ints) and 'label'.

  Returns:
    Dict with 'input_ids' (long tensor, batch x longest sequence) and
    'labels' (long tensor, one entry per example).
  """
  sequences = [example['input_ids'] for example in examples]
  labels = [example['label'] for example in examples]

  longest = max(map(len, sequences))
  batch_ids = torch.zeros(len(sequences), longest, dtype=torch.long)

  # Each `row` is a view into batch_ids, so the slice assignment fills the batch in place.
  for row, seq in zip(batch_ids, sequences):
    row[:len(seq)] = torch.tensor(seq, dtype=torch.long)

  return dict(input_ids=batch_ids, labels=torch.tensor(labels, dtype=torch.long))


In [None]:
import collections
from torch.utils.data import DataLoader
def build_dataloader(config, token_to_id=None):
  """Read the train/test files and wrap the three splits in DataLoaders.

  Args:
    config: dict with the keys `read_data` needs plus 'batch_size'. An optional
      'num_workers' entry overrides the number of loader worker processes.
    token_to_id: token -> id mapping handed to `read_data`. When omitted, the
      notebook-level `token2id` is used, as before.

  Returns:
    (id2label, train_dataloader, val_dataloader, test_dataloader). Only the
    training loader shuffles.
  """
  if token_to_id is None:
    token_to_id = token2id  # notebook-level mapping created by get_embedding

  X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, token_to_id, mode='train')
  X_test, y_test = read_data(config, token_to_id, mode='test')

  # Never ask for more workers than there are CPUs: a fixed 4 triggers PyTorch's
  # "suggested max number of worker" warning on small machines.
  num_workers = config.get('num_workers', min(4, os.cpu_count() or 1))

  def make_loader(features, targets, shuffle):
    # One place for the settings shared by all three loaders.
    return DataLoader(dataset=TNEWSDataset(features, targets), batch_size=config['batch_size'],
                      num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)

  train_dataloader = make_loader(X_train, y_train, shuffle=True)
  val_dataloader = make_loader(X_val, y_val, shuffle=False)
  test_dataloader = make_loader(X_test, y_test, shuffle=False)

  return id2label, train_dataloader, val_dataloader, test_dataloader

In [None]:
# Create the loaders used by the training, evaluation and prediction cells.
id2label, train_dataloader, val_dataloader, test_dataloader = build_dataloader(config)

Preprocesing train data: 100%|██████████| 63360/63360 [00:14<00:00, 4466.97it/s]
Preprocesing test data: 100%|██████████| 10000/10000 [00:02<00:00, 4599.23it/s]
  cpuset_checked))


In [None]:
# Smoke test: fetch a single training batch, print its size and its contents, then stop.
for batch in train_dataloader:
  print(len(batch['input_ids']))
  print(batch)
  break

  cpuset_checked))


64
{'input_ids': tensor([[  989,     5,     1,  ...,     0,     0,     0],
        [  956,  1111,   264,  ...,     0,     0,     0],
        [ 6113,  1940,  5204,  ...,     0,     0,     0],
        ...,
        [  190,  9073,    14,  ...,     0,     0,     0],
        [13892,     1,  1111,  ...,     0,     0,     0],
        [ 1015,  8920,  1372,  ...,     0,     0,     0]]), 'labels': tensor([14, 14,  5,  1,  4,  7,  9,  2, 10,  8,  4, 11,  6,  8,  4,  1, 11,  1,
         7, 10,  9, 10,  8,  0,  5,  2,  5,  4,  8,  4, 13,  3,  6,  6,  6,  8,
        11, 11,  9,  3,  3,  4,  2, 11, 14,  5, 11, 11,  9,  5,  7,  4,  4,  2,
         9,  8,  6,  2,  4,  4,  7,  3,  6, 11])}


In [None]:
# Hyper-parameters of the TextCNN model defined below.
model_config = {
    'embedding_pretrained': emb_mat,  # pretrained matrix returned by get_embedding
    'num_filters': 256,  # output channels of every convolution
    'emb_size': emb_mat.shape[1],  # embedding width = kernel width
    'dropout': 0.3,
    'filter_sizes': [2,3,5],  # kernel heights, i.e. how many consecutive tokens one filter spans
    'num_classes': len(label2id)  # one output per label found by read_data
}

In [None]:
import torch.nn.functional as F

class Model(nn.Module):
  """TextCNN classifier: frozen pretrained embeddings, parallel convolutions
  with max-over-time pooling, dropout and a linear output layer.

  Expected `config` keys: 'embedding_pretrained' (2-D float tensor),
  'num_filters', 'emb_size', 'dropout', 'filter_sizes', 'num_classes'.
  """

  def __init__(self, config):
    super().__init__()

    # freeze=True keeps the pretrained word vectors fixed during training.
    self.embedding = nn.Embedding.from_pretrained(config['embedding_pretrained'], freeze=True)

    # One convolution per kernel height; each kernel spans the full embedding width.
    self.convs = nn.ModuleList([nn.Conv2d(1, config['num_filters'], (k, config['emb_size'])) for k in config['filter_sizes']])

    self.dropout = nn.Dropout(config['dropout'])

    self.fc = nn.Linear(len(config['filter_sizes'])*config['num_filters'], config['num_classes'])

    # A convolution fails on inputs shorter than its kernel, so remember the
    # shortest sequence length every kernel can handle.
    self.min_seq_len = max(config['filter_sizes'])

  def convs_and_pool(self, x, conv):
    """Apply one convolution + ReLU, then max-pool over the sequence axis."""
    # after the convolution: [batch_size, out_channels, seq_len_out, 1]
    # after squeeze:         [batch_size, out_channels, seq_len_out]
    x = F.relu(conv(x)).squeeze(3)

    # after pooling: [batch_size, out_channels, 1]
    # after squeeze: [batch_size, out_channels]
    x = F.max_pool1d(x, x.size(2)).squeeze(2)
    return x

  def forward(self, input_ids, labels=None):
    """Compute logits and, when `labels` is given, the cross-entropy loss.

    Args:
      input_ids: long tensor [batch_size, seq_len].
      labels: optional long tensor [batch_size].

    Returns:
      (loss, logits) when labels are provided, otherwise (logits,).
    """
    # out [batch_size, seq_len, embedding_dim]
    out = self.embedding(input_ids)

    # Zero-pad batches shorter than the largest kernel instead of crashing in
    # the convolution. This equals padding with id 0 when row 0 of the
    # pretrained matrix is all zeros, as built by get_embedding.
    shortfall = self.min_seq_len - out.size(1)
    if shortfall > 0:
      out = F.pad(out, (0, 0, 0, shortfall))

    # H: seq_len; W: embedding_dim
    # out [batch_size, 1, seq_len, embedding_dim]
    out = out.unsqueeze(1)
    # out [batch_size, len(filter_sizes) * num_filters]
    out = torch.cat([self.convs_and_pool(out, conv) for conv in self.convs], 1)

    out = self.dropout(out)

    out = self.fc(out)

    output = (out, )

    if labels is not None:
      loss = F.cross_entropy(out, labels)
      output = (loss, ) + output
    # with labels:    (loss, out)
    # without labels: (out,)
    return output
  

In [None]:
# Instantiate the network; train() moves it to config['device'].
model = Model(model_config)

In [None]:
from sklearn.metrics import f1_score
def evaluation(model, config, val_dataloader):
  """Run `model` over `val_dataloader` and score it.

  Returns:
    (mean loss per batch, macro-averaged F1 over all examples). The model is
    left in eval mode.
  """
  model.eval()
  device = config['device']
  total_loss = 0.
  gold_chunks, pred_chunks = [], []

  with torch.no_grad():
    for batch in tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader)):
      # Keep the gold labels as yielded by the loader before moving the batch.
      gold_chunks.append(batch['labels'])
      on_device = {name: tensor.to(device) for name, tensor in batch.items()}

      loss, logits = model(**on_device)[:2]
      total_loss += loss.item()
      pred_chunks.append(logits.argmax(dim=-1).detach().cpu())

  mean_loss = total_loss/len(val_dataloader)
  gold = torch.cat(gold_chunks, dim=0).numpy()
  predicted = torch.cat(pred_chunks, dim=0).numpy()

  return mean_loss, f1_score(gold, predicted, average='macro')


In [None]:
from torch.optim import AdamW
from tqdm import trange
def train(model, config, train_dataloader, val_dataloader):
  """Train `model` with AdamW, evaluating every `config['logging_step']` steps.

  Args:
    model: module whose forward returns a tuple starting with the loss.
    config: dict with 'device', 'learning_rate', 'num_epoches', 'logging_step'.
    train_dataloader: yields dict batches that are unpacked into the model.
    val_dataloader: passed to `evaluation` at every logging step.

  Returns:
    The same model object, holding the weights from the last optimisation
    step (no best-checkpoint selection is performed).
  """
  device = config['device']
  # Move the model before creating the optimizer so the optimizer is built over
  # the parameters that are actually trained.
  model.to(device)
  optimizer = AdamW(model.parameters(), lr=config['learning_rate'])

  global_steps = 0
  train_loss = 0.
  logging_loss = 0.

  for epoch in trange(config['num_epoches']):
    model.train()

    for batch in tqdm(train_dataloader, desc='Training', total=len(train_dataloader)):
      batch = {item: value.to(device) for item, value in batch.items()}

      loss = model(**batch)[0]

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # .item() stores a plain float; adding the tensor itself would keep every
      # step's autograd history alive for the whole run.
      train_loss += loss.item()
      global_steps += 1

      if global_steps % config['logging_step'] == 0:
        # Mean training loss since the previous logging step.
        print_train_loss = (train_loss - logging_loss) / config['logging_step']
        logging_loss = train_loss
        print(f'train loss: {print_train_loss}')

        avg_val_loss, f1 = evaluation(model, config, val_dataloader)
        print(f'avg_val_loss: {avg_val_loss} \n f1:{f1}')
        # evaluation() switches to eval mode; switch back before continuing.
        model.train()

  return model

In [None]:
# train() returns the very object it was given, so best_model and model are the same
# network, holding the weights from the final training step.
best_model = train(model, config, train_dataloader, val_dataloader)

  0%|          | 0/2 [00:00<?, ?it/s]
  cpuset_checked))

Training:   0%|          | 1/891 [00:00<06:23,  2.32it/s][A
Training:   0%|          | 2/891 [00:00<04:31,  3.27it/s][A
Training:   0%|          | 3/891 [00:00<03:54,  3.79it/s][A
Training:   0%|          | 4/891 [00:01<03:28,  4.25it/s][A
Training:   1%|          | 5/891 [00:01<03:13,  4.57it/s][A
Training:   1%|          | 6/891 [00:01<03:07,  4.72it/s][A
Training:   1%|          | 7/891 [00:01<02:57,  4.97it/s][A
Training:   1%|          | 8/891 [00:01<02:50,  5.17it/s][A
Training:   1%|          | 9/891 [00:02<02:59,  4.90it/s][A
Training:   1%|          | 10/891 [00:02<03:16,  4.48it/s][A
Training:   1%|          | 11/891 [00:02<03:16,  4.47it/s][A
Training:   1%|▏         | 12/891 [00:02<03:05,  4.74it/s][A
Training:   1%|▏         | 13/891 [00:02<02:53,  5.06it/s][A
Training:   2%|▏         | 14/891 [00:03<02:54,  5.03it/s][A
Training:   2%|▏         | 15/891 [00:03<02:58,  4.92it/s][A
Training:   2%|▏     

avg_val_loss: 1.4553284452419089 
 f1:0.5202485257190591



Training:  34%|███▍      | 301/891 [01:15<24:32,  2.49s/it][A
Training:  34%|███▍      | 302/891 [01:15<17:40,  1.80s/it][A
Training:  34%|███▍      | 303/891 [01:15<12:54,  1.32s/it][A
Training:  34%|███▍      | 304/891 [01:15<09:39,  1.01it/s][A
Training:  34%|███▍      | 305/891 [01:15<07:18,  1.34it/s][A
Training:  34%|███▍      | 306/891 [01:16<05:38,  1.73it/s][A
Training:  34%|███▍      | 307/891 [01:16<04:30,  2.16it/s][A
Training:  35%|███▍      | 308/891 [01:16<03:42,  2.62it/s][A
Training:  35%|███▍      | 309/891 [01:16<03:10,  3.06it/s][A
Training:  35%|███▍      | 310/891 [01:16<02:47,  3.47it/s][A
Training:  35%|███▍      | 311/891 [01:17<02:32,  3.79it/s][A
Training:  35%|███▌      | 312/891 [01:17<02:23,  4.03it/s][A
Training:  35%|███▌      | 313/891 [01:17<02:18,  4.18it/s][A
Training:  35%|███▌      | 314/891 [01:17<02:16,  4.22it/s][A
Training:  35%|███▌      | 315/891 [01:17<02:08,  4.49it/s][A
Training:  35%|███▌      | 316/891 [01:18<02:06,  4.56

avg_val_loss: 1.466108774295961 
 f1:0.5230206388008288



Training:  67%|██████▋   | 601/891 [02:28<11:57,  2.47s/it][A
Training:  68%|██████▊   | 602/891 [02:29<08:40,  1.80s/it][A
Training:  68%|██████▊   | 603/891 [02:29<06:24,  1.34s/it][A
Training:  68%|██████▊   | 604/891 [02:29<04:50,  1.01s/it][A
Training:  68%|██████▊   | 605/891 [02:29<03:40,  1.29it/s][A
Training:  68%|██████▊   | 606/891 [02:29<02:51,  1.66it/s][A
Training:  68%|██████▊   | 607/891 [02:30<02:16,  2.08it/s][A
Training:  68%|██████▊   | 608/891 [02:30<01:51,  2.54it/s][A
Training:  68%|██████▊   | 609/891 [02:30<01:37,  2.89it/s][A
Training:  68%|██████▊   | 610/891 [02:30<01:26,  3.27it/s][A
Training:  69%|██████▊   | 611/891 [02:30<01:16,  3.64it/s][A
Training:  69%|██████▊   | 612/891 [02:31<01:08,  4.05it/s][A
Training:  69%|██████▉   | 613/891 [02:31<01:03,  4.37it/s][A
Training:  69%|██████▉   | 614/891 [02:31<01:00,  4.55it/s][A
Training:  69%|██████▉   | 615/891 [02:31<00:59,  4.63it/s][A
Training:  69%|██████▉   | 616/891 [02:31<00:58,  4.74

avg_val_loss: 1.4702071114019915 
 f1:0.5206100566888275



Training:   1%|          | 11/891 [00:13<27:10,  1.85s/it][A
Training:   1%|▏         | 12/891 [00:13<19:42,  1.35s/it][A
Training:   1%|▏         | 13/891 [00:13<14:34,  1.00it/s][A
Training:   2%|▏         | 14/891 [00:14<11:05,  1.32it/s][A
Training:   2%|▏         | 15/891 [00:14<08:45,  1.67it/s][A
Training:   2%|▏         | 16/891 [00:14<07:14,  2.02it/s][A
Training:   2%|▏         | 17/891 [00:14<05:56,  2.45it/s][A
Training:   2%|▏         | 18/891 [00:14<05:00,  2.90it/s][A
Training:   2%|▏         | 19/891 [00:15<04:22,  3.32it/s][A
Training:   2%|▏         | 20/891 [00:15<03:51,  3.76it/s][A
Training:   2%|▏         | 21/891 [00:15<03:33,  4.07it/s][A
Training:   2%|▏         | 22/891 [00:15<03:25,  4.22it/s][A
Training:   3%|▎         | 23/891 [00:15<03:16,  4.41it/s][A
Training:   3%|▎         | 24/891 [00:16<03:07,  4.63it/s][A
Training:   3%|▎         | 25/891 [00:16<03:09,  4.56it/s][A
Training:   3%|▎         | 26/891 [00:16<03:02,  4.73it/s][A
Trainin

avg_val_loss: 1.4815214703781436 
 f1:0.5235260864320253



Training:  35%|███▍      | 311/891 [01:27<17:42,  1.83s/it][A
Training:  35%|███▌      | 312/891 [01:28<13:00,  1.35s/it][A
Training:  35%|███▌      | 313/891 [01:28<09:36,  1.00it/s][A
Training:  35%|███▌      | 314/891 [01:28<07:17,  1.32it/s][A
Training:  35%|███▌      | 315/891 [01:28<05:42,  1.68it/s][A
Training:  35%|███▌      | 316/891 [01:28<04:34,  2.09it/s][A
Training:  36%|███▌      | 317/891 [01:29<03:49,  2.50it/s][A
Training:  36%|███▌      | 318/891 [01:29<03:13,  2.96it/s][A
Training:  36%|███▌      | 319/891 [01:29<02:52,  3.31it/s][A
Training:  36%|███▌      | 320/891 [01:29<02:34,  3.68it/s][A
Training:  36%|███▌      | 321/891 [01:29<02:18,  4.10it/s][A
Training:  36%|███▌      | 322/891 [01:30<02:16,  4.16it/s][A
Training:  36%|███▋      | 323/891 [01:30<02:09,  4.38it/s][A
Training:  36%|███▋      | 324/891 [01:30<02:06,  4.50it/s][A
Training:  36%|███▋      | 325/891 [01:30<02:01,  4.67it/s][A
Training:  37%|███▋      | 326/891 [01:30<01:56,  4.84

avg_val_loss: 1.4828069499044707 
 f1:0.5209375958825618



Training:  68%|██████▊   | 610/891 [02:41<11:55,  2.55s/it][A
Training:  69%|██████▊   | 611/891 [02:41<08:36,  1.84s/it][A
Training:  69%|██████▊   | 612/891 [02:41<06:15,  1.34s/it][A
Training:  69%|██████▉   | 613/891 [02:41<04:37,  1.00it/s][A
Training:  69%|██████▉   | 614/891 [02:42<03:30,  1.31it/s][A
Training:  69%|██████▉   | 615/891 [02:42<02:43,  1.68it/s][A
Training:  69%|██████▉   | 616/891 [02:42<02:13,  2.07it/s][A
Training:  69%|██████▉   | 617/891 [02:42<01:48,  2.53it/s][A
Training:  69%|██████▉   | 618/891 [02:42<01:34,  2.88it/s][A
Training:  69%|██████▉   | 619/891 [02:43<01:25,  3.18it/s][A
Training:  70%|██████▉   | 620/891 [02:43<01:14,  3.63it/s][A
Training:  70%|██████▉   | 621/891 [02:43<01:08,  3.92it/s][A
Training:  70%|██████▉   | 622/891 [02:43<01:06,  4.07it/s][A
Training:  70%|██████▉   | 623/891 [02:43<01:01,  4.34it/s][A
Training:  70%|███████   | 624/891 [02:44<01:01,  4.34it/s][A
Training:  70%|███████   | 625/891 [02:44<01:00,  4.41

In [None]:
def predict(config, id2label, model, test_dataloader,
            output_path='/content/drive/MyDrive/代码实战/头条新闻分类/TextCNN_result.csv'):
  """Predict a label for every test example and save the result as CSV.

  Args:
    config: dict with 'device' and 'test_file_path'.
    id2label: mapping from predicted class index to the original label.
    model: trained model; its forward output tuple must end with the logits.
    test_dataloader: yields dict batches in the row order of the test file.
    output_path: where the test rows plus a 'preds' column are written.

  Returns:
    The test DataFrame with the added 'preds' column.
  """
  model.to(config['device'])  # no-op when the model already lives there
  model.eval()
  test_iterator = tqdm(test_dataloader, desc='Predicting', total=len(test_dataloader))
  test_preds = []

  with torch.no_grad():
    for batch in test_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      # The logits are the last element whether or not the model also returned
      # a loss, so batches without (dummy) labels work as well.
      logits = model(**batch)[-1]
      test_preds.append(logits.argmax(dim=-1).detach().cpu())

  test_preds = torch.cat(test_preds, dim=0).numpy()
  test_preds = [id2label[idx] for idx in test_preds]

  test_df = pd.read_csv(config['test_file_path'], sep=',')
  test_df['preds'] = test_preds
  test_df.to_csv(output_path, index=False, encoding='utf8')
  return test_df

In [None]:
# Predict the test split with the trained model and write the predictions to disk.
test_df = predict(config, id2label, model, test_dataloader)

  cpuset_checked))
Predicting: 100%|██████████| 157/157 [00:16<00:00,  9.67it/s]
