In [16]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.18.0-py3-none-any.whl (4.0 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-win_amd64.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 5.2 MB/s eta 0:00:00
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
     -------------------------------------- 895.2/895.2 KB 9.5 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
     ---------------------------------------- 77.9/77.9 KB ? eta 0:00:00
Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-4.2.0-py3-none-any.whl (24 kB)
Collecting packaging>=20.0
  Downloading packaging-21.3-py3-none-any.whl (40 kB)
     -------------------------------------- 40.8/40.8 KB 649.1 kB/s eta 0:00:00
Installing collected packages: tokenizers, typing-extensions, sacremoses, packaging, huggingface-hub, transformers
  Attempting unin

In [28]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, BertModel

In [40]:
train_stat = pd.read_csv('./data/train_stat_feat_df.csv')
dev_stat = pd.read_csv('./data/dev_stat_feat_df.csv')
dev_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweet_id                  522 non-null    int64  
 1   reply_reply_count         522 non-null    float64
 2   reply_like_count          522 non-null    float64
 3   reply_retweet_count       522 non-null    float64
 4   reply_quote_count         522 non-null    float64
 5   reply_possibly_sensitive  522 non-null    float64
 6   reply_has_url             522 non-null    float64
 7   reply_mentioned_url_num   522 non-null    float64
 8   reply_id_num              522 non-null    float64
 9   reply_isweekday           522 non-null    float64
 10  reply_senti_score         522 non-null    float64
 11  reply_count               522 non-null    float64
 12  like_count                522 non-null    float64
 13  retweet_count             522 non-null    float64
 14  quote_coun

In [42]:
train_tweet = pd.read_csv('./data/train_tweet_df.csv')
dev_tweet = pd.read_csv('./data/dev_tweet_df.csv')
dev_tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id    522 non-null    int64 
 1   text        522 non-null    object
 2   created_at  522 non-null    object
 3   user_id     522 non-null    int64 
 4   tweet_id.1  522 non-null    int64 
 5   label       522 non-null    int64 
 6   reply       522 non-null    object
 7   reply_text  518 non-null    object
dtypes: int64(4), object(4)
memory usage: 32.8+ KB


In [37]:
class TweetDataset(Dataset):
    def __init__(self, data_type, max_seq_len):
        self.max_seq_len = max_seq_len
        # read pre-processed data
        self.tweet_df = pd.read_csv(f'./data/{data_type}_tweet_df.csv', usecols=['text', 'reply_text', 'label'])
        self.statistic_df = pd.read_csv(f'./data/{data_type}_stat_feat_df.csv')
        self.tweet_df['text'] = self.tweet_df['text'].replace(np.nan, '')
        self.tweet_df['reply_text'] = self.tweet_df['reply_text'].replace(np.nan, '')
        # define tokenizer
        self.tokenizer = DistilBertTokenizerFast.from_pretrained("bert-base-uncased")
    def __len__(self):
        return self.tweet_df.shape[0]
    def __getitem__(self, idx):
        source_token_mask = self.tokenizer(self.tweet_df.iloc[idx]['text'], truncation=True, padding='max_length', max_length=self.max_seq_len)
        source_token, source_mask = torch.tensor(source_token_mask['input_ids']), torch.tensor(source_token_mask['attention_mask'])
        pair_token_mask = self.tokenizer(self.tweet_df.iloc[idx]['text'], self.tweet_df.iloc[idx]['reply_text'], truncation='only_second', padding='max_length', max_length=self.max_seq_len)
        pair_tokens_tensor, pair_mask_tensor = torch.tensor(pair_token_mask['input_ids']), torch.tensor(pair_token_mask['attention_mask'])
        return source_token, source_mask, pair_tokens_tensor, pair_mask_tensor, self.tweet_df.iloc[idx]['label'], torch.tensor(self.statistic_df.iloc[idx])

In [38]:
train_loader = DataLoader(TweetDataset('train', 200), shuffle=True, batch_size=20, drop_last=True)
dev_loader = DataLoader(TweetDataset('dev', 200), shuffle=True, batch_size=20, drop_last=True)

Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 4.64kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 253kB/s]  
Downloading: 100%|██████████| 455k/455k [00:01<00:00, 402kB/s]  
Downloading: 100%|██████████| 570/570 [00:00<00:00, 70.7kB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.
