In [1]:
import pandas as pd
import torch
from os import listdir
from data import encoding_text, labelling_text
from torch.utils.data import Dataset, DataLoader
from numpy import ravel
import random
import numpy as np
from tqdm import tqdm

In [2]:
# Ask PyTorch's CUDA caching allocator to release cached-but-unused GPU memory
# before the classifier is loaded in a later cell.
torch.cuda.empty_cache()

In [3]:
SEED = 42


def _seed_all(seed):
    """Apply one seed to the global RNGs of `random`, NumPy and PyTorch (incl. CUDA)."""
    # Same seeders, same order as calling each one by hand.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)


_seed_all(SEED)

In [4]:
# File that the next cell hands to torch.load to restore the classifier.
# NOTE(review): absolute, machine-specific Windows path — it has to be edited
# before this notebook can run on any other machine.
MODEL_PATH = r'C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\model\model.pth'

In [5]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
# map_location remaps the stored tensors onto `device` while deserializing.
# Without it, a checkpoint that was saved from a GPU fails to load on a
# CPU-only machine, which would make the CPU fallback above unusable.
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code — only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole-module pickles such as this one; if loading fails there, pass
# weights_only=False (trusted files only).
model = torch.load(MODEL_PATH, map_location=device)
model.to(device)
# Switch to evaluation mode for inference. As the last expression of the cell,
# this also makes the notebook display the module's repr.
model.eval()

Classifier(
  (pretrained_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [6]:
def labelling_text(input_ids, attention_mask, model):
    """Predict a class index for each encoded text.

    Note: this definition rebinds the name ``labelling_text`` that the first
    cell imports from ``data``; once this cell has run, calls resolve to this
    version.

    Args:
        input_ids: Tensor forwarded to ``model`` as ``input_ids``.
        attention_mask: Tensor forwarded to ``model`` as ``attention_mask``.
        model: Callable returning per-class scores; assumed to be shaped
            (batch, num_classes) — TODO confirm against the Classifier.

    Returns:
        Tensor with the index of the largest score along dimension 1.
    """
    # Pure inference: disable autograd so no computation graph is recorded
    # (and kept alive) for each of the many per-tweet calls.
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(output, dim=-1)
        return torch.argmax(probabilities, dim=1)

In [7]:
# Root data directory; the labelling loop below lists its sub-folders and then
# the files inside each of them.
# NOTE(review): absolute, machine-specific Windows path — it has to be edited
# before this notebook can run on any other machine.
DATA_PATH = r'C:\Users\heylu\Documents\github\A Study Case About Mental Health Tweets in Pandemic Times\data'

In [8]:
# Label every row of every file found under DATA_PATH\<year>\ and write the
# predictions back into the same file as a `labels` column.
# NOTE(review): the input files are overwritten in place; keep a backup of the
# raw data if it cannot be regenerated.
for year in listdir(DATA_PATH):
    year_dir = DATA_PATH + '\\' + year
    for f in listdir(year_dir):
        print(f)
        # Build the path once so the read and the write cannot drift apart.
        csv_path = year_dir + '\\' + f
        # Non-inplace rename: the frame is built in a single expression.
        df = pd.read_csv(csv_path).rename(columns={'tweet': 'texts'})
        res = []
        # Iterate the column's values directly instead of label-indexing
        # df['texts'][i]; tqdm still gets the total from len() of the Series.
        for text in tqdm(df['texts']):
            data = encoding_text(text)
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            # One text per call, so the first element is this text's label.
            value = labelling_text(input_ids, attention_mask, model).tolist()[0]
            res.append(value)
        df['labels'] = res
        # index=False: the default would also write the row index as an
        # unnamed first column, and because the file is re-read from the same
        # path, every re-run would add one more `Unnamed: 0*` column.
        df.to_csv(csv_path, index=False)

00-11-2020.csv
100%|██████████| 347331/347331 [2:07:44<00:00, 45.32it/s]
00-12-2020.csv
100%|██████████| 382893/382893 [2:20:40<00:00, 45.36it/s]
00-01-2021.csv
100%|██████████| 429541/429541 [2:40:33<00:00, 44.59it/s]
00-02-2021.csv
100%|██████████| 406924/406924 [2:34:24<00:00, 43.92it/s]
00-03-2021.csv
100%|██████████| 507763/507763 [3:11:53<00:00, 44.10it/s]
00-04-2021.csv
100%|██████████| 444897/444897 [2:49:27<00:00, 43.76it/s]


In [9]:
# Display the entries directly under DATA_PATH (the folders the labelling loop
# iterates over as `year`).
listdir(DATA_PATH)

['2020', '2021']