In [1]:
import sys

# Make the local 'Models' directory importable (the model class used by this
# notebook is imported from there). Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if 'Models' not in sys.path:
    sys.path.append('Models')

In [None]:
import os
import torch
import pandas as pd
from xlm_roberta_large import HateSpeechBERT
from transformers import AutoTokenizer
from tqdm import tqdm 

In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Directory that receives the feature CSVs written by the extraction cells;
# it is created up front if it does not exist yet.
save_path = '../Train Data/Results/xlm roberta large'
os.makedirs(save_path, exist_ok=True)

# For each split, load the processed samples (columns read later: 'id', 'text'
# and, for train/dev, 'label') together with the matching per-sample features
# table (looked up by 'id' later on).
train_processed, train_features = (
    pd.read_csv('../No trans data/train_processed_no_trans.csv'),
    pd.read_csv('../No trans data/train_features_no_trans.csv'),
)
dev_processed, dev_features = (
    pd.read_csv('../No trans data/dev_processed_no_trans.csv'),
    pd.read_csv('../No trans data/dev_features_no_trans.csv'),
)
test_processed, test_features = (
    pd.read_csv('../No trans data/test_processed_no_trans.csv'),
    pd.read_csv('../No trans data/test_features_no_trans.csv'),
)

In [None]:
# Checkpoint holding the fine-tuned weights under the 'model_state_dict' key.
check_pt = '../Train Data/Checkpoints/xlm roberta large 1 lr3 d4 no trans/best_model.pth'

# Tokenizer of the pretrained XLM-RoBERTa large checkpoint.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

# Build the model, move it to the selected device, then restore its weights.
model = HateSpeechBERT()
model = model.to(device)

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location places the stored tensors on `device`.
# Kept as the last expression so the key-matching report is shown as cell output.
model.load_state_dict(
    torch.load(check_pt, map_location=device)['model_state_dict']
)

<All keys matched successfully>

In [15]:
# Run every train sample through the model, collect the first model output
# (one feature vector per sample) and save it as a table of id, f_0..f_{n-1}, label.
model.eval()  # switch the model to evaluation mode before inference

# id -> tensor built from all columns after the first one of the features table
# (assumes 'id' is the first column of train_features -- TODO confirm).
feature_dict = {row["id"]: torch.tensor(row[1:].values, dtype=torch.float32, device=device) for _, row in train_features.iterrows()}
data_list = []
count = 0  # samples skipped because they tokenize to more than 512 tokens

# One no_grad context for the whole loop: no gradients are needed for inference,
# and entering the context once avoids re-entering it for every sample.
with torch.no_grad():
    for i in tqdm(range(len(train_processed)), desc="Processing Data"):

        data_id = train_processed.loc[i,'id']
        data_text = train_processed.loc[i,'text']
        data_text = tokenizer(data_text, return_tensors='pt', padding=True, return_attention_mask=True, return_token_type_ids=False)
        data_label = train_processed.loc[i,'label']
        data_features = feature_dict[data_id].unsqueeze(dim=0)  # add a batch dimension

        # Over-long sequences are counted and skipped rather than truncated.
        if data_text.input_ids.shape[1] > 512:
            count += 1
            continue

        # The model returns a pair; only the first element is kept.
        features, _ = model(input_ids=data_text.input_ids.to(device), attention_mask=data_text.attention_mask.to(device), encodings=data_features)
        features = features.squeeze(dim=0).cpu().numpy()

        data_list.append([data_id] + features.tolist() + [data_label])

# Derive the feature width from the collected rows (each row is id + features + label)
# instead of from the loop variable `features`, which does not exist when no
# sample was processed (empty input or every sample skipped).
n_features = len(data_list[0]) - 2 if data_list else 0
columns = ['id'] + [f'f_{i}' for i in range(n_features)] + ['label']
train_df = pd.DataFrame(data_list, columns=columns)
train_df.to_csv(os.path.join(save_path, 'train_512_features'), index=False)
print(f'{count} train samples had a sequence length > 512')

Processing Data: 100%|██████████| 5579/5579 [01:04<00:00, 86.55it/s] 


0 train samples had a sequence length > 512


In [16]:
# Run every dev sample through the model, collect the first model output
# (one feature vector per sample) and save it as a table of id, f_0..f_{n-1}, label.
model.eval()  # switch the model to evaluation mode before inference

# id -> tensor built from all columns after the first one of the features table
# (assumes 'id' is the first column of dev_features -- TODO confirm).
feature_dict = {row["id"]: torch.tensor(row[1:].values, dtype=torch.float32, device=device) for _, row in dev_features.iterrows()}
data_list = []
count = 0  # samples skipped because they tokenize to more than 512 tokens

# One no_grad context for the whole loop: no gradients are needed for inference,
# and entering the context once avoids re-entering it for every sample.
with torch.no_grad():
    for i in tqdm(range(len(dev_processed)), desc="Processing Data"):

        data_id = dev_processed.loc[i,'id']
        data_text = dev_processed.loc[i,'text']
        data_text = tokenizer(data_text, return_tensors='pt', padding=True, return_attention_mask=True, return_token_type_ids=False)
        data_label = dev_processed.loc[i,'label']
        data_features = feature_dict[data_id].unsqueeze(dim=0)  # add a batch dimension

        # Over-long sequences are counted and skipped rather than truncated.
        if data_text.input_ids.shape[1] > 512:
            count += 1
            continue

        # The model returns a pair; only the first element is kept.
        features, _ = model(input_ids=data_text.input_ids.to(device), attention_mask=data_text.attention_mask.to(device), encodings=data_features)
        features = features.squeeze(dim=0).cpu().numpy()

        data_list.append([data_id] + features.tolist() + [data_label])

# Derive the feature width from the collected rows (each row is id + features + label)
# instead of from the loop variable `features`, which would otherwise be missing
# or left over from an earlier cell when no dev sample was processed.
n_features = len(data_list[0]) - 2 if data_list else 0
columns = ['id'] + [f'f_{i}' for i in range(n_features)] + ['label']
dev_df = pd.DataFrame(data_list, columns=columns)
dev_df.to_csv(os.path.join(save_path, 'dev_512_features'), index=False)
print(f'{count} dev samples had a sequence length > 512')

Processing Data: 100%|██████████| 785/785 [00:09<00:00, 83.41it/s]


0 dev samples had a sequence length > 512


In [17]:
# Run every test sample through the model, collect the first model output
# (one feature vector per sample) and save it as a table of id, f_0..f_{n-1}.
# No label column is read or written for the test split.
model.eval()  # switch the model to evaluation mode before inference

# id -> tensor built from all columns after the first one of the features table
# (assumes 'id' is the first column of test_features -- TODO confirm).
feature_dict = {row["id"]: torch.tensor(row[1:].values, dtype=torch.float32, device=device) for _, row in test_features.iterrows()}
data_list = []
count = 0  # samples skipped because they tokenize to more than 512 tokens

# One no_grad context for the whole loop: no gradients are needed for inference,
# and entering the context once avoids re-entering it for every sample.
with torch.no_grad():
    for i in tqdm(range(len(test_processed)), desc="Processing Data"):

        data_id = test_processed.loc[i,'id']
        data_text = test_processed.loc[i,'text']
        data_text = tokenizer(data_text, return_tensors='pt', padding=True, return_attention_mask=True, return_token_type_ids=False)
        data_features = feature_dict[data_id].unsqueeze(dim=0)  # add a batch dimension

        # Over-long sequences are counted and skipped rather than truncated.
        if data_text.input_ids.shape[1] > 512:
            count += 1
            continue

        # The model returns a pair; only the first element is kept.
        features, _ = model(input_ids=data_text.input_ids.to(device), attention_mask=data_text.attention_mask.to(device), encodings=data_features)
        features = features.squeeze(dim=0).cpu().numpy()

        data_list.append([data_id] + features.tolist())

# Derive the feature width from the collected rows (each row is id + features)
# instead of from the loop variable `features`, which would otherwise be missing
# or left over from an earlier cell when no test sample was processed.
n_features = len(data_list[0]) - 1 if data_list else 0
columns = ['id'] + [f'f_{i}' for i in range(n_features)]
test_df = pd.DataFrame(data_list, columns=columns)
test_df.to_csv(os.path.join(save_path, 'test_512_features'), index=False)
print(f'{count} test samples had a sequence length > 512')

Processing Data: 100%|██████████| 1576/1576 [00:18<00:00, 86.11it/s]


0 test samples had a sequence length > 512
