In [1]:
import sys
# Add the sibling ../Models directory to the module search path so that modules
# stored there can be imported by later cells (presumably where `google_bert`
# lives — confirm against the repository layout). The path is relative, so it
# resolves against the kernel's current working directory.
sys.path.append('../Models')

In [2]:
import os
import torch
import pandas as pd
from google_bert import HateSpeechBERT
from transformers import AutoTokenizer
from tqdm import tqdm 

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Directory that receives the extracted feature files; created if it is missing.
save_path = '../Train Data/Results/google bert'
os.makedirs(save_path, exist_ok=True)

# Processed tables for each split, read in train / dev / test order.
train_processed, dev_processed, test_processed = (
    pd.read_csv(f'../Trans data/{split}_processed.csv')
    for split in ('train', 'dev', 'test')
)

# Matching per-sample feature tables for the same three splits.
train_features, dev_features, test_features = (
    pd.read_csv(f'../Trans data/{split}_features.csv')
    for split in ('train', 'dev', 'test')
)

In [5]:
# Tokenizer selection. Exactly one `tokenizer = ...` line is active; the
# commented-out lines are alternative pretrained tokenizers kept for reference
# (presumably each pairs with a different checkpoint — confirm before switching).
#tokenizer = AutoTokenizer.from_pretrained("seanbenhur/tanglish-offensive-language-identification")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased")
#tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-MLM-Back-TLM")
#tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
#tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

# Instantiate the project model (imported from `google_bert`) on the chosen device.
model = HateSpeechBERT().to(device)
# Path of the saved checkpoint whose weights are restored below.
check_pt = '../Train Data/Checkpoints/google bert/best_model.pth'
# The checkpoint is indexed as a dict; only its 'model_state_dict' entry is used.
# `map_location=device` places the loaded tensors on the active device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider `weights_only=True` if the checkpoint contents allow it.
# This is the cell's last expression, so its return value is what gets displayed.
model.load_state_dict(torch.load(check_pt, map_location=device)['model_state_dict'])

<All keys matched successfully>

In [6]:
# Run every training sample through the model and save the returned feature
# vectors (plus id and label) as one CSV row per sample.
model.eval()  # switch the model to evaluation mode before inference

# Map each sample id to its auxiliary feature tensor. Everything after the first
# column of a row is treated as feature values (assumes `id` is the first column
# of train_features — TODO confirm).
feature_dict = {
    row["id"]: torch.tensor(row[1:].values, dtype=torch.float32, device=device)
    for _, row in train_features.iterrows()
}
data_list = []
count = 0  # number of samples skipped for being too long

# Samples whose tokenized length exceeds this are skipped rather than truncated
# (presumably the encoder's maximum input length — confirm against the model).
MAX_SEQ_LEN = 512

# Iterate over the columns directly so the loop does not depend on the frame
# having a default 0..n-1 index.
samples = zip(train_processed['id'], train_processed['text'], train_processed['label'])

# One no_grad context around the whole loop instead of re-entering it per sample.
with torch.no_grad():
    for data_id, data_text, data_label in tqdm(samples, total=len(train_processed), desc="Processing Data"):

        encoded = tokenizer(data_text, return_tensors='pt', padding=True, return_attention_mask=True, return_token_type_ids=False)

        # Check the length first so skipped samples cost no feature lookup or forward pass.
        if encoded.input_ids.shape[1] > MAX_SEQ_LEN:
            count += 1
            continue

        # Add a leading batch dimension of size 1 to the auxiliary features.
        data_features = feature_dict[data_id].unsqueeze(dim=0)

        # The model returns a pair; only the first element (the feature vector) is kept.
        features, _ = model(input_ids=encoded.input_ids.to(device), attention_mask=encoded.attention_mask.to(device), encodings=data_features)
        features = features.squeeze(dim=0).cpu().numpy()

        data_list.append([data_id] + features.tolist() + [data_label])

# Derive the feature width from the collected rows (each row is id + features + label)
# instead of a variable leaked from the loop, so an empty result still yields a
# valid, header-only CSV rather than a NameError.
num_features = len(data_list[0]) - 2 if data_list else 0
columns = ['id'] + [f'f_{i}' for i in range(num_features)] + ['label']
train_df = pd.DataFrame(data_list, columns=columns)
# NOTE(review): the output file name has no `.csv` extension; kept unchanged because
# downstream code may rely on this exact name.
train_df.to_csv(os.path.join(save_path, 'train_2048_features'), index=False)
print(f'{count} train samples had a sequence length > {MAX_SEQ_LEN}')

Processing Data: 100%|██████████| 5579/5579 [01:29<00:00, 62.34it/s]


8 train samples had a sequence length > 512


In [7]:
# Run every dev sample through the model and save the returned feature
# vectors (plus id and label) as one CSV row per sample.
model.eval()  # switch the model to evaluation mode before inference

# Map each sample id to its auxiliary feature tensor. Everything after the first
# column of a row is treated as feature values (assumes `id` is the first column
# of dev_features — TODO confirm).
feature_dict = {
    row["id"]: torch.tensor(row[1:].values, dtype=torch.float32, device=device)
    for _, row in dev_features.iterrows()
}
data_list = []
count = 0  # number of samples skipped for being too long

# Samples whose tokenized length exceeds this are skipped rather than truncated
# (presumably the encoder's maximum input length — confirm against the model).
MAX_SEQ_LEN = 512

# Iterate over the columns directly so the loop does not depend on the frame
# having a default 0..n-1 index.
samples = zip(dev_processed['id'], dev_processed['text'], dev_processed['label'])

# One no_grad context around the whole loop instead of re-entering it per sample.
with torch.no_grad():
    for data_id, data_text, data_label in tqdm(samples, total=len(dev_processed), desc="Processing Data"):

        encoded = tokenizer(data_text, return_tensors='pt', padding=True, return_attention_mask=True, return_token_type_ids=False)

        # Check the length first so skipped samples cost no feature lookup or forward pass.
        if encoded.input_ids.shape[1] > MAX_SEQ_LEN:
            count += 1
            continue

        # Add a leading batch dimension of size 1 to the auxiliary features.
        data_features = feature_dict[data_id].unsqueeze(dim=0)

        # The model returns a pair; only the first element (the feature vector) is kept.
        features, _ = model(input_ids=encoded.input_ids.to(device), attention_mask=encoded.attention_mask.to(device), encodings=data_features)
        features = features.squeeze(dim=0).cpu().numpy()

        data_list.append([data_id] + features.tolist() + [data_label])

# Derive the feature width from the collected rows (each row is id + features + label)
# instead of a variable leaked from the loop, so an empty result still yields a
# valid, header-only CSV rather than a NameError.
num_features = len(data_list[0]) - 2 if data_list else 0
columns = ['id'] + [f'f_{i}' for i in range(num_features)] + ['label']
dev_df = pd.DataFrame(data_list, columns=columns)
# NOTE(review): the output file name has no `.csv` extension; kept unchanged because
# downstream code may rely on this exact name.
dev_df.to_csv(os.path.join(save_path, 'dev_2048_features'), index=False)
print(f'{count} dev samples had a sequence length > {MAX_SEQ_LEN}')

Processing Data: 100%|██████████| 787/787 [00:14<00:00, 54.00it/s]


0 dev samples had a sequence length > 512


In [8]:
# Run every test sample through the model and save the returned feature
# vectors (plus id) as one CSV row per sample. No label is read or written
# for this split.
model.eval()  # switch the model to evaluation mode before inference

# Map each sample id to its auxiliary feature tensor. Everything after the first
# column of a row is treated as feature values (assumes `id` is the first column
# of test_features — TODO confirm).
feature_dict = {
    row["id"]: torch.tensor(row[1:].values, dtype=torch.float32, device=device)
    for _, row in test_features.iterrows()
}
data_list = []
count = 0  # number of samples skipped for being too long

# Samples whose tokenized length exceeds this are skipped rather than truncated
# (presumably the encoder's maximum input length — confirm against the model).
MAX_SEQ_LEN = 512

# Iterate over the columns directly so the loop does not depend on the frame
# having a default 0..n-1 index.
samples = zip(test_processed['id'], test_processed['text'])

# One no_grad context around the whole loop instead of re-entering it per sample.
with torch.no_grad():
    for data_id, data_text in tqdm(samples, total=len(test_processed), desc="Processing Data"):

        encoded = tokenizer(data_text, return_tensors='pt', padding=True, return_attention_mask=True, return_token_type_ids=False)

        # Check the length first so skipped samples cost no feature lookup or forward pass.
        if encoded.input_ids.shape[1] > MAX_SEQ_LEN:
            count += 1
            continue

        # Add a leading batch dimension of size 1 to the auxiliary features.
        data_features = feature_dict[data_id].unsqueeze(dim=0)

        # The model returns a pair; only the first element (the feature vector) is kept.
        features, _ = model(input_ids=encoded.input_ids.to(device), attention_mask=encoded.attention_mask.to(device), encodings=data_features)
        features = features.squeeze(dim=0).cpu().numpy()

        data_list.append([data_id] + features.tolist())

# Derive the feature width from the collected rows (each row is id + features)
# instead of a variable leaked from the loop, so an empty result still yields a
# valid, header-only CSV rather than a NameError.
num_features = len(data_list[0]) - 1 if data_list else 0
columns = ['id'] + [f'f_{i}' for i in range(num_features)]
test_df = pd.DataFrame(data_list, columns=columns)
# NOTE(review): the output file name has no `.csv` extension; kept unchanged because
# downstream code may rely on this exact name.
test_df.to_csv(os.path.join(save_path, 'test_2048_features'), index=False)
print(f'{count} test samples had a sequence length > {MAX_SEQ_LEN}')

Processing Data: 100%|██████████| 1576/1576 [00:26<00:00, 59.02it/s]


0 test samples had a sequence length > 512
