## Setting up BERT-LARGE

In [None]:
from transformers.models.auto.modeling_auto import AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer
import torch

# Hugging Face checkpoint shared by the tokenizer and the model.
model_name = "bert-large-uncased"

# Load the tokenizer and the model from the same checkpoint, then place the
# model on the GPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.cuda()

In [None]:
def can_be_dict(obj):
    """
    Report whether ``dict(obj)`` succeeds.

    The conversion is attempted and its result thrown away; any ``Exception``
    raised by the attempt means the answer is ``False``.
    """
    try:
        dict(obj)
    except Exception:
        return False
    else:
        return True

def move_to(obj, device="cpu"):
    """
    Move an object, or the values of a dict-like object, to ``device``.

    Args:
        obj: either something exposing a ``.to(device)`` method, or something
            ``dict()`` can convert whose values may expose ``.to(device)``.
        device: target device, passed unchanged to ``.to``. Any value the
            underlying ``.to`` accepts works (e.g. "cpu", "cuda", "cuda:0").

    Returns:
        ``obj.to(device)`` when ``obj`` has a callable ``.to``; otherwise a new
        ``dict`` whose movable values were moved; otherwise ``obj`` unchanged.
    """
    def _movable(candidate):
        # Look for the ``.to`` method itself rather than for an attribute that
        # happens to be named like the device string, so device specs such as
        # "cuda:0" are handled too.
        return callable(getattr(candidate, "to", None))

    if _movable(obj):
        return obj.to(device)

    # Convert exactly once: probing first and converting afterwards would
    # exhaust one-shot iterables before they could be used.
    try:
        mapping = dict(obj)
    except Exception:
        # Best effort: anything that is not dict-like is handed back untouched.
        return obj

    return {
        key: value.to(device) if _movable(value) else value
        for key, value in mapping.items()
    }

@torch.no_grad()
def encode(sentences, device="cpu"):
    """
    Encode sentences into one embedding vector each.

    The sentences are tokenized as a single padded batch, run through the
    global ``model`` on ``device`` with gradients disabled, and the hidden
    state at sequence position 0 of ``last_hidden_state`` is returned for
    every sentence.

    Args:
        sentences: text input accepted by the global ``tokenizer``.
        device: device to run on, e.g. "cpu" or "cuda".

    Returns:
        A nested Python list holding one vector per input sentence.
    """
    global model
    # truncation=True keeps over-long inputs within the tokenizer's configured
    # maximum length instead of letting the model fail on them.
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    tokens = move_to(tokens, device)
    # ``model.device`` is a torch.device, which never compares equal to a plain
    # string, so a guard against ``device`` cannot skip anything. Moving a
    # module to the device it already lives on is a no-op, so move always.
    model = model.to(device)
    cls_token = model(**tokens).last_hidden_state[:, 0, :]
    return cls_token.tolist()

## Setting up DataFrame

In [None]:
from tqdm import tqdm
import pandas as pd
import os

# Directory holding the dataset CSV files.
data_dir = "../input/tweets-sentiment-analysis/"

# Read the training split into a DataFrame.
train_csv_path = os.path.join(data_dir, 'train_data.csv')
train_data = pd.read_csv(train_csv_path)

In [None]:
# Number of rows sent through the encoder at once.
batch_size = 1024
# Ceiling division: floor division would silently drop the trailing partial
# batch (and every row when there are fewer than batch_size of them).
num_batches = (len(train_data) + batch_size - 1) // batch_size

# Slice the frame into consecutive row windows; the last window may be shorter.
batched_traindata = [
    train_data[(batch_size * batch_idx):(batch_size * (batch_idx + 1))]
    for batch_idx in range(num_batches)
]
# Reduce every window to a (sentences, sentiments) pair of plain Python lists.
batched_traindata = [
    (batch['sentence'].tolist(), batch['sentiment'].tolist())
    for batch in batched_traindata
]

In [None]:
# Write one line per sentence: the space-separated embedding values, a tab,
# then the sentiment label.
with open('processed_data.txt', 'w') as file:
    for i, (sentence, sentiments) in enumerate(tqdm(batched_traindata)):
        # Uncomment when running on Kaggle, where progress bars are not visible.
        # print(f"{i}/{len(batched_traindata)} | {(i/len(batched_traindata))*100:.3f}%")
        encoded = encode(sentence, "cuda")
        for vector, label in zip(encoded, sentiments):
            features = ' '.join(str(value) for value in vector)
            file.write(features + '\t' + str(label) + '\n')