## Importing Relevant Libraries

In [1]:
from transformers import get_scheduler, AdamW, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, DatasetDict, load_metric
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.special import softmax
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
import evaluate




## Reading data

In [2]:
# Read the preprocessed tweets into a DataFrame and preview the first three rows.
data = pd.read_csv(filepath_or_buffer='preprocessed_tweets2.csv')
data.head(n=3)

  data = pd.read_csv('preprocessed_tweets2.csv')


Unnamed: 0,date,text,is_retweet,tokenized_text,text2,hashtags,text_without_stopwords,hashtag_count,tweet_length,contains_price,labels
0,2021-07-19,do you even mine #bitcoin dude â¦ do you even...,False,"['do', 'you', 'even', 'mine', 'bitcoin', 'dude...",do you even mine bitcoin dude do you even run ...,['#bitcoin'],even mine bitcoin dude even run noteâ even un...,1,15,False,0
1,2021-07-18,#bitcoin is whales and big companys instrumen...,False,"['bitcoin', 'is', 'whales', 'and', 'big', 'com...",bitcoin is whales and big companys instrument ...,"['#bitcoin', '#btc']",bitcoin whales big companys instrument exchan...,2,26,False,1
2,2021-08-08,ð $hmc ð\nhospitality monkey coin \ncha...,False,"['hmc', 'hospitality', 'monkey', 'coin', 'char...",hmc hospitality monkey coin charity oriented t...,"['#bsc', '#btc', '#ethereum']",hmc hospitality monkey coin charity oriented ...,3,25,False,0


## Preparing Model

In [5]:
# Hub id used to fetch the tokenizer (plain string: no placeholders, so no f-prefix).
MODEL = 'siebert/sentiment-roberta-large-english'
# The model itself is restored from a local checkpoint file onto the CPU.
# NOTE(security): torch.load unpickles the file, and unpickling can execute arbitrary
# code. Only load checkpoints that you created yourself or otherwise trust.
model = torch.load('RoBERTA-1-epoch.pt', map_location='cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Checking the accuracy of the pretrained model

We are going to use the first 500 data points as a quick sample.

In [7]:
# Quick sanity check on the first N_EVAL_SAMPLES rows of the DataFrame.
N_EVAL_SAMPLES = 500
eval_texts = list(data['text_without_stopwords'][:N_EVAL_SAMPLES])
eval_labels = data['labels'][:N_EVAL_SAMPLES]

batch = tokenizer(eval_texts,
                  padding=True,
                  truncation=True,
                  max_length=512,
                  return_tensors='pt')
print(batch)

# Switch to evaluation mode explicitly: a module restored from disk keeps the
# train/eval flag it was saved with, and dropout must be off for stable predictions.
model.eval()
with torch.inference_mode():
    outputs = model(**batch)
    # softmax is monotonic, so argmax over the raw logits selects the same class.
    preds = torch.argmax(outputs.logits, dim=1)

# Hand scikit-learn a NumPy array rather than a torch tensor.
acc = accuracy_score(eval_labels, preds.cpu().numpy())
print(f'Accuracy: {acc*100}%')

{'input_ids': tensor([[    0,   190,  4318,  ...,     1,     1,     1],
        [    0, 11388, 18018,  ...,     1,     1,     1],
        [    0,  1368, 29297,  ...,     1,     1,     1],
        ...,
        [    0,    82,   202,  ...,     1,     1,     1],
        [    0, 11388,   614,  ...,     1,     1,     1],
        [    0, 11388,  2935,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
Accuracy: 55.800000000000004%


This model does not perform very well on our data (about 56% accuracy), so we will need to fine-tune it.

## Preparing dataset for training

In [4]:
# Load the CSV through the datasets library; the rows are read from its 'train' split.
raw_dataset = load_dataset("csv", data_files="preprocessed_tweets2.csv")

# First hold out 20 % of the rows, then cut that hold-out in half.
# Both splits use the same fixed seed so the partition is reproducible.
train_data = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)
test_valid = train_data['test'].train_test_split(test_size=0.5, seed=42)

# Assemble the final train / test / valid dictionary.
dataset = DatasetDict()
dataset['train'] = train_data['train']
dataset['test'] = test_valid['test']
dataset['valid'] = test_valid['train']

dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'text', 'is_retweet', 'tokenized_text', 'text2', 'hashtags', 'text_without_stopwords', 'hashtag_count', 'tweet_length', 'contains_price', 'labels'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['date', 'text', 'is_retweet', 'tokenized_text', 'text2', 'hashtags', 'text_without_stopwords', 'hashtag_count', 'tweet_length', 'contains_price', 'labels'],
        num_rows: 10000
    })
    valid: Dataset({
        features: ['date', 'text', 'is_retweet', 'tokenized_text', 'text2', 'hashtags', 'text_without_stopwords', 'hashtag_count', 'tweet_length', 'contains_price', 'labels'],
        num_rows: 10000
    })
})

In [5]:
# Index the row first, then the column: asking for the column first would
# materialise every training text in memory just to read a single example.
tokenizer(dataset['train'][0]['text_without_stopwords'])

{'input_ids': [0, 1803, 26713, 821, 16100, 48726, 923, 5505, 3070, 39398, 7664, 11940, 32605, 2405, 11388, 326, 11726, 1638, 25616, 16776, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### Checking if the training dataset is balanced

In [6]:
# Share of label 1 in the training split, expressed as a percentage.
train_labels = dataset['train']['labels']
positive_share = np.sum(train_labels) / len(dataset['train']) * 100
print(round(positive_share, 2), "%")

47.28 %


This shows that about 47% of the labels in the training set are 1s, so the classes are reasonably balanced.

### Tokenize function

In [7]:
def tokenize_function(examples):
    """Tokenize a batch of rows; intended for ``dataset.map(..., batched=True)``.

    Args:
        examples: A batch of rows exposing the ``text_without_stopwords`` column.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.

    No padding is applied here on purpose. Padding inside a batched ``map`` would
    stretch every row to the longest text of its map chunk; leaving the rows
    unpadded lets ``DataCollatorWithPadding`` pad each mini-batch only to the
    longest sequence it actually contains.
    """
    return tokenizer(examples['text_without_stopwords'], truncation=True, max_length=512)

### Tokenizing the dataset

In [8]:
# Run the tokenizer over every split, processing the rows in batches.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# Keep only the columns that are passed on to the collator and the model.
# Computing the columns to drop (instead of hard-coding them) makes this cell safe
# to re-run: once the extra columns are gone there is simply nothing left to remove.
MODEL_COLUMNS = ['labels', 'input_ids', 'attention_mask']
extra_columns = [name for name in tokenized_dataset['train'].column_names
                 if name not in MODEL_COLUMNS]
if extra_columns:
    tokenized_dataset = tokenized_dataset.remove_columns(extra_columns)
tokenized_dataset.set_format('torch')
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [10]:
# Display the shape of every split after the column clean-up.
tokenized_dataset.shape

{'train': (80000, 3), 'test': (10000, 3), 'valid': (10000, 3)}

### Initiating Data Collator

In [11]:
# Collator that pads the examples of each mini-batch using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training using PyTorch

Wrap the splits of `tokenized_dataset` in DataLoaders so that we can iterate over them in batches.

In [12]:
# Settings shared by both loaders: batches of 8, padded by the data collator.
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

# Only the training loader shuffles its rows.
train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_dataset['valid'], **loader_kwargs)

# Smoke test: push a single training batch through the model and inspect
# the labels, the input shapes, the loss and the shape of the logits.
for batch in train_dataloader:
    labels = batch['labels']
    inputs = {name: tensor.shape for name, tensor in batch.items() if name != 'labels'}

    print(labels)
    print(inputs)
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)
    break

tensor([0, 1, 0, 1, 1, 0, 0, 0])
{'input_ids': torch.Size([8, 84]), 'attention_mask': torch.Size([8, 84])}
tensor(3.6485, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [59]:
# AdamW over all model parameters with a learning rate of 5e-5.
LEARNING_RATE = 5e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [62]:
# One scheduler step per optimizer step: total steps = batches per epoch x epochs.
EPOCHS = 2
training_steps = len(train_dataloader) * EPOCHS

# Linear schedule over the whole run, with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps,
)

In [63]:
# progress_bar = tqdm(range(training_steps))

# model.train()
# for epoch in range(EPOCHS):
#     for batch in train_dataloader:
#         inputs = {k: v.shape for k, v in batch.items() if k!='labels'}
#         labels = batch['labels']
        
#         outputs = model(**batch)

#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

  0%|          | 0/20000 [00:00<?, ?it/s]

In [None]:
# metric = evaluate.load("glue", "mrpc")

# model.eval()
# for batch in eval_dataloader:
#     inputs = {k: v.shape for k, v in batch.items() if k!='labels'}
#     labels = batch['labels']
    
#     with torch.no_grad():
#         outputs = model(**batch)

#     logits = outputs.logits
#     preds = torch.argmax(logits, dim=1)
#     metric.add_batch(predictions=preds, references=labels)

# metric.compute()

## Saving the Model

In [None]:
# save_dir = "RoBERTa"
# tokenizer.save_pretrained(save_directory=save_dir)
# model.save_pretrained(save_directory=save_dir)