### Sentiment Engine training

#### Install dependencies, import libraries, and mount Google Drive

In [None]:
!pip install transformers
from transformers import RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
cd drive/MyDrive/Projects/PLP_PracticeModule/

/content/drive/MyDrive/Projects/PLP_PracticeModule


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
%matplotlib inline

#### Read the dataset and replace text labels with binary labels

In [4]:
# Load only the three columns this notebook uses from the training CSV.
df = pd.read_csv('final_dataset_for_training.csv',usecols=['body','entities','Stock'])
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,body,entities,Stock
0,$TSLA stock starting to rocket 🚀,Bullish,TSLA
1,$TSLA Money got banana it grow big \nBut took ...,Bullish,TSLA
2,"$AAPL bear logic last qtr, but the iPhone numb...",Bullish,AAPL
3,$AAPL 129 at the morning bell. Sleep well my B...,Bullish,AAPL
4,$AMZN $TSLA $AAPL $GOOGL now well get some che...,Bullish,TSLA
...,...,...,...
3199995,$TSLA short the rip until we hit $900,Bearish,TSLA
3199996,$AAPL,Bearish,AAPL
3199997,$AMZN think its going to fill 3670 first...? T...,Bearish,AMZN
3199998,@LuckyLloydChristmas @Bullytrading Im such a f...,Bearish,TSLA


In [5]:
# Encode the sentiment label in place: 'Bullish' -> 1, 'Bearish' -> 0.
df['entities'] = df.entities.replace('Bullish',1).replace('Bearish',0)

#### preprocessing

In [6]:
!pip install emoji
import emoji
import re

def process_text(texts):
  """Normalise a single message string before it is tokenised.

  Steps, in order: strip URLs, decode the ``&#39;`` apostrophe entity,
  rewrite ``#tag`` / ``$TICKER`` / ``@user`` as ``hashtag_tag`` /
  ``cashtag_TICKER`` / ``mention_user``, and replace emoji with their
  names followed by a space.

  Args:
    texts: one message as a ``str`` (despite the plural name).

  Returns:
    The cleaned message with leading/trailing whitespace removed.
  """
  # Case is preserved on purpose; lowercasing was tried and left disabled.
  # NOTE(review): the earlier comment said the RoBERTa tokenizer is uncased;
  # roberta-base's vocabulary is case-sensitive, so confirm before enabling.

  # Strip URLs. The dot after "www" is escaped: the unescaped pattern also
  # matched "www" followed by any character, so "awww yeah" lost "www yeah".
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www\.\S+', "", texts)
  # Decode the HTML apostrophe entity. This must run before the hashtag
  # rule, otherwise "#39;" would be rewritten as a hashtag.
  texts = texts.replace('&#39;', "'")
  # Keep hashtags and cashtags as prefixed word tokens instead of dropping them.
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # Same for user mentions.
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # Replace each emoji with its name; the trailing-space delimiter keeps
  # adjacent emoji from fusing into one word.
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts.strip()
    

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[?25l[K     |█▉                              | 10 kB 32.2 MB/s eta 0:00:01[K     |███▊                            | 20 kB 31.8 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 19.6 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 12.3 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 11.0 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 12.9 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 13.1 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 12.5 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 13.9 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 13.0 MB/s eta 0:00:01[K     |████████████████████▌           | 112 kB 13.0 MB/s eta 0:00:01[K     |██████████████████████▍         | 122 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████▎       | 133 kB 13.0 MB/s eta 0:00:01[K    

In [7]:
# Overwrite the raw message text with its preprocessed form, one row at a time.
df['body'] = df.body.apply(process_text)
df.head()

Unnamed: 0,body,entities,Stock
0,cashtag_TSLA stock starting to rocket rocket,1,TSLA
1,cashtag_TSLA Money got banana it grow big \nBu...,1,TSLA
2,"cashtag_AAPL bear logic last qtr, but the iPho...",1,AAPL
3,cashtag_AAPL 129 at the morning bell. Sleep we...,1,AAPL
4,cashtag_AMZN cashtag_TSLA cashtag_AAPL cashtag...,1,TSLA


In [8]:
labels = df.entities.values
text = df.body.values

#### Check the token-count distribution across all text entries

In [9]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
def helper(text):
  """Return the number of NLTK word tokens in ``text``."""
  return len(word_tokenize(text))

# NOTE(review): these are NLTK word counts, used as a proxy for the model
# tokenizer's sub-word counts; the two are not the same quantity.
length = df.body.apply(helper)

print('there are {} entries in the dataframe'.format(len(length[length>=0])))
for limit in (32, 64, 128):
  # Share of rows at or below the limit, taken from the real row count.
  # The previous version divided by a hard-coded 32000, which is only
  # correct when the frame has exactly 3,200,000 rows.
  pct = (length <= limit).mean() * 100
  print('{}% of them have a length of at most {} tokens'.format(pct, limit))
print('So we just use 64 as RoBERTa tokenizer max padding length')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
there are 3200000 entries in the dataframe
85.026% of them have a length smaller than 32 tokens
95.9338125% of them have a length smaller than 64 tokens
99.14103125% of them have a length smaller than 128 tokens
So we just use 64 as RoBERTa tokenizer max padding length


#### load tokenizer and encode the text data

In [None]:
# Load the tokenizer stored in the previous checkpoint directory.
tokenizer = RobertaTokenizer.from_pretrained('sentimentEngine3/')
# Display the tokenizer's length.
len(tokenizer)

In [None]:
# Encode every message to a fixed 64-token row. Per-message tensors are
# collected in lists and concatenated once at the end.
encode_options = dict(
    add_special_tokens=True,
    truncation=True,
    max_length=64,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids, attention_mask = [], []
for i in text:
    encoded_data = tokenizer.encode_plus(i, **encode_options)
    input_ids.append(encoded_data['input_ids'])
    attention_mask.append(encoded_data['attention_mask'])

# Stack the per-message rows along dim 0.
input_ids = torch.cat(input_ids, dim=0)
attention_mask = torch.cat(attention_mask, dim=0)
# Rebinds `labels` from the array built earlier to a tensor.
labels = torch.tensor(labels)

#### Fix the random seed and split the dataset into train/validation/test dataloaders

In [None]:
import random

# Seed every random number generator used in this notebook with one value.
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
from torch.utils.data import DataLoader,SequentialSampler,RandomSampler,TensorDataset,random_split

dataset = TensorDataset(input_ids,attention_mask,labels)
# 98% of the rows train the model; the rest is halved into validation/test.
train_size = int(0.98*len(dataset))
holdout_size = len(dataset) - train_size
val_size = holdout_size // 2
# Test takes whatever is left, so the sizes always sum to len(dataset).
# The previous `2*val_size` fell one short whenever the hold-out was odd,
# and random_split rejects lengths that do not add up.
test_size = holdout_size - val_size

# Two-stage split: carve off the hold-out first, then halve it.
train_dataset,holdout_dataset = random_split(dataset,[train_size,holdout_size])
val_dataset, test_dataset = random_split(holdout_dataset,[val_size,test_size])
print('Training Size - ',train_size)
print('Validation Size - ',val_size)
print('Test Size - ',test_size)

Training Size -  3136000
Validation Size -  32000
Test Size -  32000


In [None]:
# Only the training loader samples randomly; validation and test iterate
# in dataset order.
# NOTE(review): the stored output (98000, 1000, 1000) matches a batch size
# of 32 for the split sizes printed above; with 128 the counts would be
# (24500, 250, 250). Re-run before relying on it.
loader_kwargs = {'batch_size': 128}
train_dl = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), **loader_kwargs)
val_dl = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), **loader_kwargs)
test_dl = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), **loader_kwargs)
# Number of batches in each loader.
len(train_dl), len(val_dl), len(test_dl)

(98000, 1000, 1000)

#### Load the model, move it to CUDA, and set up the optimizer and scheduler

In [None]:
model = RobertaForSequenceClassification.from_pretrained('sentimentEngine3/')

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [None]:
# AdamW over all model parameters (lr 3e-5, eps 1e-8). The extra flag
# silences the deprecation warning of the imported AdamW implementation.
# NOTE(review): torch.optim.AdamW is the usual replacement, but its default
# weight decay differs; confirm hyper-parameter parity before switching.
optimizer = AdamW(model.parameters(),lr = 3e-5,eps=1e-8,no_deprecation_warning=True)

In [None]:
from transformers import get_linear_schedule_with_warmup
# One pass over the training loader per run of this notebook.
epochs = 1
# One scheduler step per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dl)*epochs
# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

#### define evaluation functions

In [None]:
def accuracy(preds,labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [None]:
from sklearn.metrics import classification_report
def report(preds,labels):
  """Print a 4-digit classification report for Bearish (0) / Bullish (1).

  Args:
    preds: 2-D array of per-class scores, one row per example.
    labels: array of integer class ids.
  """
  predicted_classes = np.argmax(preds, axis=1).flatten()
  expected_classes = labels.flatten()
  summary = classification_report(
      expected_classes,
      predicted_classes,
      target_names=['Bearish','Bullish'],
      digits=4,
  )
  print(summary)

In [None]:
def evaluate(dataloader_test):
    """Run the global ``model`` over a dataloader without gradient tracking.

    Each batch is expected to hold input ids, attention mask and labels at
    positions 0, 1 and 2.

    Returns:
        A tuple ``(mean_loss, logits, labels)``: the loss averaged over the
        number of batches, and the per-batch logits and labels concatenated
        along axis 0 as NumPy arrays.
    """
    model.eval()
    running_loss = 0
    logit_chunks, label_chunks = [], []
    for batch in dataloader_test:
        moved = tuple(tensor.to(device) for tensor in batch)
        with torch.no_grad():
            outputs = model(input_ids=moved[0], attention_mask=moved[1], labels=moved[2])
        # With labels supplied, position 0 is the loss and position 1 the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(moved[2].cpu().numpy())
    mean_loss = running_loss / len(dataloader_test)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)
    return mean_loss, all_logits, all_labels

#### actual training loop

In [None]:
from tqdm.notebook import tqdm
torch.cuda.empty_cache()
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    # Sum of per-batch losses, averaged over the number of batches below.
    loss_train_total = 0

    progress_bar = tqdm(train_dl, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. The previous version divided by
        # len(batch), which is the number of tensors in the tuple (3), not
        # the batch size, so the displayed value was a third of the loss
        # used everywhere else.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})


    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(train_dl)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # End-of-epoch validation: loss, accuracy and the per-class report.
    val_loss, predictions, true_vals = evaluate(val_dl)
    val_acc = accuracy(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'Validation Accuracy: {val_acc}')
    report(predictions,true_vals)

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/98000 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.2106375729977771
Validation loss: 0.16033244659332557
Validation Accuracy: 0.93434375
              precision    recall  f1-score   support

     Bearish     0.9144    0.9579    0.9357     15956
     Bullish     0.9561    0.9109    0.9329     16044

    accuracy                         0.9343     32000
   macro avg     0.9353    0.9344    0.9343     32000
weighted avg     0.9353    0.9343    0.9343     32000



In [None]:
# Write the fine-tuned weights and the tokenizer files into a new checkpoint directory.
output_dir = 'sentimentEngine4/'
# If the model is wrapped in something exposing `.module`, save the inner model
# (presumably a DataParallel-style wrapper; confirm).
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('sentimentEngine4/tokenizer_config.json',
 'sentimentEngine4/special_tokens_map.json',
 'sentimentEngine4/vocab.json',
 'sentimentEngine4/merges.txt',
 'sentimentEngine4/added_tokens.json')

#### result for all epochs

In [None]:
Epoch 1
Training loss: 0.34953850715657075
Validation loss: 0.2956085407484323
Accuracy: 0.86789375
            precision    recall  f1-score   support

     Bearish     0.8465    0.8876    0.8666     15967
     Bullish     0.8859    0.8452    0.8651     16034

    accuracy                         0.8679     32000
   macro avg     0.8688    0.8679    0.8679     32000
weighted avg     0.8688    0.8679    0.8679     32000

In [None]:
Epoch 2
Training loss: 0.27165976355687875
Validation loss: 0.2235213685684697
Accuracy: 0.90213652
            precision    recall  f1-score   support

     Bearish     0.8840    0.9248    0.9039     16031
     Bullish     0.9221    0.8793    0.9002     15069

    accuracy                         0.9021     32000
   macro avg     0.9030    0.9021    0.9021     32000
weighted avg     0.9030    0.9021    0.9021     32000

In [None]:
Epoch 3
Training loss: 0.23595350905638948
Validation loss: 0.1874651527861133
Validation Accuracy: 0.921
            precision    recall  f1-score   support

     Bearish     0.9029    0.9437    0.9228     16021
     Bullish     0.9409    0.8982    0.9191     15979

    accuracy                         0.9210     32000
   macro avg     0.9219    0.9210    0.9210     32000
weighted avg     0.9219    0.9210    0.9210     32000

In [None]:
Epoch 4
Training loss: 0.2106375729977771
Validation loss: 0.16033244659332557
Validation Accuracy: 0.93434375
              precision    recall  f1-score   support

     Bearish     0.9144    0.9579    0.9357     15956
     Bullish     0.9561    0.9109    0.9329     16044

    accuracy                         0.9343     32000
   macro avg     0.9353    0.9344    0.9343     32000
weighted avg     0.9353    0.9343    0.9343     32000

### Sentiment engine inferencing

#### Inferencing functions and demo

In [None]:
!pip install transformers
!pip install emoji

In [None]:
import transformers
import torch
import math
import pandas as pd
import numpy as np
from transformers import RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW
import random
import time

# Seed every random number generator used by the inference section.
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
tokenizer_loaded = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
model_loaded = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/735 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

`checkSenti()` classifies a single sentence.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def Sentiment(sent,model=model_loaded,tokenizer=tokenizer_loaded):
  """Classify one already-preprocessed sentence.

  Args:
    sent: the sentence text.
    model: sequence-classification model; defaults to the loaded checkpoint.
    tokenizer: tokenizer matching ``model``.

  Returns:
    ``(index, logits)``: ``index`` is the arg-max over the logits as a
    0-dim tensor, and ``logits`` is the model's first output for the
    single-row batch.
  """
  encoded_dict = tokenizer.encode_plus(
                      sent,
                      add_special_tokens = True,
                      truncation=True,
                      max_length = 64,
                      padding='max_length',
                      return_attention_mask = True,
                      return_tensors = 'pt')

  # return_tensors='pt' already yields tensors, so they only need moving
  # to the device; no legacy torch.LongTensor re-wrap is required.
  input_id = encoded_dict['input_ids'].to(device)
  attention_mask = encoded_dict['attention_mask'].to(device)
  model = model.to(device)
  # Inference must not run with dropout active if the caller passes a
  # model that was left in training mode.
  model.eval()

  with torch.no_grad():
      outputs = model(input_id, attention_mask=attention_mask)

  # Without labels, the first output holds the logits.
  logits = outputs[0]
  index = logits.argmax()
  return index,logits

import emoji
import re

def process_text(texts):
  """Normalise a single message string before it is tokenised.

  This repeats the preprocessing used when the training data was prepared;
  keep the two definitions in sync.

  Steps, in order: strip URLs, decode the ``&#39;`` apostrophe entity,
  rewrite ``#tag`` / ``$TICKER`` / ``@user`` as ``hashtag_tag`` /
  ``cashtag_TICKER`` / ``mention_user``, and replace emoji with their
  names followed by a space.

  Args:
    texts: one message as a ``str`` (despite the plural name).

  Returns:
    The cleaned message with leading/trailing whitespace removed.
  """
  # Case is preserved on purpose; lowercasing was tried and left disabled.
  # NOTE(review): the earlier comment said the RoBERTa tokenizer is uncased;
  # roberta-base's vocabulary is case-sensitive, so confirm before enabling.

  # Strip URLs. The dot after "www" is escaped: the unescaped pattern also
  # matched "www" followed by any character, so "awww yeah" lost "www yeah".
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www\.\S+', "", texts)
  # Decode the HTML apostrophe entity. This must run before the hashtag
  # rule, otherwise "#39;" would be rewritten as a hashtag.
  texts = texts.replace('&#39;', "'")
  # Keep hashtags and cashtags as prefixed word tokens instead of dropping them.
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # Same for user mentions.
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # Replace each emoji with its name; the trailing-space delimiter keeps
  # adjacent emoji from fusing into one word.
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts.strip()

def checkSenti(sent,return_logits=False):
  """Preprocess and classify one raw sentence.

  Args:
    sent: raw message text.
    return_logits: when true, also return the two class probabilities.
      Despite the name, these are softmax probabilities, not raw logits.

  Returns:
    ``'Bearish'`` or ``'Bullish'``; with ``return_logits`` a tuple
    ``(label, [p_bearish, p_bullish])``.
  """
  labels = ['Bearish','Bullish']
  sent_processed = process_text(sent)
  index,logits = Sentiment(sent_processed)
  if return_logits:
    # Two-class softmax. Subtracting the larger logit first keeps math.exp
    # from overflowing on extreme values and leaves the ratios unchanged.
    raw0 = float(logits[0][0])
    raw1 = float(logits[0][1])
    shift = max(raw0, raw1)
    logit0 = math.exp(raw0 - shift)
    logit1 = math.exp(raw1 - shift)
    logits = [logit0/(logit0+logit1),logit1/(logit0+logit1)]
    return labels[index],logits
  return labels[index]


`batch_checkSenti()` uses the GPU (when available) to perform batch inference.

In [None]:
from torch.utils.data import DataLoader,SequentialSampler,TensorDataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batch_checkSenti(texts,model=model_loaded,tokenizer=tokenizer_loaded,return_logits=False):
  """Preprocess and classify many raw messages in batches of 128.

  Args:
    texts: iterable of raw message strings.
    model: sequence-classification model; defaults to the loaded checkpoint.
    tokenizer: tokenizer matching ``model``.
    return_logits: when true, also return the raw logits.

  Returns:
    A NumPy array of predicted class indices (0 = Bearish, 1 = Bullish, as
    in ``checkSenti``); with ``return_logits`` a tuple
    ``(indices, logits)``. Tokenisation and prediction timings are printed.
  """
  start = time.time()
  input_ids = []
  attention_masks = []
  for text in texts:
      i = process_text(text)
      encoded_data = tokenizer.encode_plus(
      i,
      add_special_tokens=True,
      truncation=True,
      max_length=64,
      padding='max_length',
      return_attention_mask= True,
      return_tensors='pt')
      input_ids.append(encoded_data['input_ids'])
      attention_masks.append(encoded_data['attention_mask'])

  model.to(device)
  # Inference must not run with dropout active if the caller passes a
  # model that was left in training mode.
  model.eval()

  if not input_ids:
      # torch.cat and np.concatenate both reject empty lists, so return
      # empty results of the matching shape instead of crashing.
      index = np.empty((0,), dtype=np.int64)
      predictions = np.empty((0, model.config.num_labels), dtype=np.float32)
      return (index,predictions) if return_logits else index

  input_ids = torch.cat(input_ids,dim=0)
  attention_masks = torch.cat(attention_masks,dim=0)
  print('It takes {}s to tokenize'.format(time.time()-start))
  checkpointtime = time.time()

  # Sequential order so output row k corresponds to input message k.
  testset = TensorDataset(input_ids,attention_masks)
  test_dl = DataLoader(testset,sampler = SequentialSampler(testset),batch_size = 128)

  predictions = []
  for batch in test_dl:
      batch = tuple(b.to(device) for b in batch)
      inputs = {
          'input_ids':batch[0],
          'attention_mask': batch[1],
      }

      with torch.no_grad():
          outputs = model(**inputs)
      # Without labels, the first output holds the logits.
      logits = outputs[0]
      logits = logits.detach().cpu().numpy()
      predictions.append(logits)

  predictions = np.concatenate(predictions,axis=0)
  index = predictions.argmax(axis=1)
  print('It takes {}s to do predictions'.format(time.time()-checkpointtime))

  return (index,predictions) if return_logits else index

Use checkSenti()

In [None]:
# Smoke test: classify a few hand-written sentences one at a time.
samples = ['just buy','just sell it','entity rocket to the sky!','go down','even though it is going up, I still think it will go down in the near future']
for sample in samples:
  print(checkSenti(sample))

Bullish
Bearish
Bullish
Bearish
Bearish


Use batch_checkSenti()

In [None]:
# Batch-classify the first `samples_num` rows and keep their labels as 0/1.
# NOTE(review): no cell in this inference section loads `df` before this
# point, so this relies on a frame already present in the kernel; confirm
# which frame is intended and whether its `body` column was already
# preprocessed (batch_checkSenti preprocesses again).
samples_num = 20000
testing = batch_checkSenti(df.body.iloc[:samples_num])
true_labels = df.entities.iloc[:samples_num].replace('Bullish',1).replace('Bearish',0)
# len(['same' for i in range(samples_num) if testing[i]==true_labels[i]])/samples_num
testing

It takes 13.196947813034058s to tokenize
It takes 144.93988299369812s to do predictions


array([1, 1, 1, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import classification_report

print(classification_report(true_labels,testing))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86     10000
           1       0.85      0.89      0.87     10000

    accuracy                           0.87     20000
   macro avg       0.87      0.87      0.87     20000
weighted avg       0.87      0.87      0.87     20000



#### test 20000 unseen samples and check confusion matrix

In [None]:
df = pd.read_csv('testset20000.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,body,entities,Stock
0,0,$FB 💚☘️💚☘️💚☘️215 at least today. Don’t be greed.,Bullish,FB
1,1,$AMZN Patience is the key.. $2800 is a hard s...,Bullish,AMZN
2,2,$AAPL $162 open,Bullish,AAPL
3,3,$HYMC 4600 watchers--yesterday at 5 we had 360...,Bullish,AMZN
4,4,$NVDA what has NVDA to do with Russis/ Ukraine?,Bullish,NVDA


In [None]:
# An earlier variant ran this same loop over rows 1595000-1605000 of the
# full frame, printing progress every 100 rows.

# Classify every message in `df` one at a time and keep the labels
# alongside. A progress marker is printed every 1000 rows.
true_labels = np.array(df.entities)
sentiments = []
for i, text in enumerate(df.body):
  predicted_label = checkSenti(text)
  sentiments.append(predicted_label)
  if not i % 1000:
    print(i)

In [None]:
len([item for item in true_labels if item == 'Bullish'])

0

In [None]:
# Tally the binary confusion matrix with 'Bullish' as the positive class.
true_pos,true_neg,false_pos,false_neg = 0,0,0,0
# Walk label/prediction pairs directly. The previous range(20000) raised
# IndexError on shorter inputs and silently ignored rows beyond 20000.
for true_label, predicted_label in zip(true_labels, sentiments):
  if true_label == 'Bullish':
    if predicted_label == 'Bullish':
      true_pos += 1
    else:
      false_neg += 1
  else:
    if predicted_label == 'Bearish':
      true_neg += 1
    else:
      false_pos += 1

In [None]:
# Confusion matrix: rows are predictions, columns are true labels.
header_lines = ('    True labels', '    pos   neg')
for header_line in header_lines:
  print(header_line)
print(f'pos  {true_pos}    {false_pos}')
print(f'neg  {false_neg}    {true_neg}')

In [None]:
# Summary metrics from the confusion counts. Each ratio falls back to 0.0
# when its denominator is zero (e.g. nothing was predicted Bullish) instead
# of raising ZeroDivisionError.
# NOTE(review): `accuracy` here rebinds the name of the accuracy() helper
# defined in the training section if both run in the same kernel.
total = true_pos+true_neg+false_pos+false_neg
accuracy = (true_pos+true_neg)/total if total else 0.0
precision = true_pos/(true_pos+false_pos) if (true_pos+false_pos) else 0.0
recall = true_pos/(true_pos+false_neg) if (true_pos+false_neg) else 0.0
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print('accuracy = ',accuracy)
print('precision = ',precision)
print('recall = ',recall)
print('f1 score = ',f1)

#### Train the model on a Twitter sentiment dataset and test it on the StockTwits dataset
#### Results were poor, so we gave up training the sentiment engine on the Twitter sentiment dataset from Kaggle

**RoBERTa(fine tuned on stocktwits) on unseen testset**

**accuracy =  0.84975**

**precision =  0.8612000413095116**

**recall =  0.8339**

**f1 score =  0.8473301834070011**

RoBERTa(fine tuned on stocktwits)

accuracy =  0.8866

precision =  0.9020178905762429

recall =  0.867373474694939

f1 score =  0.8843565164185192

RoBERTa(fine tuned on 1.6m tweets)

accuracy =  0.6249

precision =  0.5970775687859474

recall =  0.7682

f1 score =  0.6719146330796816

BERT(fine tuned on 1.6m tweets)

accuracy =  0.6165

precision =  0.5983454330575722

recall =  0.7088

f1 score =  0.6489059782111141

FinBERT(fine tuned on 1.6m tweets)

accuracy =  0.6124

precision =  0.5935419440745673

recall =  0.7132

f1 score =  0.6478924418604652

In [None]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of the local devices whose type is 'GPU'."""
    gpu_names = []
    for device_proto in device_lib.list_local_devices():
        if device_proto.device_type == 'GPU':
            gpu_names.append(device_proto.name)
    return gpu_names

In [None]:
get_available_gpus()

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import pipeline
import pandas as pd

# Reload the published checkpoint and wrap it in a text-classification pipeline.
tokenizer_loaded = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
model_loaded = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
nlp = pipeline("text-classification", model=model_loaded, tokenizer=tokenizer_loaded)

# The demo sentences get the same preprocessing as the rest of the notebook
# before they reach the pipeline.
raw_sentences = ['just buy','just sell it','entity rocket to the sky!','go down','even though it is going up, I still think it will not keep this trend in the near future']
sentences = [process_text(raw_sentence) for raw_sentence in raw_sentences]
results = nlp(sentences)
print(results)

[{'label': 'LABEL_1', 'score': 0.9866015315055847}, {'label': 'LABEL_0', 'score': 0.9912450909614563}, {'label': 'LABEL_1', 'score': 0.9932643175125122}, {'label': 'LABEL_0', 'score': 0.9969961643218994}, {'label': 'LABEL_0', 'score': 0.9047526717185974}]


### RobertaForSequenceClassification Structure

In [None]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
model.train()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

### Test pretraining (test only)

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="roberta-base",
    tokenizer="roberta-base"
)
fill_mask("Send these <mask> back!")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

[{'score': 0.16661493480205536,
  'sequence': 'Send these pictures back!',
  'token': 3493,
  'token_str': ' pictures'},
 {'score': 0.10792786628007889,
  'sequence': 'Send these photos back!',
  'token': 2356,
  'token_str': ' photos'},
 {'score': 0.07670937478542328,
  'sequence': 'Send these emails back!',
  'token': 5575,
  'token_str': ' emails'},
 {'score': 0.04860762506723404,
  'sequence': 'Send these images back!',
  'token': 3156,
  'token_str': ' images'},
 {'score': 0.04841742664575577,
  'sequence': 'Send these letters back!',
  'token': 5430,
  'token_str': ' letters'}]

In [None]:
!git clone https://github.com/cardiffnlp/tweeteval /content/tmp/tweeteval

from transformers import RobertaTokenizer, RobertaForMaskedLM

# Load the stock roberta-base tokenizer and masked-LM model for the pretraining test.
# Note: this rebinds `tokenizer` and `model`, names that earlier cells also assign.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

Cloning into '/content/tmp/tweeteval'...
remote: Enumerating objects: 367, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 367 (delta 21), reused 13 (delta 4), pack-reused 318[K
Receiving objects: 100% (367/367), 10.79 MiB | 17.41 MiB/s, done.
Resolving deltas: 100% (112/112), done.


In [None]:
from transformers import LineByLineTextDataset

# Build the pretraining dataset from the cloned TweetEval "hate" training text.
# Note: this rebinds `dataset`, which an earlier cell uses for the TensorDataset.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/content/tmp/tweeteval/datasets/hate/train_text.txt",
    block_size=512,
)



In [None]:

from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the masked-LM test run: 25 epochs, batch size 48
# per device, a checkpoint every 500 steps with at most two kept, and any
# existing contents of the output directory overwritten.
training_args = TrainingArguments(
    output_dir="./roberta-retrained",
    overwrite_output_dir=True,
    num_train_epochs=25,
    per_device_train_batch_size=48,
    save_steps=500,
    save_total_limit=2,
    seed=1
)

# Wire the masked-LM model, the collator and the line-by-line dataset together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [None]:
trainer.train()

In [None]:
# trainer.save_model("./roberta-retrained")