In [1]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, the CPU
# otherwise. The chosen device is stored in `device` for the cells below.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2060


In [2]:
from torch.utils.data import TensorDataset, random_split,SequentialSampler,DataLoader

In [3]:
import pandas as pd

In [4]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification: the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device chosen in the first cell. An unconditional
# model.cuda() raises on a machine without a GPU, which would defeat the CPU
# fallback set up there. Module.to() returns the module, so the cell still
# displays the model summary.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
# AdamW is taken from the Hugging Face `transformers` package here (see the
# import next to the model), not from torch.optim. The "W" is believed to
# stand for the "weight decay fix".
# Per the original author's notes: lr=2e-5 is the value used for fine-tuning
# (library default 5e-5) and eps=1e-8 is the library default.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [8]:
from transformers import BertTokenizer

In [9]:
# Path of the checkpoint file that the next cell reads with torch.load.
# NOTE(review): the leading "." makes this a dot-file in the working directory;
# confirm it is intended and not a typo for "./model_bert_ft_bitcoin.pth".
output_file = ".model_bert_ft_bitcoin.pth"

In [10]:
# Restore a previously saved model so work on it can continue.
# The checkpoint is read as a mapping with 'model_state_dict' and
# 'optimizer_state_dict' entries; map_location='cpu' loads its tensors into CPU
# memory first, and load_state_dict then copies them into the existing model
# and optimizer objects.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
checkpoint = torch.load(output_file, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [11]:
# Load the comment dump. Only the columns at positions 8 and 9 are read; the
# first row of the file supplies the column names (the cells below rely on one
# of them being `update_Comments`).
# The path uses forward slashes so it resolves on Windows, Linux and macOS
# alike; hard-coded backslashes only work on Windows.
bitcoin = pd.read_csv(
    "../new_data_format/new_data_format/2017-2020.csv",
    header=0,
    usecols=[8, 9],
)

In [12]:
# Dimensions of the loaded frame as (rows, columns).
bitcoin.shape

(59584, 2)

In [13]:
# Preview the first rows to confirm the two selected columns loaded as expected.
bitcoin.head()

Unnamed: 0,CommentsTime,update_Comments
0,2020/12/28,Could have bought it in March this year for su...
1,2020/12/28,"It’s the start of the next bull cycle, do your..."
2,2020/12/28,"Based on my experience, it's not really wise t..."
3,2020/12/28,What about people who bought Bitcoin when its ...
4,2020/12/28,buying Bitcoin at $ 27k is not an idiot but so...


In [14]:
# Report how many rows (one comment per row) were loaded, with thousands separators.
print(f'Number of test sentences: {len(bitcoin):,}\n')

Number of test sentences: 59,584



In [15]:
# Keep only the comment-text column; this Series is what gets scored below.
bitcoin_sentence = bitcoin["update_Comments"]

In [16]:
# Number of comments in the selected column, as a 1-tuple.
bitcoin_sentence.shape

(59584,)

In [17]:
# Show the complete, untruncated text of the entry labelled 0.
bitcoin_sentence[0]

'Could have bought it in March this year for sub $4000.Downside risk is much greater now than it was earlier this year. Is it the mentally challenged that are getting fleeced right now. '

In [18]:
import numpy as np

In [19]:
# Work on a small sample: keep only the first 10 comments (positional slice).
# NOTE: this rebinds `bitcoin_sentence`, so the full column is no longer
# reachable under this name after the cell has run.
bitcoin_sentence = bitcoin_sentence.iloc[:10]

In [20]:
# Display the 10-comment sample that will be scored.
bitcoin_sentence

0    Could have bought it in March this year for su...
1    It’s the start of the next bull cycle, do your...
2    Based on my experience, it's not really wise t...
3    What about people who bought Bitcoin when its ...
4    buying Bitcoin at $ 27k is not an idiot but so...
5    So do you mean a people buying BTC in last ATH...
6    The one rule I did read about in the investors...
7    Whoever buying bitcoin at $27k certainly not o...
8    Maybe the correct term would be "late" since t...
9    Says the same people who refused to buy at the...
Name: update_Comments, dtype: object

In [21]:
def score(model, bitcoin_sentence, tokenizer=None, max_length=64, batch_size=32):
    """Return the mean signed sentiment score of a collection of sentences.

    Every sentence is tokenized, run through ``model`` in evaluation mode, and
    its two logits are converted to probabilities with a softmax. A sentence
    contributes ``-p[0]`` when class 0 is strictly more probable than class 1
    and ``+p[1]`` otherwise (ties count as class 1). The function returns the
    arithmetic mean of these contributions, so each contribution lies in
    ``[-1, -0.5)`` or ``[0.5, 1]``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, token_type_ids=None, attention_mask=mask)``.
        Element ``[0]`` of its output must be a ``(batch, 2)`` logits tensor.
        ``model.eval()`` is called as a side effect.
    bitcoin_sentence : iterable of str
        Sentences to score, e.g. a pandas Series or a list. Missing entries
        (NaN / None) are scored as the empty string.
    tokenizer : callable, optional
        Tokenizer invoked with the Hugging Face ``tokenizer(list_of_str, ...)``
        signature; it must return a mapping with ``'input_ids'`` and
        ``'attention_mask'`` tensors. When omitted, the ``'bert-base-uncased'``
        ``BertTokenizer`` is loaded on each call.
    max_length : int, optional
        Length every sentence is padded or truncated to (default 64).
    batch_size : int, optional
        Number of sentences per forward pass (default 32).

    Returns
    -------
    float
        The mean signed score, or ``nan`` when there are no sentences.
    """
    # Missing comments are scored as empty strings. Building a plain list also
    # lets callers pass any iterable, not only a pandas Series.
    sentences = ["" if pd.isna(text) else text for text in bitcoin_sentence]
    if not sentences:
        # Nothing to average: report NaN instead of failing on an empty batch.
        return float("nan")

    if tokenizer is None:
        # Imported lazily so callers that inject a tokenizer do not need the
        # `transformers` package or a download.
        from transformers import BertTokenizer

        # Load the BERT tokenizer.
        print('Loading BERT tokenizer...')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # Tokenize all sentences in one call and map the tokens to their word IDs.
    encoded = tokenizer(
        sentences,
        add_special_tokens=True,      # Add '[CLS]' and '[SEP]'.
        max_length=max_length,        # Pad & truncate all sentences.
        truncation=True,
        padding='max_length',
        return_attention_mask=True,   # Mask separates padding from real tokens.
        return_tensors='pt',          # Return PyTorch tensors.
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    # shuffle=False keeps the batches in input order.
    prediction_dataloader = DataLoader(
        TensorDataset(input_ids, attention_masks),
        batch_size=batch_size,
        shuffle=False,
    )

    print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

    # Put the model in evaluation mode (disables dropout).
    model.eval()

    # Send inputs to wherever the model's weights live, rather than relying on
    # a notebook-level `device` variable that may disagree with the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    signed_scores = []
    # No gradients are needed for prediction; this saves memory and time.
    with torch.no_grad():
        for b_input_ids, b_input_mask in prediction_dataloader:
            outputs = model(b_input_ids.to(model_device),
                            token_type_ids=None,
                            attention_mask=b_input_mask.to(model_device))

            # outputs[0] holds the logits; softmax over the class dimension.
            probabilities = torch.softmax(outputs[0], dim=1)
            class0, class1 = probabilities[:, 0], probabilities[:, 1]

            # Signed confidence: -P(class 0) if class 0 wins, else +P(class 1).
            signed_scores.append(torch.where(class0 > class1, -class0, class1).cpu())

    print('    DONE.')

    # Average in float64 to limit rounding error over many sentences.
    return torch.cat(signed_scores).double().mean().item()

In [22]:
# Score the 10-comment sample; `testy` holds the mean signed sentiment returned by score().
testy = score(model, bitcoin_sentence)

Loading BERT tokenizer...
Predicting labels for 10 test sentences...
    DONE.


In [23]:
# Display the averaged sentiment score for the sample.
testy

-0.3463104128837585