<a href="https://colab.research.google.com/github/Lanwei02/Bitcoin-MarketPrice-Prediction/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


pd.set_option('max_colwidth', 500)

In [2]:
!pip install transformers bert torch



In [3]:
# Setup root directory
GET_DATA_FROM_COLAB=True
if GET_DATA_FROM_COLAB:
  from google.colab import drive
  drive.mount('/content/gdrive', force_remount=True)
  root_path = '/content/gdrive/My Drive/Geekon2020' 
else: # Local machine
    root_path ='' 
os.chdir(root_path)
!pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/Geekon2020


In [4]:
senti_data = pd.read_csv('sentiment_hbw.csv',
                         names = ['deal_uuid', 'permalink', 'grt_l1_cat_name', 'grt_l2_cat_name', 'grt_l3_cat_name', 'grt_l4_cat_name',
                     'pds_cat_name', 'review', 'dwh_updated_at', 'int_value', 'sentiment_type', 'sentiment_score','user_id', 'client_platform'],
                         delimiter='\t', index_col=False,
                         na_values = ['NA', 'N/A', '\\N'])

### PreProcessing

In [5]:
def preprocessing(data):

  # Rename
  data.rename(columns = {'int_value': 'star_rating', 'dwh_updated_at': 'update_timestamp'}, inplace = True)
  
  # Drop NULL Label
  data = data[~data['star_rating'].isna()]

  # Drop 0 star rating deals
  data = data[data['star_rating']>0]

  # Correct data type
  data['update_timestamp'] = pd.to_datetime(data['update_timestamp'])
  data['star_rating'] = data['star_rating'].astype(object)

  # Impute NULL review with empty string
  data['review'].fillna('', inplace = True)



  return data


senti_data_c = senti_data.copy()
senti_data_c = preprocessing(senti_data_c)


In [13]:
senti_data_c_test = senti_data_c.head(1280)

### Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

X = senti_data_c['review'].values
y = senti_data_c['star_rating'].values

# 70% train, 20% validation, 10% test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=2020, stratify = y)

In [7]:
X = X_val
y = y_val

X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=0.333, random_state=2020, stratify = y)

### BERT Tokenizer

In [8]:
from transformers import BertTokenizer
import torch

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, max_len):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,                  # Max length to truncate/pad
            truncation=True,
            padding='max_length',         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [10]:
# Specify `MAX_LEN`
MAX_LEN = 100

# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X_train[0]], MAX_LEN)[0].squeeze().numpy())
print('Original: ', X_train[0])
print('\nToken IDs: ', token_ids)

Original:  Super clean, friendly and professional. \nLoved my facial

Token IDs:  [101, 3565, 4550, 1010, 5379, 1998, 2658, 1012, 1032, 17953, 21818, 2094, 2026, 13268, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
# # Run function `preprocessing_for_bert` on the train set and the validation set
# print('Tokenizing data...')
# train_inputs, train_masks = preprocessing_for_bert(X_train, MAX_LEN)
# val_inputs, val_masks = preprocessing_for_bert(X_val, MAX_LEN)
# test_inputs, test_masks = preprocessing_for_bert(X_test, MAX_LEN)

Tokenizing data...


In [12]:
# train_inputs_test, train_masks_test = preprocessing_for_bert(senti_data_c_test['review'].to_numpy(), MAX_LEN)

In [29]:
# with open('preprocessing_for_bert.pt', 'wb') as handle:
#     pickle.dump([train_inputs, train_masks, val_inputs, val_masks, test_inputs, test_masks, train_inputs_test, train_masks_test], handle)

In [10]:
import pickle
with open('preprocessing_for_bert.pt', 'rb') as handle:
    All = pickle.load(handle)

train_inputs, train_masks, val_inputs, val_masks, test_inputs, test_masks, train_inputs_test, train_masks_test = All[0], All[1], All[2], All[3], All[4], All[5], All[6], All[7]

In [11]:
train_inputs_test = train_inputs[:1280]
train_masks_test = train_masks[:1280]

### Create PyTorch DataLoader

In [14]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train.astype(int))
val_labels = torch.tensor(y_val.astype(int))
test_labels = torch.tensor(y_test.astype(int))

train_labels_test = torch.tensor(senti_data_c_test['star_rating'].to_numpy().astype(int))

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32. # But 64 is quicker?
batch_size = 64

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)


# Create the DataLoader for our test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


# Create the DataLoader for our testing code
train_data_test = TensorDataset(train_inputs_test, train_masks_test, train_labels_test)
train_sampler_test = RandomSampler(train_data_test)
train_dataloader_test = DataLoader(train_data_test, sampler=train_sampler_test, batch_size=batch_size)

### Create BertClassifier

In [15]:
if not os.path.exists("Models"):
  os.makedirs("Models")

In [16]:
epochs = 1
num_classes = len(senti_data_c['star_rating'].unique())

In [17]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import  BertModel, BertTokenizer
from torch.utils.data import DataLoader
import torch.optim as optim
import os

In [18]:
# import torch

# if torch.cuda.is_available():       
#     device = torch.device("cuda")
#     print(f'There are {torch.cuda.device_count()} GPU(s) available.')
#     print('Device name:', torch.cuda.get_device_name(0))

# else:
#     print('No GPU available, using the CPU instead.')
#     device = torch.device("cpu")

In [19]:
# import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

In [20]:
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# outputs = model(**inputs)
# pooler_outputs = outputs.pooler_output
# last_hidden_states = outputs.last_hidden_state

In [21]:
import torch.nn as nn
from transformers import BertModel
import torch

class SentimentClassifier(nn.Module):
    def __init__(self, num_classes, freeze_bert = True):
        super(SentimentClassifier, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        # self.device = device

        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.cls_layer = nn.Linear(768, num_classes)

    def forward(self, seq, attn_masks):

        #Feeding the input to BERT model to obtain contextualized representations
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        #Obtaining the representation of [CLS] head
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits
        

In [22]:
net = SentimentClassifier(num_classes, freeze_bert=False).cuda()

In [23]:
totalDatasetSize = len(senti_data_c)
symbols = senti_data_c.groupby('star_rating')
scores_dist = []
for i in range(num_classes):
    scores_dist.append(len(symbols.groups[i+1])/totalDatasetSize)

weights = torch.tensor(scores_dist).cuda()

# Setting the Loss function and Optimizer.
loss_func = nn.NLLLoss(weight=weights)
opti = optim.Adam(net.parameters(), lr = 2e-5)

softmax = nn.LogSoftmax(dim=1)

In [24]:
def get_accuracy(logits, labels):
    # get the index of the max value in the row.
    predictedClass = logits.max(dim = 1)[1]

    # get accuracy by averaging over entire batch.
    acc = (predictedClass == labels).float().mean()
    return acc

def evaluate(net, loss_func, dataloader, config):
  net.eval()

  mean_acc, mean_loss = 0, 0
  count = 0

  with torch.no_grad():
      for seq, attn_masks, labels in dataloader:
          seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

          logits = net(seq, attn_masks)
          mean_loss += loss_func(m(logits), labels)
          mean_acc += get_accuracy(m(logits), labels)
          print("Validation iteration", count+1)
          count += 1

          '''
          The entire validation set was around 0.1 million entries,
          the validationFraction param controls what fraction of the shuffled
          validation set you want to validate the results on.
          '''
          if count > config["validationFraction"] * len(val_set):
              break

  return mean_acc / count, mean_loss / count   

In [25]:
best_acc = 0
for ep in range(epochs):
    net.train()
    for it, (seq, attn_masks, labels) in enumerate(train_dataloader_test ):
        seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
        opti.zero_grad()

        logits = net(seq, attn_masks)
        loss = loss_func(softmax(logits), labels)

        loss.backward()
        opti.step()
        print("Iteration: ", it+1)

        if (it + 1) % config["printEvery"] == 0:
            acc = get_accuracy(m(logits), labels)

            # Since a single epoch could take well over hours, we regularly save the model even during evaluation of training accuracy.
            torch.save(net.state_dict(), os.path.join(config["outputFolder"], config["outputFileName"]))
            print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))

    # perform validation at the end of an epoch.
    # iteration time - len(train_loader)
    val_acc, val_loss = evaluate(net, loss_func, val_loader, config)
    print(" Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))
    if val_acc > best_acc:
        print("Best validation accuracy improved from {} to {}, saving model...".format(best_acc, val_acc))
        best_acc = val_acc
        torch.save(net.state_dict(), os.path.join(config["outputFolder"], config["outputFileName"] + "_valTested_" + str(best_acc)))

RuntimeError: ignored

In [27]:
loss 

RuntimeError: ignored

In [26]:
seq.cuda()

RuntimeError: ignored