In [1]:
import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.utils import shuffle
from collections import defaultdict
from textwrap import wrap
from pylab import rcParams
import re

from torch import nn, optim
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F





In [2]:
RANDOM_SEED = 69
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


<torch._C.Generator at 0x2ac6d952210>

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
# Reading data using pandas
path_to_data = "Crypto_Sentiment_Dataset.csv"
df = pd.read_csv(path_to_data, encoding = 'raw_unicode_escape', engine ='python',header = None)

# Shuffle and Clip data
df = shuffle(df)
#df = df[:20000]

# Function to clean text. Remove tagged entities, hyperlinks, emojis
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text
 
df['tweet'] = df[3].apply(clean_text)

# Function to convert labels to number.
def sentiment2label(sentiment):
    if sentiment == "Positive":
        return 1
    else :
        return 0

df['sentiment'] = df[4].apply(sentiment2label)

# List of class names.
class_names = ['negative', 'positive']

In [5]:
df['sentiment'].value_counts()

1    302
0    261
Name: sentiment, dtype: int64

In [6]:
print(len(df))
df.head(20)

563


Unnamed: 0,0,1,2,3,4,5,tweet,sentiment
343,QQYCQPQK6WFA,false,,Cryptocurrency is bad for the environment beca...,Negative,https://www.reddit.com/r/explainlikeimfive/com...,Cryptocurrency is bad for the environment beca...,0
381,ADCXHGCQRA34,false,,It's great to get paid for heating a swimming ...,Positive,https://www.reddit.com/r/Bitcoin/comments/uzzp...,It's great to get paid for heating a swimming ...,1
135,WYWGJAYCCA6D,false,,"This is the way. Until the ride back up, go ou...",Positive,https://www.reddit.com/r/Bitcoin/comments/ul5y...,This is the way. Until the ride back up go out...,1
222,AQDRYYFGDJJ7,false,,I donât have the money to invest in it but I...,Positive,https://www.reddit.com/r/AskWomen/comments/s36...,I don t have the money to invest in it but I d...,1
20,TD36G7T2EZEK,false,,"Literally crashing to zero in real time atm, L...",Negative,https://old.reddit.com/r/terraluna/comments/un...,Literally crashing to zero in real time atm Lu...,0
386,3796EY394VG3,false,,Another arrogant prick with an algorithmic sta...,Negative,https://www.reddit.com/r/CryptoCurrency/commen...,Another arrogant prick with an algorithmic sta...,0
550,GAN44MJMXW36,false,,just buy it now and forget about it.\n\ntrust ...,Positive,https://www.reddit.com/r/Bitcoin/comments/ly8z...,just buy it now and forget about it. trust the...,1
480,RZ974APDDTJY,false,,Basically how the ponzi falls. Sure Luna has h...,Negative,https://www.reddit.com/r/CryptoCurrency/commen...,Basically how the ponzi falls. Sure Luna has h...,0
430,XKEQPPFJWHY9,false,,"Price-wise, I think it's a good time to get in...",Positive,https://www.reddit.com/r/CryptoCurrency/commen...,Price wise I think it's a good time to get int...,1
307,GWXY3F6YGEPK,false,,"Bitcoin just had a headstart, and it's issuanc...",Positive,https://www.reddit.com/r/ethereum/comments/uze...,Bitcoin just had a headstart and it's issuance...,1


In [7]:
from transformers import XLNetTokenizer, XLNetModel
PRE_TRAINED_MODEL_NAME = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [8]:
class tweetDataset(Dataset):

    def __init__(self, tweets, targets, tokenizer, max_len):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, item):
        tweet = str(self.tweets[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=False,
        return_attention_mask=True,
        return_tensors='pt',
        )

        input_ids = pad_sequences(encoding['input_ids'], maxlen=MAX_LEN, dtype=torch.Tensor ,truncating="post",padding="post")
        input_ids = input_ids.astype(dtype = 'int64')
        input_ids = torch.tensor(input_ids) 

        attention_mask = pad_sequences(encoding['attention_mask'], maxlen=MAX_LEN, dtype=torch.Tensor ,truncating="post",padding="post")
        attention_mask = attention_mask.astype(dtype = 'int64')
        attention_mask = torch.tensor(attention_mask)       

        return {
        'tweet': tweet,
        'input_ids': input_ids,
        'attention_mask': attention_mask.flatten(),
        'targets': torch.tensor(target, dtype=torch.long)
        }

In [9]:
df_train, df_test = train_test_split(df, test_size=0.5, random_state=101)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=101)
df_train.shape, df_val.shape, df_test.shape

((281, 8), (141, 8), (141, 8))

In [10]:

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = tweetDataset(
    tweets=df.tweet.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=0
  )

In [11]:
BATCH_SIZE = 2
MAX_LEN = 1024
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [12]:
from transformers import XLNetForSequenceClassification
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 2)
model = model.to(device)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [13]:
model

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward

In [14]:

EPOCHS = 6
BATCH_SIZE = 2

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5)
#optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
print("done")

done


In [15]:
data = next(iter(val_data_loader))
data.keys()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


dict_keys(['tweet', 'input_ids', 'attention_mask', 'targets'])

In [16]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
targets = data['targets'].to(device)
print(input_ids.shape)

torch.Size([2, 1, 1024])


In [17]:
print(input_ids.reshape(BATCH_SIZE,1024).shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

torch.Size([2, 1024])
torch.Size([2, 1024])


In [18]:
input_ids[0]

tensor([[9961,   56, 3429,  ...,    0,    0,    0]], device='cuda:0')

In [19]:
outputs = model(input_ids.reshape(BATCH_SIZE,1024), token_type_ids=None, attention_mask=attention_mask, labels=targets)
outputs

XLNetForSequenceClassificationOutput(loss=tensor(0.7754, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0097,  0.1364],
        [ 0.2492, -0.1607]], device='cuda:0', grad_fn=<AddmmBackward0>), mems=(tensor([[[ 0.0372,  0.0792, -0.0861,  ...,  0.0489,  0.0098, -0.0162],
         [-0.0179, -0.0364, -0.0415,  ...,  0.0125,  0.0062,  0.0325]],

        [[-0.0482, -0.0126,  0.0453,  ..., -0.0780, -0.0425,  0.0127],
         [-0.0004, -0.0517,  0.0827,  ...,  0.0242,  0.0469,  0.0494]],

        [[ 0.0818, -0.0357, -0.0359,  ...,  0.0127, -0.0032,  0.0021],
         [-0.0027, -0.0376,  0.0661,  ...,  0.0254,  0.0216,  0.0054]],

        ...,

        [[-0.0049,  0.0655, -0.0151,  ..., -0.0458, -0.0061,  0.0346],
         [-0.0049,  0.0655, -0.0151,  ..., -0.0458, -0.0061,  0.0346]],

        [[-0.0049,  0.0655, -0.0151,  ..., -0.0458, -0.0061,  0.0346],
         [-0.0049,  0.0655, -0.0151,  ..., -0.0458, -0.0061,  0.0346]],

        [[-0.0049,  0.0655, -0.0151,  ..., -0.045

In [20]:
print(type(outputs[0]))
outputs[0].shape

<class 'torch.Tensor'>


torch.Size([])

In [30]:
from sklearn import metrics
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    acc = 0
    counter = 0
  
    for d in data_loader:
        input_ids = d["input_ids"].reshape(2,1024).to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        
        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
        loss = outputs[0]
        logits = outputs[1]

        # preds = preds.cpu().detach().numpy()
        _, prediction = torch.max(outputs[1], dim=1)
        targets = targets.cpu().detach().numpy()
        prediction = prediction.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, prediction)

        acc += accuracy
        losses.append(loss.item())
        
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        counter = counter + 1

    return acc / counter, np.mean(losses)

In [27]:

def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    acc = 0
    counter = 0
  
    with torch.no_grad():
        for d in data_loader:
            #print(d["input_ids"])
            #print(d["input_ids"].shape)
            if str(d["input_ids"].shape) == "torch.Size([4, 1, 1024])":
                #print("correctSize")
                input_ids = d["input_ids"].reshape(4,1024).to(device)
                attention_mask = d["attention_mask"].to(device)
                targets = d["targets"].to(device)

                outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
                loss = outputs[0]
                logits = outputs[1]

                _, prediction = torch.max(outputs[1], dim=1)
                targets = targets.cpu().detach().numpy()
                prediction = prediction.cpu().detach().numpy()
                accuracy = metrics.accuracy_score(targets, prediction)

                acc += accuracy
                losses.append(loss.item())
                counter += 1

    return acc / counter, np.mean(losses)

In [31]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,     
        optimizer, 
        device, 
        scheduler, 
        len(df_train)
    )

    print(f'Train loss {train_loss} Train accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader, 
        device, 
        len(df_val)
    )

    print(f'Val loss {val_loss} Val accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'xlnet_model.bin')
        best_accuracy = val_acc

Epoch 1/6
----------


RuntimeError: shape '[2, 1024]' is invalid for input of size 1024

Using the trained Model

In [24]:
model.load_state_dict(torch.load('xlnet_model.bin'))

<All keys matched successfully>

In [25]:
model = model.to(device)
test_acc, test_loss = eval_model(
  model,
  test_data_loader,
  device,
  len(df_test)
)

print('Test Accuracy :', test_acc)
print('Test Loss :', test_loss)

ZeroDivisionError: division by zero

In [None]:
def get_predictions(model, data_loader):
    model = model.eval()
    
    tweets = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:

            tweets = d["tweet"]
            input_ids = d["input_ids"].reshape(4,1024).to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            
            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)

            loss = outputs[0]
            logits = outputs[1]
            
            _, preds = torch.max(outputs[1], dim=1)

            probs = F.softmax(outputs[1], dim=1)

            tweets.extend(tweets)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return tweets, predictions, prediction_probs, real_values

In [None]:
y_tweet, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
print(classification_report(y_test, y_pred, target_names=class_names))


In [None]:

def predict_sentiment(text):
    tweet = text

    encoded_tweet = tokenizer.encode_plus(
    tweet,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=False,
    return_attention_mask=True,
    return_tensors='pt',
    )

    input_ids = pad_sequences(encoded_tweet['input_ids'], maxlen=MAX_LEN, dtype=torch.Tensor ,truncating="post",padding="post")
    input_ids = input_ids.astype(dtype = 'int64')
    input_ids = torch.tensor(input_ids) 

    attention_mask = pad_sequences(encoded_tweet['attention_mask'], maxlen=MAX_LEN, dtype=torch.Tensor ,truncating="post",padding="post")
    attention_mask = attention_mask.astype(dtype = 'int64')
    attention_mask = torch.tensor(attention_mask) 

    input_ids = input_ids.reshape(1,1024).to(device)
    attention_mask = attention_mask.to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    outputs = outputs[0][0].cpu().detach()

    probs = F.softmax(outputs, dim=-1).cpu().detach().numpy().tolist()
    _, prediction = torch.max(outputs, dim =-1)

    print("Positive score:", probs[1])
    print("Negative score:", probs[0])
    print(f'tweet: {tweet}')
    print(f'Sentiment  : {class_names[prediction]}')

In [None]:
text = "Bitcoin is so in right now"
predict_sentiment(text)

In [None]:
# Reading data using pandas
path_to_data = "Bitcoin_tweets.csv"
dfBitcoin = pd.read_csv(path_to_data, engine ='python')

# Shuffle and Clip data
dfBitcoin = shuffle(dfBitcoin)
dfBitcoin = dfBitcoin[:10000]



In [None]:
# Function to clean text. Remove tagged entities, hyperlinks, emojis
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text
 
dfBitcoin['text'] = dfBitcoin['text'].apply(clean_text)


In [None]:
i = 1
import time
for text in dfBitcoin['text']:
    if i <100 :
        predict_sentiment(text)
        print("-------------------------------------\n")
        i = i+1