In [1]:
!pip install transformers

!pip install pytorch-transformers
from pytorch_transformers import XLNetTokenizer,XLNetForSequenceClassification
from pytorch_transformers import AdamW

import numpy as np 
import pandas as pd 

import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
import torch.nn as nn

!pip install tensorflow
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 10.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 39.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.1 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel fo

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')
PATH_elonmusk = "/content/gdrive/My Drive/Colab Notebooks/FinancialEconometrics/data/elonmusk.csv"
PATH_tsla = "/content/gdrive/My Drive/Colab Notebooks/FinancialEconometrics/data/TSLA.csv"

Mounted at /content/gdrive


# Load the data 

#### Load tweet elonmusk: 

In [5]:
df = pd.read_csv(PATH_elonmusk)
df['day'] = pd.to_datetime(df['Timestamp'].apply(lambda date : date[0:10]))

tweet = pd.DataFrame()
tweet['date'] = df['day'].values[1:]
tweet['text'] = df['text'].values[1:]
tweet.head()

Unnamed: 0,date,text
0,2011-12-01,Went to Iceland on Sat to ride bumper cars on ...
1,2011-12-01,I made the volume on the Model S http://ow.ly/...
2,2011-12-03,"Great Voltaire quote, arguably better than Twa..."
3,2011-12-03,That was a total non sequitur btw\n26\n14\n50
4,2011-12-04,Am reading a great biography of Ben Franklin b...


XLNet need [SEP] [CLS] tags at the end of each sentence:

In [6]:
tweet['text'] = tweet['text'] +"[SEP] [CLS]"
tweet.text[0]

"Went to Iceland on Sat to ride bumper cars on ice!  No, not the country, Vlad's rink in Van Nuys. Awesome family fun :)\n30\n24\n188[SEP] [CLS]"

#### Load tesla return: 

In [7]:
df = pd.read_csv(PATH_tsla)
df['date'] = pd.to_datetime(df['date'].apply(lambda x: str(x)[0:4]+"-" + str(x)[4:6]+"-" + str(x)[6:]))
tsla = pd.DataFrame()
tsla['date'] = df['date'].values[1:]
tsla['ret'] = df['RET'].values[1:].astype(float)
tsla.head(10)

Unnamed: 0,date,ret
0,2010-06-30,-0.002511
1,2010-07-01,-0.078473
2,2010-07-02,-0.125683
3,2010-07-06,-0.160938
4,2010-07-07,-0.019243
5,2010-07-08,0.105063
6,2010-07-09,-0.003436
7,2010-07-12,-0.020115
8,2010-07-13,0.06393
9,2010-07-14,0.093716


# Create traning label

In [39]:
import datetime

nb_element = 8000
span = 5
label = []
for i in range(nb_element):
    date = tweet.date.iloc[i]
    while date not in tsla.date.tolist():
        date+= pd.DateOffset(days=1)
        
    mask = (tsla['date'] == date)
    start = tsla.loc[mask].index[0] 
    val = (1+tsla.ret.iloc[start:start + span]).cumprod().values[-1]
    label.append(0 if val < 1 else 1)
print(len(label))

8000


# Tokenize the sentence 

XLNet tokenizer is used to convert our text into tokens that correspond to XLNet’s vocabulary.

In [40]:
tokenizer  = XLNetTokenizer.from_pretrained('xlnet-base-cased',do_lower_case=True)
tokenized_text = [tokenizer.tokenize(sent) for sent in tweet['text']]
ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]

We need to find the max len for padding the rest.


In [41]:
list_len = [len(i) for i in ids]
MAX_LEN = max(list_len)
print("Max len:", MAX_LEN)
input_ids = pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post",padding="post")
print("Len first tokenized text:", len(input_ids[0]))

Max len: 446
Len first tokenized text: 446


# Split train and test dataset

In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [43]:
xtrain,xtest,ytrain,ytest = train_test_split(input_ids[:nb_element],label,test_size=0.15)
Xtrain = torch.tensor(xtrain)
Ytrain = torch.tensor(ytrain)
Xtest = torch.tensor(xtest)
Ytest = torch.tensor(ytest)

In [44]:
batch_size = 8
train_data = TensorDataset(Xtrain.to(device),Ytrain.to(device))
test_data = TensorDataset(Xtest.to(device),Ytest.to(device))
loader = DataLoader(train_data,batch_size=batch_size)
test_loader = DataLoader(test_data,batch_size=batch_size)

# Load model 

In [45]:
from transformers import XLNetForSequenceClassification
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 2)
model = model.to(device)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [46]:
optimizer = AdamW(model.parameters(),lr=2e-5) #We pass model parameters
criterion = nn.CrossEntropyLoss()


In [47]:
def flat_accuracy(preds,labels):  #A function to predict Accuracy
    correct=0
    for i in range(0,len(labels)):
        if(preds[i]==labels[i]):
            correct+=1
    return (correct/len(labels))*100

# Train the model

In [48]:
no_train = 0
epochs = 3
for epoch in range(epochs):
    model.train()
    loss1 = []
    steps = 0
    train_loss = []
    l = []
    for inputs, labels1 in loader:

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs[0],labels1)
        [train_loss.append(p.item()) for p in torch.argmax(outputs[0],axis=1).flatten() ]#our predicted 
        [l.append(z.item()) for z in labels1]# real labels
        loss.backward()
        optimizer.step()
        loss1.append(loss.item())
        no_train += inputs.size(0)
        steps += 1
    print("Current Loss is : {} Step is : {} number of Example : {} Accuracy : {}".format(loss.item(),epoch,no_train,flat_accuracy(train_loss,l)))

Current Loss is : 0.5215259790420532 Step is : 0 number of Example : 6800 Accuracy : 58.5
Current Loss is : 0.5939372777938843 Step is : 1 number of Example : 13600 Accuracy : 59.26470588235294
Current Loss is : 0.5822798609733582 Step is : 2 number of Example : 20400 Accuracy : 59.39705882352941


# Test the model 

In [49]:
model.eval() #Testing our Model
acc = []
lab = []
t = 0
for inp,lab1 in test_loader:
    t+=lab1.size(0)
    outp1 = model(inp)
    [acc.append(p1.item()) for p1 in torch.argmax(outp1[0],axis=1).flatten() ]
    [lab.append(z1.item()) for z1 in lab1]
    
print("Total Examples : {} Accuracy {}".format(t,flat_accuracy(acc,lab)))

Total Examples : 1200 Accuracy 60.25


In [51]:
torch.save(model.state_dict(),  '/content/gdrive/My Drive/Colab Notebooks/FinancialEconometrics/data/model1.pth')