Transfer learning using cryptobert and roberta

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, TextClassificationPipeline
from transformers import pipeline
import numpy as np
import pandas as pd
from scipy.special import softmax
import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install torch




In [3]:
!pip install protobuf==3.20.0




## Step 1 - Importing a sample of some Bitcoin Tweet Data to begin analysing the model

In [4]:
# 20230309 - test data from twitter api, from 20221224 to 20230308
data = pd.read_csv('~/Code/giadapi/crypto/data/raw/tweet_24-12-22_to_08-03-23.csv')


In [5]:
data


Unnamed: 0,datetime,username,text
0,2022-12-24 06:09:56+00:00,FerrisBSessions,Looking more spring strength to smash up to $1...
1,2022-12-24 06:09:52+00:00,BNB_Tracker,#BinanceCoin price update: $BNB\n\n#BNB $245.1...
2,2022-12-24 06:09:48+00:00,weeblueghost,"- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak..."
3,2022-12-24 06:09:39+00:00,0xEthereumYoda,#Ethereum price update: \n\n#ETH $1216.65 USD\...
4,2022-12-24 06:09:22+00:00,Blocktopix,"Guys, due to wave count I am pretty sure we wi..."
...,...,...,...
30001,2023-03-09 06:00:01+00:00,HourlyBTCUpdate,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...
30002,2023-03-09 06:00:01+00:00,Mannny23,Popular crypto latest prices:\n $BTC 21751.00\...
30003,2023-03-09 06:00:01+00:00,whalesradar_com,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...
30004,2023-03-09 06:00:00+00:00,croxroadnews,The Wings of #Bitcoin: A Symbol of Strength an...


In [6]:
data['datetime']


0        2022-12-24 06:09:56+00:00
1        2022-12-24 06:09:52+00:00
2        2022-12-24 06:09:48+00:00
3        2022-12-24 06:09:39+00:00
4        2022-12-24 06:09:22+00:00
                   ...            
30001    2023-03-09 06:00:01+00:00
30002    2023-03-09 06:00:01+00:00
30003    2023-03-09 06:00:01+00:00
30004    2023-03-09 06:00:00+00:00
30005    2023-03-09 06:00:00+00:00
Name: datetime, Length: 30006, dtype: object

In [7]:
#only run it if the dataset is from Twitter API
# data['text'] = data[['tweet']]
# data['date'] = data[['created_at']]

#only run if the dataset is tweets_2021_reduced.csv
data['date'] = data['datetime']
for i in range(len(data)):
    data['date'][i] = data['datetime'][i][0:10]


In [9]:
data = data[['text', 'date']]


In [10]:
data


Unnamed: 0,text,date
0,Looking more spring strength to smash up to $1...,2022-12-24
1,#BinanceCoin price update: $BNB\n\n#BNB $245.1...,2022-12-24
2,"- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak...",2022-12-24
3,#Ethereum price update: \n\n#ETH $1216.65 USD\...,2022-12-24
4,"Guys, due to wave count I am pretty sure we wi...",2022-12-24
...,...,...
30001,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...,2023-03-09
30002,Popular crypto latest prices:\n $BTC 21751.00\...,2023-03-09
30003,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...,2023-03-09
30004,The Wings of #Bitcoin: A Symbol of Strength an...,2023-03-09


## Step 2 - Cleaning the data

In [11]:
# I have changed this to remove more information

def preprocess(text):
    new_text = []
    text = str(text)
    text = text.replace("\n", " ")
    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


In [12]:
#Create a dummy data
data['process_text'] = data.text
data['negative_bert'] = data.text
data['neutral_bert'] = data.text
data['positive_bert'] = data.text

#use the preprocess_2 to clean the data
data['process_text'] = data['text'].apply(preprocess)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['process_text'] = data.text


In [13]:
data


Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,Looking more spring strength to smash up to $1...,2022-12-24,Looking more spring strength to smash up to $1...,Looking more spring strength to smash up to $1...,Looking more spring strength to smash up to $1...,Looking more spring strength to smash up to $1...
1,#BinanceCoin price update: $BNB\n\n#BNB $245.1...,2022-12-24,#BinanceCoin price update: $BNB #BNB $245.14 ...,#BinanceCoin price update: $BNB\n\n#BNB $245.1...,#BinanceCoin price update: $BNB\n\n#BNB $245.1...,#BinanceCoin price update: $BNB\n\n#BNB $245.1...
2,"- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak...",2022-12-24,"- BTC price: $16,823 / £14,057 59.44 Naks p...","- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak...","- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak...","- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak..."
3,#Ethereum price update: \n\n#ETH $1216.65 USD\...,2022-12-24,#Ethereum price update: #ETH $1216.65 USD #B...,#Ethereum price update: \n\n#ETH $1216.65 USD\...,#Ethereum price update: \n\n#ETH $1216.65 USD\...,#Ethereum price update: \n\n#ETH $1216.65 USD\...
4,"Guys, due to wave count I am pretty sure we wi...",2022-12-24,"Guys, due to wave count I am pretty sure we wi...","Guys, due to wave count I am pretty sure we wi...","Guys, due to wave count I am pretty sure we wi...","Guys, due to wave count I am pretty sure we wi..."
...,...,...,...,...,...,...
30001,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...,2023-03-09,Bitcoin: $21739.58 💔 -1.72 last 1 Hour (-0.01%...,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...
30002,Popular crypto latest prices:\n $BTC 21751.00\...,2023-03-09,Popular crypto latest prices: $BTC 21751.00 ...,Popular crypto latest prices:\n $BTC 21751.00\...,Popular crypto latest prices:\n $BTC 21751.00\...,Popular crypto latest prices:\n $BTC 21751.00\...
30003,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...,2023-03-09,#CTXCUSDT #CTXC Signal #4 Last Signal: 119...,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...
30004,The Wings of #Bitcoin: A Symbol of Strength an...,2023-03-09,The Wings of #Bitcoin: A Symbol of Strength an...,The Wings of #Bitcoin: A Symbol of Strength an...,The Wings of #Bitcoin: A Symbol of Strength an...,The Wings of #Bitcoin: A Symbol of Strength an...


## Step 3 - analyse the language and sentiments by pretrained model

In [17]:
!pyenv local crypto


In [18]:
#Model 2b - Use the bert model, with full scores
MODEL_bert = f"ElKulako/cryptobert"
tokenizer_bert = AutoTokenizer.from_pretrained(MODEL_bert)
tokenizer_bert.model_max_length = 512 #solve the error: RuntimeError: The expanded size of the tensor (562) must match the existing size (514) at non-singleton dimension
config_bert = AutoConfig.from_pretrained(MODEL_bert)


# PT
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL_bert)
model_bert.config.max_position_embeddings = 512


In [19]:
def scores_bert(sample_text):
    encoded_input_bert = tokenizer_bert(sample_text, return_tensors='pt')
    output_bert = model_bert(**encoded_input_bert)
    scores_bert = output_bert[0][0].detach().numpy()
    scores_bert = softmax(scores_bert) #1st score is negative, 2nd score is netural, 3rd score is positive
    return scores_bert


In [20]:
data['text'] = data['process_text'].apply(scores_bert)


In [21]:
data


Unnamed: 0,text,date,process_text,negative_bert,neutral_bert,positive_bert
0,"[0.0029336608, 0.39226294, 0.6048034]",2022-12-24,Looking more spring strength to smash up to $1...,Looking more spring strength to smash up to $1...,Looking more spring strength to smash up to $1...,Looking more spring strength to smash up to $1...
1,"[0.0016108917, 0.55053204, 0.44785705]",2022-12-24,#BinanceCoin price update: $BNB #BNB $245.14 ...,#BinanceCoin price update: $BNB\n\n#BNB $245.1...,#BinanceCoin price update: $BNB\n\n#BNB $245.1...,#BinanceCoin price update: $BNB\n\n#BNB $245.1...
2,"[0.0054851808, 0.8168942, 0.1776206]",2022-12-24,"- BTC price: $16,823 / £14,057 59.44 Naks p...","- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak...","- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak...","- \nBTC price: $16,823 / £14,057 \n\n59.44 Nak..."
3,"[0.11056101, 0.35159186, 0.5378471]",2022-12-24,#Ethereum price update: #ETH $1216.65 USD #B...,#Ethereum price update: \n\n#ETH $1216.65 USD\...,#Ethereum price update: \n\n#ETH $1216.65 USD\...,#Ethereum price update: \n\n#ETH $1216.65 USD\...
4,"[0.00038249855, 0.5679714, 0.43164608]",2022-12-24,"Guys, due to wave count I am pretty sure we wi...","Guys, due to wave count I am pretty sure we wi...","Guys, due to wave count I am pretty sure we wi...","Guys, due to wave count I am pretty sure we wi..."
...,...,...,...,...,...,...
30001,"[0.0010160431, 0.8708345, 0.1281495]",2023-03-09,Bitcoin: $21739.58 💔 -1.72 last 1 Hour (-0.01%...,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...,Bitcoin: $21739.58\n💔 -1.72 last 1 Hour (-0.01...
30002,"[0.003519426, 0.9071207, 0.08935987]",2023-03-09,Popular crypto latest prices: $BTC 21751.00 ...,Popular crypto latest prices:\n $BTC 21751.00\...,Popular crypto latest prices:\n $BTC 21751.00\...,Popular crypto latest prices:\n $BTC 21751.00\...
30003,"[0.00052115606, 0.96445435, 0.035024412]",2023-03-09,#CTXCUSDT #CTXC Signal #4 Last Signal: 119...,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...,#CTXCUSDT #CTXC \nSignal #4 \n\nLast Signal: ...
30004,"[0.007387602, 0.53222805, 0.46038437]",2023-03-09,The Wings of #Bitcoin: A Symbol of Strength an...,The Wings of #Bitcoin: A Symbol of Strength an...,The Wings of #Bitcoin: A Symbol of Strength an...,The Wings of #Bitcoin: A Symbol of Strength an...


In [22]:
for i in range(len(data)):
    data['negative_bert'][i] = data['text'][i][0]
    data['neutral_bert'][i] = data['text'][i][1]
    data['positive_bert'][i] = data['text'][i][2]


In [23]:
data = data[['date', 'process_text', 'negative_bert', 'neutral_bert','positive_bert']]


In [24]:
data


Unnamed: 0,date,process_text,negative_bert,neutral_bert,positive_bert
0,2022-12-24,Looking more spring strength to smash up to $1...,0.002934,0.392263,0.604803
1,2022-12-24,#BinanceCoin price update: $BNB #BNB $245.14 ...,0.001611,0.550532,0.447857
2,2022-12-24,"- BTC price: $16,823 / £14,057 59.44 Naks p...",0.005485,0.816894,0.177621
3,2022-12-24,#Ethereum price update: #ETH $1216.65 USD #B...,0.110561,0.351592,0.537847
4,2022-12-24,"Guys, due to wave count I am pretty sure we wi...",0.000382,0.567971,0.431646
...,...,...,...,...,...
30001,2023-03-09,Bitcoin: $21739.58 💔 -1.72 last 1 Hour (-0.01%...,0.001016,0.870835,0.128149
30002,2023-03-09,Popular crypto latest prices: $BTC 21751.00 ...,0.003519,0.907121,0.08936
30003,2023-03-09,#CTXCUSDT #CTXC Signal #4 Last Signal: 119...,0.000521,0.964454,0.035024
30004,2023-03-09,The Wings of #Bitcoin: A Symbol of Strength an...,0.007388,0.532228,0.460384


## Step 4: Count the total number of labels/scores (positive, negative vs neutral) by date

In [25]:
grouped_data = pd.DataFrame(data.groupby(['date'])[['negative_bert', 'neutral_bert', 'positive_bert']].sum().fillna(0).reset_index())
grouped_data = grouped_data.rename_axis("", axis="columns")


In [26]:
grouped_data


Unnamed: 0,date,negative_bert,neutral_bert,positive_bert
0,2022-12-24,45.360382,215.001648,146.637955
1,2022-12-25,34.899250,224.776840,152.323975
2,2022-12-26,43.570820,230.113907,134.315262
3,2022-12-27,25.987583,213.418427,155.594101
4,2022-12-28,29.718109,242.958145,151.323532
...,...,...,...,...
71,2023-03-05,34.170471,179.996368,113.833221
72,2023-03-06,43.931866,208.205460,154.862778
73,2023-03-07,48.533157,219.653091,146.813705
74,2023-03-08,36.966976,213.190887,161.842041


## Step 5 - Download the data

In [27]:
start_date = grouped_data['date'][0]
# start_date_str = datetime.datetime.strftime(start_date, "%Y-%m-%d")
start_date


'2022-12-24'

In [28]:
end_date = grouped_data['date'][len(grouped_data)-1]
# end_date_str = datetime.datetime.strftime(end_date, "%Y-%m-%d")
end_date


'2023-03-09'

In [29]:
file_name = f"{start_date}_{end_date}_twitter_comments.csv"
data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")


In [30]:
file_name = f"{start_date}_{end_date}_twitter_transferlearning.csv"
grouped_data.to_csv(f"~/code/giadapi/crypto/data/processed/{file_name}")
