In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import plotly.graph_objects as go
import csv
import urllib.request
import pandas as pd
import preprocessor as p
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Preprocess text
def preprocess(filename):
    """Load a tweet CSV and return a cleaned copy ready for sentiment scoring.

    The CSV must provide the columns ``Date``, ``Name`` and ``Comment``.
    The first five rows are displayed before and after cleaning (``display``
    is the notebook's rich-output helper).

    Cleaning steps, in order:
      1. ``Date`` keeps only the text before the first space.
      2. The ``Name`` column is dropped.
      3. ``Comment`` is passed through ``p.clean``, stripped of every
         character that is neither a word character nor whitespace,
         lower-cased, stripped of digits, and finally filtered against the
         NLTK English stopword list.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file, forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaned ``Date`` and ``Comment`` columns and
        without ``Name``; any other column is passed through untouched.
    """
    raw = pd.read_csv(filename)
    display(raw.head(5))

    # A set gives O(1) membership tests; the stopword list is consulted once
    # for every word of every comment.
    stop = frozenset(stopwords.words('english'))

    def drop_stopwords(text):
        return ' '.join(word for word in text.split() if word not in stop)

    # NOTE(review): punctuation (including apostrophes) is removed before the
    # stopword filter runs, so a stopword that contains an apostrophe can no
    # longer match its stripped form -- confirm this is intended.
    comments = (
        raw.Comment
        .apply(p.clean)                             # twitter links, hashtags etc
        .str.replace(r'[^\w\s]+', '', regex=True)   # punctuation
        .str.lower()
        .str.replace(r'\d+', '', regex=True)        # digits (raw string: '\d' is an invalid escape otherwise)
        .apply(drop_stopwords)
    )

    df = (
        raw
        .drop(columns='Name')                       # usernames are not needed downstream
        .assign(
            Date=raw.Date.str.split(" ").str[0],    # strip the clock time
            Comment=comments,
        )
    )
    display(df.head(5))

    return df

In [3]:
# Load the raw tweet CSV and run it through the cleaning pipeline; preprocess()
# also displays the first rows before and after cleaning.
df = preprocess("data/bitcoin.csv")

Unnamed: 0,Date,Name,Comment
0,2022-02-17 23:57:21+00:00,sportoken,"Bought and Burned 15,700,761 #Sportoken \n\nBu..."
1,2022-02-17 23:55:32+00:00,stacyherbert,My ambition is to translate at least parts int...
2,2022-02-17 23:55:11+00:00,Farida_N,I truly look forward to #Bitcoin22 to speak on...
3,2022-02-17 23:51:56+00:00,Tradermayne,#Bitcoin \n\nWill watch for a reaction around ...
4,2022-02-17 23:51:10+00:00,AltcoinDailyio,"Charlie Munger: ""I’m proud for not investing i..."


Unnamed: 0,Date,Comment
0,2022-02-17,bought burned buy transactions ransactions tra...
1,2022-02-17,ambition translate least parts english bitcoin...
2,2022-02-17,truly look forward speak bitcoin could help ad...
3,2022-02-17,watch reaction around
4,2022-02-17,charlie munger im proud investing cryptoberksh...


In [4]:
# Task names that can be substituted into the model / mapping URLs below:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# The selected task is interpolated into both the model identifier here and
# the label-mapping URL in the next cell.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer matching the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [5]:
# Fetch the label mapping file that belongs to the selected task.
# The notebook's own note on the expected labels: positive, neutral, negative.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# The file is parsed as tab-separated rows; the second field of every row that
# has at least two fields is kept as the label name.
labels = []
for fields in csv.reader(mapping_lines, delimiter='\t'):
    if len(fields) > 1:
        labels.append(fields[1])

In [6]:
# TF: load the TensorFlow sequence-classification model for the checkpoint
# named by MODEL.
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# Write the model and the tokenizer files into a local directory whose path is
# the MODEL string itself (the recorded output lists files under that path).
# NOTE(review): presumably so that later from_pretrained(MODEL) calls can be
# served from disk -- confirm.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading tf_model.h5:   0%|          | 0.00/478M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


('cardiffnlp/twitter-roberta-base-sentiment\\tokenizer_config.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\special_tokens_map.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\vocab.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\merges.txt',
 'cardiffnlp/twitter-roberta-base-sentiment\\added_tokens.json',
 'cardiffnlp/twitter-roberta-base-sentiment\\tokenizer.json')

In [7]:
def evaluate(df, batch_size):
    """Classify every comment in ``df`` and store the result in ``df['sentiments']``.

    Comments are sent through the notebook-level ``tokenizer`` and ``model``
    in consecutive batches of at most ``batch_size`` rows. For each row the
    index of the highest-scoring class is shifted by -1, so the stored values
    are -1 (negative), 0 (neutral) and 1 (positive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Comment`` column of strings. It is modified in place:
        a ``sentiments`` column is added (or overwritten).
    batch_size : int
        Maximum number of comments per model call; must be positive.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    comments = df.Comment.tolist()
    sentiments = []

    # Stepping by batch_size never yields an empty batch, including when
    # len(df) is an exact multiple of batch_size (the tokenizer cannot encode
    # an empty list).
    for start in range(0, len(comments), batch_size):
        batch = comments[start:start + batch_size]
        encoded_input = tokenizer(batch, return_tensors='tf', padding=True)

        output = model(encoded_input)
        scores = output[0].numpy()
        # Softmax is strictly increasing per row, so the arg-max of the raw
        # scores is the same as the arg-max of the probabilities.
        index = np.argmax(scores, axis=1)
        # neg: -1, neu: 0, pos: 1
        sentiments.append(index - 1)

    # np.hstack refuses an empty list, so an empty frame gets an empty column.
    df['sentiments'] = np.hstack(sentiments) if sentiments else np.empty(0, dtype=int)
    return df

In [8]:
# Score every comment in batches of 512 and preview the first ten rows.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e.
# the run was stopped before finishing; a smaller batch size may be needed --
# confirm.
df = evaluate(df,512)
df.head(10)

KeyboardInterrupt: 

In [None]:
# Average the numeric columns per calendar day. numeric_only=True keeps the
# text column (Comment) out of the aggregation; without it pandas >= 2.0
# raises TypeError when asked for the mean of strings.
df_sentiment = df.groupby(['Date']).mean(numeric_only=True).reset_index()
# Rescale the daily mean sentiment by a fixed factor of 1000.
df_sentiment.sentiments = df_sentiment.sentiments*1000
# Simple moving average over the last `window_size` days; min_periods=1 lets
# the first rows average over however many days are available.
window_size = 10
df_sentiment['SMA_sentiment'] = df_sentiment.sentiments.rolling(window=window_size, min_periods=1).mean()
df_sentiment

In [None]:
# Read the Bitcoin price history and discard the listed price/volume columns
# in a single non-mutating chain.
df_price = (
    pd.read_csv("data/bitcoin_price.csv")
    .drop(columns=['Open', 'High', 'Low', 'Adj Close', 'Volume'])
)
df_price

In [None]:
def plot(x_data, y_data, title, leg_name, *args, bool_return=False):
    """Draw one or two line traces on a single plotly figure.

    Parameters
    ----------
    x_data, y_data :
        Coordinates of the first trace.
    title : str
        Figure title, centred at the top.
    leg_name : str
        Legend entry of the first trace.
    *args :
        Optional second trace given positionally as ``(x, y, legend_name)``.
        When any extra argument is supplied, the first three are used.
    bool_return : bool, default False
        If true the figure is returned; otherwise it is shown and nothing is
        returned.

    Returns
    -------
    The figure object when ``bool_return`` is true, else ``None``.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x_data, y=y_data, name=leg_name))

    if args:
        # Positional triple describing the second trace.
        second_x, second_y, second_name = args[0], args[1], args[2]
        fig.add_trace(go.Scatter(x=second_x, y=second_y, name=second_name))

    title_spec = dict(text=title, y=0.9, x=0.5, xanchor='center', yanchor='top')
    font_spec = dict(family="Courier New, monospace", size=12, color="Black")
    fig.update_layout(
        title=title_spec,
        showlegend=True,
        xaxis_title="",
        yaxis_title="",
        legend_title="",
        font=font_spec,
    )

    if not bool_return:
        fig.show()
        return None
    return fig

In [None]:
# Overlay the smoothed daily sentiment and the closing price on one figure.
# The sentiment series is multiplied by a further 200 here, on top of the
# x1000 applied when df_sentiment was built.
# NOTE(review): presumably to bring it onto the price scale -- confirm.
plot(df_sentiment.Date, df_sentiment.SMA_sentiment*200, "Bitcoin price vs sentiments", "Sentiment Score", df_price.Date, df_price.Close, "Bitcoin Price")