In [None]:
# **Team 1**
# **Balsam Hindi**
# **Lynn Nyazika**
# **Course:** AI 574 - Natural Language Processing (FALL I, 2022)

In [None]:

Cryptocurrency markets are notoriously difficult to predict. Even the most experienced investors can have trouble anticipating market behavior, and the volatile, complex nature of cryptocurrencies makes them hard to model. However, we believe that deep learning models such as convolutional neural networks (CNNs) can potentially provide more accurate predictions. In our project, we customize pre-trained CNN and Transformer models to build models that predict the sentiment of text and relate that sentiment to the behavior of cryptocurrency markets.

The objective of our project is to design a model that uses AI to predict the behavior of cryptocurrency markets (here, specifically Bitcoin) based on the sentiment of crowds on social media (here, specifically Twitter). Our suite of tools will be trained on a dataset of past social media data and market data. Once trained, it will be able to automatically detect market trends and make predictions accordingly. Ideally, our model will have the potential to surpass previous results and provide accurate predictions of Bitcoin market behavior.

Keywords: Bitcoin, market, Twitter, social media


In [None]:
# **Data Collection**


# **Bitcoin Tweets**
# **https://www.kaggle.com/datasets/kaushiksuresh147/bitcoin-tweets**
# **Generate sentiment analysis model for Bitcoin-specific Twitter colloquialism.**
# **Twitter Sentiment Dataset**
# **https://www.kaggle.com/datasets/saurabhshahane/twitter-sentiment-dataset**
# **Generate sentiment analysis model for more general Twitter colloquialism.**
# **Sarcasm on Reddit**
# **https://www.kaggle.com/datasets/danofer/sarcasm**
# **Generate sarcasm detection model for social media context.**
# **Bitcoin Historical Dataset**
# **https://www.kaggle.com/datasets/prasoonkottarathil/btcinusd**
# **Utilized for bitcoin performance by day comparison against sentiment.**

In [None]:
from google.colab import drive

In [None]:
drive.mount("/content/drive")

**Check that a GPU is enabled: the printed device name must be '/device:GPU:0'.**

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

In [None]:
### Required packages

In [None]:
!pip install --upgrade pandas-datareader
#!pip install yfinance --upgrade --no-cache-dir
!pip install yfinance
!pip install fix_yahoo_finance
!pip3 install snscrape
!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
!pip install --upgrade pandas-datareader

**Generic Libraries for Data Processing and Exploration**

In [None]:
#Libraries needed for Data Exploration
import re

import pandas as pd
pd.__version__

import numpy as np
np.__version__

import datetime
import pandas_datareader.data as web
import snscrape.modules.twitter as sntwitter

import matplotlib.pyplot as plt
import plotly.graph_objects as go #ref https://plotly.com/python/candlestick-charts/

# **Libraries for the heavy duty stuff. Huggingface libraries**

In [None]:
!pip install transformers

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from keras.utils.vis_utils import plot_model
from transformers import BertTokenizer, BertModel
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
#from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import AutoTokenizer

from tqdm.notebook import tqdm

# **Please uncomment to run**

In [None]:
#Takes 45minutes to scrape historical data

# Creating list to append tweet data to
#tweets_list = []

# Using TwitterSearchScraper to scrape data and append tweets to list
#for i,tweet in enumerate(sntwitter.TwitterSearchScraper('BTC-USD OR BITCOIN USD, since:2022-07-01 until:2022-08-01').get_items()):
    #if i>100000:
       # break
    #tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.username])
    
# Creating a dataframe from the tweets list above
#tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])

**Please uncomment to run**

In [None]:
#Export file to local system save as csv.

#tweets_df.to_csv("uncleaned_tweets.csv", encoding='utf-8', index=False)

In [None]:
# Load the scraped tweets from Google Drive. Rows are terminated by '\n' and a
# literal "?" is read as a missing value.
tweets_bitcoin = pd.read_csv(
    '/content/drive/MyDrive/Datasets/uncleaned_tweets (1).csv',
    lineterminator='\n',
    na_values="?",
)  # parse_dates=["Datetime"]

In [None]:
# Drop rows with missing values, then check the length.
# dropna() returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves tweets_bitcoin unchanged.
tweets_bitcoin = tweets_bitcoin.dropna()
len(tweets_bitcoin)

Boilerplate code to clean the data:
Reference Code - https://github.com/PushTheEnvelopeAI/Twitter_Stock_Prediction/blob/main/preprocess.py

In [None]:
# Remove emojis and any other non-ASCII characters.
# Only non-ASCII runs are replaced: spaces and ASCII punctuation are kept so that the
# URL, hashtag, mention, '$' and '%' rules in Preprocess_Tweets still have something to
# match. regex=True is passed explicitly because the default differs between pandas versions.
tweets_bitcoin['Text'] = tweets_bitcoin['Text'].str.replace(r'[^\x00-\x7F]+', ' ', regex=True)

#Clean Text

# Ordered cleaning rules used by Preprocess_Tweets: (pattern, replacement, is_regex).
# Order matters: each rule sees the output of every rule before it.
_TWEET_CLEANING_RULES = (
    ## FIX HYPERLINKS
    # Only the link itself is removed; text following a link on the same line is kept.
    (r'https?://\S+', ' ', True),
    (r'www\.\S+', ' ', True),
    ('https', '', False),

    ## FIX INDIVIDUAL SYMBOLS
    (': ', ' ', False),
    (', ', ' ', False),
    ('. ', ' ', False),
    (r'[;\n~]', ' ', True),
    # NOTE(review): 'â€¦' looks like a mis-encoded ellipsis character -- confirm against the data.
    ("[]'â€¦*™|]", '', True),
    (r'[\[()!?"]', '', True),
    ('_', '', False),
    ('w/', ' with ', False),
    ('f/', ' for ', False),

    ## FIX EMOJIS
    # Parentheses, ';' and '_' were already stripped above, so most of these are no-ops.
    (':)', '', False),
    (':-)', '', False),
    (':(', '', False),
    (':-(', '', False),
    ('0_o', '', False),
    (';)', '', False),
    ('=^.^=', '', False),

    ## FIX % SYMBOL
    ('%', ' percent ', False),

    ## FIX & SYMBOL
    (' & ', ' and ', False),
    ('&amp', ' and ', False),
    ('&gt', ' greater than ', False),
    ('q&a', 'question and answer', False),
    ('&', ' and ', False),

    ## FIX USER TAGS AND HASHTAGS
    (r'@[a-z0-9]+', '', True),
    (r'#[a-z0-9]+', '', True),
    ('@', '', False),
    ('#', '', False),

    ## FIX EMBEDDED COMMAS AND PERIODS
    (r'([a-z]),([a-z])', r'\1 \2', True),
    (r'([0-9]),([0-9])', r'\1\2', True),
    (r'([0-9])[+]+', r'\1 ', True),
    (',', '', False),
    ('u.s.', ' us ', False),
    (r'\.{2,}', ' ', True),
    (r'([a-z])\.([a-z])', r'\1 \2', True),
    # Repair a truncated 'pdating' only; an intact 'updating' must not become 'uupdating'.
    (r'(^|[^u])pdating', r'\1updating', True),
    (r'([a-z])\.', r'\1 ', True),
    (r'\.([a-z])', r' \1', True),
    (' . ', ' ', False),

    ## FIX + SYMBOL
    (r'[+]([0-9])', r'positive \1', True),
    (' + ', ' and ', False),
    ('+ ', ' ', False),
    (r'([a-z])[+]([a-z])', r'\1 and \2', True),
    ('+', '', False),

    ## FIX - SYMBOL
    (r'([a-z])[-]+([a-z])', r'\1 \2', True),
    (r'([a-z]) - ([a-z])', r'\1 to \2', True),
    (r'([0-9]) -([0-9\.])', r'\1 to \2', True),
    (r' [-]([0-9])', r' negative \1', True),
    (r'([0-9])-([0-9\.])', r'\1 to \2', True),
    (r'([0-9]) - ([0-9\.])', r'\1 to \2', True),
    (r'([0-9a-z])-([0-9a-z])', r'\1 \2', True),
    (r'[-]+[>]', ' ', True),
    (r' [-]+ ', ' ', True),
    ('-', ' ', False),

    ## FIX $ SYMBOL
    # The digit after '$' is captured and re-emitted so the amount keeps its first digit.
    (r'[$]([0-9\.])', r' dollars \1', True),
    ('$', '', False),

    ## FIX = SYMBOL
    ('=', ' equals ', False),

    ## FIX / SYMBOL
    ('b/c', ' because ', False),
    ('b/out', ' break out ', False),
    ('b/o', ' break out ', False),
    ('p/e', ' pe ratio ', False),
    (r' [/]+ ', ' ', True),
    (' 1/2 ', ' .5 ', False),
    (' 1/4 ', ' .25 ', False),
    (' 3/4 ', ' .75 ', False),
    (' 1/3 ', ' .3 ', False),
    (' 2/3 ', ' .6 ', False),
    (r'[/]{2,}', ' ', True),
    (r'([a-z])/([a-z])', r'\1 and \2', True),
    (r'[0-9]+/[0-9]+/[0-9]+', '', True),
    (r'([0-9]{3,})/([0-9\.]{2,})', r'\1 to \2', True),
    (r'([0-9]{2,})/([0-9\.]{3,})', r'\1 to \2', True),
    (r'[a-z0-9]+/[a-z0-9]+', ' ', True),
    ('/', '', False),

    ## FIX < > SYMBOLS
    (r'[<]+ ', ' ', True),
    ('<', ' less than ', False),
    (r' [>]+', ' ', True),
    ('>', ' greater than ', False),

    ## FIX : SYMBOL
    # Clock times are removed whole (all minute digits, optional am/pm suffix).
    (r'[0-9]+:[0-9]+(?:am|pm)', ' ', True),
    (r'[0-9]+:[0-9]+', ' ', True),
    (':', ' ', False),

    ## FIX UNITS CUSTOMIZED
    ('mrkt', 'market', False),
    (' vol ', ' volume ', False),
    (' ptrend', ' positive trend ', False),
    (' ppl', ' people ', False),
    (' pts', ' points ', False),
    (' pt', ' point ', False),
    (r' l(ol){1,}', ' laugh ', True),
    ('imho', ' in my opinion ', False),
    ('prev ', 'previous ', False),

    (' 1q', ' first quarter ', False),
    (' 2q', ' second quarter ', False),
    (' 3q', ' third quarter ', False),
    (' 4q', ' fourth quarter ', False),
    (' q1', ' first quarter ', False),
    (' q2', ' second quarter ', False),
    (' q3', ' third quarter ', False),
    (' q4', ' fourth quarter ', False),
    (' 10q ', ' form 10 ', False),

    (r'([0-9])million', r'\1 million ', True),
    (r'([0-9])mil', r'\1 million ', True),
    (' mil ', ' million ', False),
    (r'([0-9])billion', r'\1 billion ', True),
    (r'([0-9])cents', r'\1 cents ', True),

    # Split a digit from a letter that directly follows it.
    (r'([0-9])([a-z])', r'\1 \2', True),

    ## FIX EXTRA SPACES
    (r' +', ' ', True),
)


def Preprocess_Tweets(data):
    """Normalize tweet text into a 'Text_Cleaned' column.

    The text in ``data['Text']`` is lower-cased, passed through every rule in
    ``_TWEET_CLEANING_RULES`` in order, and finally stripped of leading and
    trailing spaces and punctuation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named 'Text'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place: the
        'Text_Cleaned' column is added or overwritten; 'Text' is left as is.
    """
    cleaned = data['Text'].str.lower()
    for pattern, replacement, is_regex in _TWEET_CLEANING_RULES:
        cleaned = cleaned.str.replace(pattern, replacement, regex=is_regex)

    ## FIX ENDING PUNCTUATION
    data['Text_Cleaned'] = cleaned.str.strip(' .!?,)(:-')
    return data

In [None]:
# Create the 'Text_Cleaned' column as a copy of 'Text' and preview the frame.
# Preprocess_Tweets rebuilds this column from 'Text' when it is called.
tweets_bitcoin["Text_Cleaned"] = tweets_bitcoin["Text"]
tweets_bitcoin.head(5)

In [None]:
# Preprocess_Tweets writes 'Text_Cleaned' into the frame it receives and returns that
# same frame, so cleaned_text and tweets_bitcoin refer to the same DataFrame.
cleaned_text = Preprocess_Tweets(tweets_bitcoin)
print(cleaned_text)

In [None]:
# Lower-case the cleaned text and preview it.
# Preprocess_Tweets already lower-cases the text, so this only acts as a safeguard.
cleaned_text['Text_Cleaned'] = cleaned_text['Text_Cleaned'].str.lower()
cleaned_text.head(15)

In [None]:
# Character length of each cleaned tweet, plus a histogram of the distribution.
cleaned_lengths = tweets_bitcoin['Text_Cleaned'].str.len()
tweets_bitcoin['len'] = cleaned_lengths
length_axes = tweets_bitcoin['len'].hist()
length_axes.set_xlabel("Tweet length")

In [None]:
# Keep only tweets whose cleaned text is at most 500 characters long.
# NOTE(review): the 512 limit mentioned originally presumably refers to model tokens,
# whereas 'len' counts characters -- confirm the intended unit.
within_limit = tweets_bitcoin['len'] <= 500
tweets_btc_filtrd = tweets_bitcoin[within_limit]
tweets_btc_filtrd.shape

In [None]:
# For every row, print the largest 'len' among the rows that share the same cleaned text.
print(tweets_btc_filtrd.groupby(['Text_Cleaned'])['len'].transform('max'))

In [None]:
# Look up the row holding the longest remaining tweet. Because the frame was
# filtered on len <= 500, this value should be at most 500.
col = "len"
# idxmax() gives the index label of the maximum; .loc fetches that whole row.
max_x = tweets_btc_filtrd.loc[tweets_btc_filtrd[col].idxmax()]
print ("Maximum value of column ", col, " and its corresponding row values:", max_x)


In [None]:
# Drop columns that are no longer needed from tweets_bitcoin.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
# Only tweets_bitcoin is reassigned here; tweets_btc_filtrd was built earlier and
# still carries 'Username' and 'Text'.
tweets_bitcoin = tweets_bitcoin.drop(columns=['Username','Text'], errors='ignore')
tweets_bitcoin.head(5)

In [None]:
# Sample a portion of the remaining tweets.
# - random_state fixes the draw, so a fresh kernel run selects the same tweets.
# - the sample size is capped at the number of available rows, because sample()
#   raises when asked for more rows than exist (without replacement).
BTC_Tweets = tweets_btc_filtrd.sample(n=min(90000, len(tweets_btc_filtrd)), random_state=42)

In [None]:
BTC_Tweets['len'].hist().set_xlabel("Tweet length")

In [None]:
BTC_Tweets.tail(15)

In [None]:
# Start and end dates (as strings) for the price history
start_date = '2020-8-1'
end_date = '2022-8-1'
# Download BTC-USD price data for that period from Yahoo Finance.
# NOTE(review): the 'yahoo' data source is reported to fail in some
# pandas-datareader releases -- confirm it still returns data.
data = web.DataReader("BTC-USD", data_source='yahoo', start=start_date, end=end_date)
# number of rows returned
len(data)

In [None]:
# Label each row by price direction: "negative" when the open is above the close,
# "positive" otherwise (including when open equals close).
closed_lower = data["Open"] > data["Close"]
data['growth'] = np.where(closed_lower, "negative", "positive")
data.head(15)

In [None]:
data['Volume'].plot(figsize=(15,5), title ='Bitcoin-USD Stock Volumes Purchased')

In [None]:
# Candlestick chart of the open/high/low/close prices over the date index.
candles = go.Candlestick(
    x=data.index,
    open=data['Open'],
    high=data['High'],
    low=data['Low'],
    close=data['Close'],
)
fig = go.Figure(data=[candles])
fig.show()

By default this notebook runs the Twitter RoBERTa model. To run FinBERT instead, uncomment the FinBERT line and comment out the RoBERTa line; both models are loaded through the same tokenizer call, as per literature guidelines.

In [None]:
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification

# Checkpoint to load; swap which of the two assignments is commented out to change models.
#model1 = "yiyanghkust/finbert-tone"
model1 = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Classification model and its matching tokenizer, both loaded from the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model1)
tokenizer = AutoTokenizer.from_pretrained(model1)



Pipeline construction

In [None]:
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# Smoke test with the example sentence provided on the model's website, to confirm the pipeline works.
nlp("Covid cases are increasing fast!")

In [None]:
# Sample test from the scraped data set.
# Accessed by position: BTC_Tweets is a random sample, so a specific index label
# (such as 67884) is not guaranteed to be present and would raise a KeyError.
BTC_Tweets['Text_Cleaned'].iloc[0]


In [None]:
# Tokenize one sampled tweet to inspect the encoding.
# Accessed by position: BTC_Tweets is a random sample, so a specific index label
# (such as 67884) is not guaranteed to be present and would raise a KeyError.
encoded_text = tokenizer(BTC_Tweets['Text_Cleaned'].iloc[0])
encoded_text

In [None]:
# Dry run: score one sampled tweet with the sentiment pipeline.
# Accessed by position: BTC_Tweets is a random sample, so a specific index label
# (such as 67884) is not guaranteed to be present and would raise a KeyError.
nlp(BTC_Tweets['Text_Cleaned'].iloc[0])

In [None]:
# Score the cleaned text of each sampled tweet with the sentiment pipeline and store
# the result under the tweet's unique "Tweet Id". Stops after 50,000 successful results.
sent_results = {}
count = 0
for i, d in tqdm(BTC_Tweets.iterrows(), total=len(BTC_Tweets)):
    try:
        # Score 'Text_Cleaned' -- the same column the dry-run cells use -- not the raw 'Text'.
        sent = nlp(d["Text_Cleaned"])
        sent_results[d["Tweet Id"]] = sent
        count += 1
        if count == 50000:
          break
    except RuntimeError as err:
        # Nothing was stored for this tweet, so report its id and the error itself;
        # reading sent_results[...] here would raise a KeyError inside the handler.
        print(f'Failed to run {d["Tweet Id"]}: {err}')


In [None]:
sent_results


Reference code used to make charts
https://github.com/RobMulla/twitch-stream-projects/blob/main/051-stock-sentiment/stock-sentiment.ipynb

In [None]:
# One row per tweet id; column 0 holds the result dict returned by the pipeline.
sent_df = pd.DataFrame(sent_results).T
for field in ("label", "score"):
    # Pull each field out of the result dict into its own column.
    sent_df[field] = sent_df[0].apply(lambda result, key=field: result[key])

# Attach the tweet columns by matching on the tweet id index.
tweets_by_id = tweets_bitcoin.set_index("Tweet Id")
sent_df = sent_df.merge(tweets_by_id, left_index=True, right_index=True)


In [None]:
sent_df.head(50)

In [None]:
sent_df.groupby("label")["score"].plot(kind="hist", bins=50)
plt.legend()
plt.show()

In [None]:
sent_df.head(10)

In [None]:
# Derive a calendar-day value from each tweet's timestamp.
# No fixed format is forced: 'Datetime' comes from the scraped tweet timestamps, which
# carry a time component that '%Y-%m-%d' alone does not describe. utc=True gives a
# single consistent timezone before the date is taken.
sent_df['Date'] = pd.to_datetime(sent_df['Datetime'], utc=True).dt.date

# The tweet id currently lives in the index; keep it as a column before re-indexing
# by date, otherwise set_index would discard it. The guard keeps the cell re-runnable.
if 'Tweet Id' not in sent_df.columns:
    sent_df = sent_df.rename_axis('Tweet Id').reset_index()
sent_df = sent_df.set_index('Date')


In [None]:
# Alternatives that were tried for the daily aggregation:
#sent_df.set_index('Datetime').groupby([pd.Grouper(freq='D'),'score']).mean()
#sent_df.groupby(["Date"]).agg({"score": [pd.Series.mode, "mean"]})
#mode = lambda x: x.mode() if len(x) > 2 else np.array(x)
#sent_df.groupby('Date')['score'].agg(mode)

# Daily sentiment = the first mode of the scores recorded on that date.
scores_by_day = sent_df.groupby('Date')['score']
sent_daily = scores_by_day.agg(lambda day_scores: day_scores.mode()[0])
sent_daily.head(50)
len(sent_daily)

In [None]:
sent_daily.head(10)

In [None]:
# Join the daily sentiment with the price data on their date indexes.
sent_and_stock = sent_daily.to_frame("sentiment").merge(
    data, left_index=True, right_index=True
)

# Sentiment on the left axis, traded volume on a twin right axis.
# `legend` takes a boolean; the legend text is set through `label`. The second
# series is 'Volume', so it is labelled as such rather than "Closing Price".
# NOTE(review): if the closing price was the intended comparison, plot
# sent_and_stock["Close"] instead -- confirm.
ax = sent_and_stock["sentiment"].plot(label="Sentiment", legend=True)
ax2 = ax.twinx()
sent_and_stock["Volume"].plot(ax=ax2, color="orange", label="Volume", legend=True)
plt.show()

In [None]:
sent_and_stock.head(10)

In [None]:
# Export the labelled results to a CSV file in the current working directory.
# index=False means the frame's index is not written to the file.

sent_df.to_csv("Roberta_tweets_reslabld.csv", encoding='utf-8', index=False)