In [1]:
import pandas as pd
from datetime import *
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [39]:
# Load the raw tweet export. low_memory=False turns off chunked parsing so each
# column's dtype is inferred from the whole file.
tweets = pd.read_csv(
    'bitcoin_tweets.csv',
    low_memory=False,
)

In [43]:
# Load the BTC-USD price table, again parsed in a single pass (low_memory=False).
price = pd.read_csv(
    'BTC-USD.csv',
    low_memory=False,
)

In [3]:
# Parse the 'date' column into datetimes. With errors='coerce', any value that
# cannot be parsed becomes NaT instead of raising.
tweets['date'] = pd.to_datetime(arg=tweets['date'], errors='coerce')

In [4]:
# Display the 'date' column to check the result of the datetime conversion.
tweets.loc[:, 'date']

0         2021-02-10 23:59:04
1         2021-02-10 23:58:48
2         2021-02-10 23:54:48
3         2021-02-10 23:54:33
4         2021-02-10 23:54:06
                  ...        
4132330   2022-09-02 19:18:51
4132331   2022-09-02 19:18:26
4132332   2022-09-02 19:18:16
4132333   2022-09-02 19:18:04
4132334   2022-09-02 19:17:48
Name: date, Length: 4132335, dtype: datetime64[ns]

In [5]:
# Collect the index labels of every row whose 'date' value is null.
idx = tweets.loc[tweets['date'].isna()].index

In [6]:
# Remove the rows listed in idx (null dates), then the rows with a missing
# 'text', and finally renumber the index from zero.
tweets = (
    tweets
    .drop(index=idx)
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

In [7]:
# Build a new dataframe that keeps only the columns listed below.
btcTweets = tweets.loc[
    :,
    [
        'date',
        'text',
        'user_location',
        'user_description',
        'user_followers',
        'user_friends',
        'user_verified',
    ],
]

In [59]:
# Append a 'year' column taken from 'date', and replace 'date' with its
# 'YYYY-MM-DD' string form. Both values are computed from the datetime column
# before the assignment happens.
btcTweets = btcTweets.assign(
    year=btcTweets['date'].dt.year,
    date=btcTweets['date'].dt.strftime('%Y-%m-%d'),
)

In [9]:
# Keep only the tweets whose 'year' is 2021.
btcTweets = btcTweets.loc[btcTweets['year'].eq(2021)]

In [10]:
# Pull a random 5% sample into a new dataframe.
# random_state pins the draw: without it every run picks different tweets, so
# the sentiment scores and exported CSVs could not be reproduced.
btcTweetsWork = btcTweets.sample(frac=0.05, random_state=42)

In [11]:
# Order the sampled tweets by their 'date' value, then display the result.
btcTweetsWork = btcTweetsWork.sort_values('date')
btcTweetsWork

Unnamed: 0,date,text,user_location,user_description,user_followers,user_friends,user_verified,year
21519,2021-02-05 10:53:49,#Bitcoin and #ETH both have bullish setups for...,,#Bitcoin #BTC,100.0,388,False,2021
21516,2021-02-05 10:58:03,@JulSwap $juld $bnb #Binance #BSC #BinanceSmar...,Australia,"love interesting startups, stocks and innovati...",2520.0,1085,False,2021
21495,2021-02-05 11:11:19,🔄 Prices update in $USD (1 hour):\n\n$BTC - 37...,,Scans Kraken's main currencies hourly | Also @...,3212.0,1,False,2021
21477,2021-02-05 11:24:21,https://t.co/XRKGaKgDCP\n\nMarketing – weekly ...,www.pool.creamcoin.com,CREAM\nhttps://t.co/k9xpyyYqZs,9677.0,3266,False,2021
21475,2021-02-05 11:25:28,A Possible BIG MOVE FOR #BTC #BITCOIN is comin...,i am living in my cryptoworld,I am a cryptolover. living in my #cryptoworld😀...,741.0,1315,False,2021
...,...,...,...,...,...,...,...,...
1990398,2021-12-30 23:54:35,"Join OKEx with this url below, get 20% commiss...",,,4.0,40.0,False,2021
1990392,2021-12-30 23:54:54,🔥🔥🔥 🐳🚨 Bitcoin Whale Alert: [ TX: 876279b44bea...,,The most advanced #BTC bitcoin tracker and ana...,562.0,0.0,False,2021
1990379,2021-12-30 23:56:36,$BTC\n#BTC update \nhttps://t.co/3nAZtDJN4i ht...,,-Cancer survivor -Covid19 survivor -US Momentu...,453.0,80.0,False,2021
1990377,2021-12-30 23:56:48,"@jackmallers Today or tomorrow, #Litecoin will...",,,30.0,115.0,False,2021


In [12]:
# Renumber the rows after sorting. drop=False (the default) keeps the previous
# index as a regular column named 'index'.
btcTweetsWork = btcTweetsWork.reset_index(drop=False)

In [14]:
# Copy the tweet text of every sampled row into a plain Python list.
btcTwts = btcTweetsWork.loc[:, 'text'].to_list()

In [None]:
# Run VADER sentiment analysis on every tweet and keep only the 'compound'
# score, in the same order as btcTwts.
# The previous loop also printed the entire btcTwts list on each iteration,
# which flooded the notebook output; that debugging print has been removed.
sid = SentimentIntensityAnalyzer()
TweetRating = [sid.polarity_scores(tweet)['compound'] for tweet in btcTwts]

In [17]:
# Pair the tweet texts with their sentiment scores in a dictionary keyed by
# the future column names.
TweetScores = dict(tweets=btcTwts, sentiment=TweetRating)

In [23]:
# Turn the tweets/sentiment dictionary into a two-column dataframe.
TweetScoresDf = pd.DataFrame(data=TweetScores)

In [26]:
# Write the tweet/sentiment table to disk as CSV.
TweetScoresDf.to_csv(
    path_or_buf="Sentiment_Score.csv",
)

In [35]:
# Place the sentiment scores next to the sampled tweets (column-wise concat),
# renumber the rows, then remove the duplicated text column 'tweets' and the
# 'index' column.
FinalTweet = (
    pd.concat([btcTweetsWork, TweetScoresDf], axis=1)
    .reset_index(drop=True)
    .drop(columns=['tweets', 'index'])
)



In [99]:
# Write the combined tweet + sentiment dataframe to disk as CSV.
FinalTweet.to_csv(
    path_or_buf='TweetSentiment.csv',
)

In [44]:
# Parse the price table's 'Date' column into datetimes.
price['Date'] = pd.to_datetime(arg=price['Date'])

In [None]:
# Add two derived columns to the price table:
#   Open_to_Close   - Close minus Open
#   Daily_Variation - absolute difference between High and Low
price['Open_to_Close'] = price['Close'].sub(price['Open'])
price['Daily_Variation'] = price['High'].sub(price['Low']).abs()



In [68]:
# Create a new dataframe with the selected price columns and rename 'Date' to
# 'date'.
priceBTC = (
    price
    .loc[:, ['Date', 'High', 'Open_to_Close', 'Daily_Variation']]
    .rename(columns={'Date': 'date'})
)


In [88]:
# Cast the 'date' column of both frames to datetime64[ns] so the join keys
# share a dtype, then join tweets to prices on 'date'. how='inner' is the
# merge default, so rows without a matching date on the other side are left out.
FinalTweet['date'] = FinalTweet['date'].astype('datetime64[ns]')
priceBTC['date'] = priceBTC['date'].astype('datetime64[ns]')

sentimentPrice = pd.merge(FinalTweet, priceBTC, how='inner', on='date')

In [93]:
# Remove the columns not wanted in the merged sentiment/price table.
sentimentPrice = sentimentPrice.drop(
    columns=['user_location', 'year', 'user_verified'],
)

In [96]:
# Write the merged sentiment/price table to disk as CSV.
sentimentPrice.to_csv(
    path_or_buf="sentiment_Price.csv",
)