# Twitter Sentiment Analysis for top 3 cryptocurrencies

In [1]:
import os
import glob
import re
import plotly
import datetime
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS


from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline
sns.set_style("darkgrid")

In [2]:
class cd:
    """Context manager for changing the current working directory"""
    def __init__(self, newPath):
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)

data_path='/home/jishnu/Documents/ISB/Term3/practicum/workspace/data_collection/data/daily_data'
with cd(data_path):
    bitcoin_tweets_list = list(glob.iglob('bitcoin_tweets_*_processed'))
    ethereum_tweets_list = list(glob.iglob('ethereum_tweets_*_processed'))
    ripple_tweets_list = list(glob.iglob('ripple_tweets_*_processed'))
    
dates_list = [re.search('[\w]+_tweets_([\d]*)_processed',item).group(1) for item in bitcoin_tweets_list]
bitcoin_tweets_list = [os.path.join(data_path,item) for item in bitcoin_tweets_list]
ethereum_tweets_list = [os.path.join(data_path,item) for item in ethereum_tweets_list]
ripple_tweets_list = [os.path.join(data_path,item) for item in ripple_tweets_list]

currency_list = []

for date in dates_list:
    bitcoin_tweets_file = os.path.join(data_path,'bitcoin_tweets_{0}_processed'.format(date))
    ethereum_tweets_file = os.path.join(data_path,'ethereum_tweets_{0}_processed'.format(date))
    ripple_tweets_file = os.path.join(data_path,'ripple_tweets_{0}_processed'.format(date))
    # Creating dataframes
    bitcoin_tweets = pd.read_table(bitcoin_tweets_file,sep='###!###',
                       header=None,names=['tweet_no','user','date','location','tweet'],
                       parse_dates=['date'],engine='python')
    ethereum_tweets = pd.read_table(ethereum_tweets_file,sep='###!###',
                       header=None,names=['tweet_no','user','date','location','tweet'],
                       parse_dates=['date'],engine='python')
    ripple_tweets = pd.read_table(ripple_tweets_file,sep='###!###',
                       header=None,names=['tweet_no','user','date','location','tweet'],
                       parse_dates=['date'],engine='python')
    # Filling NAs in tweet column
    bitcoin_tweets['tweet'].fillna('NA NA',inplace=True)
    ethereum_tweets['tweet'].fillna('NA NA',inplace=True)
    ripple_tweets['tweet'].fillna('NA NA',inplace=True)
    # Creating container for holding all the three dataframes
    tweets_df_dict = {'bitcoin' : bitcoin_tweets,'ethereum' : ethereum_tweets,'ripple' : ripple_tweets}
    # Preprocessing Steps
    for currency in tweets_df_dict:
        tweets_df_dict[currency]['tweetos'] = ''
        tweets_df_dict[currency]['tweet_clean'] = ''
        #add tweetos first part
        for i in range(len(tweets_df_dict[currency]['tweet'])):
            try:
                tweets_df_dict[currency].iloc[i,5] = tweets_df_dict[currency]['tweet'].str.split(' ')[i][1]
            except AttributeError:    
                tweets_df_dict[currency].iloc[i,5] = 'other'
        #Preprocessing tweetos. select tweetos contains 'b@'
        for i in range(len(tweets_df_dict[currency]['tweet'])):
            if tweets_df_dict[currency]['tweetos'].str.contains('@')[i]  == False:
                tweets_df_dict[currency].iloc[i,5] = 'other'
        # remove URLs, RTs, and twitter handles
        for i in range(len(tweets_df_dict[currency]['tweet'])):
            tweets_df_dict[currency].iloc[i,6] = " ".join([word for word in tweets_df_dict[currency]['tweet'][i].split()
                                        if 'http' not in word and '@' not in word and '<' not in word])
        tweets_df_dict[currency]['tweet_clean'] = tweets_df_dict[currency]['tweet_clean'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', x.lower()))
        tweets_df_dict[currency]['tweet_clean'] = tweets_df_dict[currency]['tweet_clean'].apply(lambda x: re.sub('  ', ' ', x))
    # Performing Sentiment Analysis for each currency
    for currency in tweets_df_dict:
        tweets_df_dict[currency]['text_lem'] = [''.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in tweets_df_dict[currency]['tweet_clean']]       
        vectorizer = TfidfVectorizer(max_df=0.5,max_features=10000,min_df=10,stop_words='english',use_idf=True)
        X = vectorizer.fit_transform(tweets_df_dict[currency]['text_lem'].str.upper())
        sid = SentimentIntensityAnalyzer()
        tweets_df_dict[currency]['sentiment_compound_polarity']=tweets_df_dict[currency].text_lem.apply(lambda x:sid.polarity_scores(x)['compound'])
        tweets_df_dict[currency]['sentiment_neutral']=tweets_df_dict[currency].text_lem.apply(lambda x:sid.polarity_scores(x)['neu'])
        tweets_df_dict[currency]['sentiment_negative']=tweets_df_dict[currency].text_lem.apply(lambda x:sid.polarity_scores(x)['neg'])
        tweets_df_dict[currency]['sentiment_pos']=tweets_df_dict[currency].text_lem.apply(lambda x:sid.polarity_scores(x)['pos'])
        tweets_df_dict[currency]['sentiment_type']=''
        tweets_df_dict[currency].loc[tweets_df_dict[currency].sentiment_compound_polarity>0,'sentiment_type']='POSITIVE'
        tweets_df_dict[currency].loc[tweets_df_dict[currency].sentiment_compound_polarity==0,'sentiment_type']='NEUTRAL'
        tweets_df_dict[currency].loc[tweets_df_dict[currency].sentiment_compound_polarity<0,'sentiment_type']='NEGATIVE'
    # Capturing sentiment scores for each currency and date combination
    for currency in tweets_df_dict:
        tweets_sentiment = tweets_df_dict[currency].groupby(['sentiment_type'])['sentiment_neutral'].count()
        tweets_sentiment = pd.DataFrame(tweets_sentiment).reset_index()
        tweets_sentiment.columns=['sentiment','count']
        sum_count = tweets_sentiment['count'].sum()
        tweets_sentiment['percent'] = tweets_sentiment.apply(lambda row:np.round(row['count']/sum_count,2),axis=1)
        row = dict()
        #row['Currency'] = currency
        row['Date'] = date
        #row['percent_pos'] = tweets_sentiment.loc[tweets_sentiment['sentiment'] == 'POSITIVE']['percent'].values[0]
        #row['percent_neu'] = tweets_sentiment.loc[tweets_sentiment['sentiment'] == 'NEUTRAL']['percent'].values[0]
        #row['percent_neg'] = tweets_sentiment.loc[tweets_sentiment['sentiment'] == 'NEGATIVE']['percent'].values[0]
        if currency == 'bitcoin':
            row['bitcoin_percent_neg'] = tweets_sentiment.loc[tweets_sentiment['sentiment'] == 'NEGATIVE']['percent'].values[0]
        elif currency == 'ethereum':
            row['ethereum_percent_neg'] = tweets_sentiment.loc[tweets_sentiment['sentiment'] == 'NEGATIVE']['percent'].values[0]
        else:
            row['ripple_percent_neg'] = tweets_sentiment.loc[tweets_sentiment['sentiment'] == 'NEGATIVE']['percent'].values[0]
        currency_list.append(row)   
    
currency_sentiments = pd.DataFrame(currency_list)

currency_sentiments['Date'] = pd.to_datetime(currency_sentiments['Date'])
currency_sentiments['Date'] = currency_sentiments['Date'].apply(lambda x:x.to_pydatetime())

date_grouped = currency_sentiments.groupby(by='Date')
currency_sentiments = date_grouped.agg({'bitcoin_percent_neg':max,'ethereum_percent_neg':max,'ripple_percent_neg':max}).reset_index()

currency_sentiments

Unnamed: 0,Date,bitcoin_percent_neg,ethereum_percent_neg,ripple_percent_neg
0,2018-06-21,0.23,0.15,0.14
1,2018-06-22,0.21,0.03,0.07
2,2018-06-23,0.2,0.04,0.07
3,2018-06-24,0.17,0.07,0.15
4,2018-06-26,0.2,0.1,0.12
5,2018-06-27,0.14,0.06,0.11


In [3]:
bitcoin_neg_data = go.Scatter(x=currency_sentiments['Date'],
                              y=currency_sentiments['bitcoin_percent_neg'],
                              name='Bitcoin Negative Percent')
ethereum_neg_data = go.Scatter(x=currency_sentiments['Date'],
                              y=currency_sentiments['ethereum_percent_neg'],
                              name='Ethereum Negative Percent')
ripple_neg_data = go.Scatter(x=currency_sentiments['Date'],
                              y=currency_sentiments['ripple_percent_neg'],
                              name='Ripple Negative Percent')
data = [bitcoin_neg_data,ethereum_neg_data,ripple_neg_data]
layout = go.Layout(width=1000,height=500,
    title='Daily variation of percentage negative sentiment for top 3 cryptocurrencies',
    xaxis=dict(title='Date',titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='Percentage negative sentiments',titlefont=dict( family='Courier New, monospace',size=18,color='#7f7f7f'))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

## Daily Price Variation for Top 3 Cryptocurrencies

In [4]:
with cd(data_path):
    ticker_data_list = list(glob.iglob('ticker_data_*'))
    
dates_list = [re.search('ticker_data_([\d]*).csv',item).group(1) for item in ticker_data_list]
ticker_data_list = [os.path.join(data_path,item) for item in ticker_data_list]

In [5]:
currency_list = []
for date in dates_list:
    ticker_file = os.path.join(data_path,'ticker_data_{0}.csv'.format(date))
    ticker_df = pd.read_csv(ticker_file,
                      usecols=['name','quotes.USD.market_cap','quotes.USD.percent_change_24h','quotes.USD.price','quotes.USD.volume_24h'])
    currency_list.append({'Date' : date,
                          'bitcoin_price_24h' : ticker_df.loc[ticker_df['name'] == 'Bitcoin']['quotes.USD.percent_change_24h'].values[0],
                          'bitcoin_price' : ticker_df.loc[ticker_df['name'] == 'Bitcoin']['quotes.USD.price'].values[0],
                          'ethereum_price_24h' : ticker_df.loc[ticker_df['name'] == 'Ethereum']['quotes.USD.percent_change_24h'].values[0],
                          'ethereum_price' : ticker_df.loc[ticker_df['name'] == 'Ethereum']['quotes.USD.price'].values[0],
                          'ripple_price_24h' : ticker_df.loc[ticker_df['name'] == 'Ripple']['quotes.USD.percent_change_24h'].values[0],
                          'ripple_price' : ticker_df.loc[ticker_df['name'] == 'Ripple']['quotes.USD.price'].values[0]})
currency_daily_prices = pd.DataFrame(currency_list)
currency_daily_prices['Date'] = pd.to_datetime(currency_daily_prices['Date'])
currency_daily_prices['Date'] = currency_daily_prices['Date'].apply(lambda x:x.to_pydatetime())
currency_daily_prices = currency_daily_prices.sort_values(by='Date')

In [6]:
bitcoin_price_data = go.Scatter(x=currency_daily_prices['Date'],
                              y=currency_daily_prices['bitcoin_price'],
                              name='Bitcoin Price')
ethereum_price_data = go.Scatter(x=currency_daily_prices['Date'],
                              y=currency_daily_prices['ethereum_price'],
                              name='Ethereum Price')
ripple_price_data = go.Scatter(x=currency_daily_prices['Date'],
                              y=currency_daily_prices['ripple_price'],
                              name='Ripple Price')
data = [bitcoin_price_data,ethereum_price_data,ripple_price_data]
layout = go.Layout(width=1000,height=500,
    title='Daily variation of prices for top 3 cryptocurrencies',
    xaxis=dict(title='Date',titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='Price(USD)',titlefont=dict( family='Courier New, monospace',size=18,color='#7f7f7f'))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [7]:
ethereum_price_data = go.Scatter(x=currency_daily_prices['Date'],
                              y=currency_daily_prices['ethereum_price'],
                              name='Ethereum Price')
ripple_price_data = go.Scatter(x=currency_daily_prices['Date'],
                              y=currency_daily_prices['ripple_price'],
                              name='Ripple Price')
data = [ethereum_price_data,ripple_price_data]
layout = go.Layout(width=1000,height=500,
    title='Daily variation of prices for top 3 cryptocurrencies',
    xaxis=dict(title='Date',titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='Price(USD)',titlefont=dict( family='Courier New, monospace',size=18,color='#7f7f7f'))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

## Plotting Price and Sentiment Trends

In [8]:
currency_sentiments

Unnamed: 0,Date,bitcoin_percent_neg,ethereum_percent_neg,ripple_percent_neg
0,2018-06-21,0.23,0.15,0.14
1,2018-06-22,0.21,0.03,0.07
2,2018-06-23,0.2,0.04,0.07
3,2018-06-24,0.17,0.07,0.15
4,2018-06-26,0.2,0.1,0.12
5,2018-06-27,0.14,0.06,0.11


In [9]:
currency_daily_prices

Unnamed: 0,Date,bitcoin_price,bitcoin_price_24h,ethereum_price,ethereum_price_24h,ripple_price,ripple_price_24h
4,2018-06-13,6375.15,-5.36,477.792,-7.45,0.534743,-6.6
1,2018-06-15,6549.4,1.23,494.053,3.09,0.54451,0.43
6,2018-06-16,6507.32,-1.64,499.021,-0.72,0.532037,-3.44
7,2018-06-17,6545.04,-0.42,499.069,-0.09,0.53535,-2.04
11,2018-06-18,6759.72,3.27,519.823,3.42,0.542901,2.52
3,2018-06-19,6716.51,3.78,516.749,3.93,0.536424,2.32
12,2018-06-20,6739.31,-0.17,532.993,2.64,0.542668,-0.09
0,2018-06-21,6732.97,-0.88,527.195,-1.94,0.532064,-1.74
10,2018-06-22,6173.73,-8.25,474.704,-10.12,0.498125,-6.52
5,2018-06-23,6079.35,-7.99,464.279,-8.99,0.482938,-7.48


In [10]:
currency_merged = currency_daily_prices.merge(currency_sentiments,how='left',on='Date')
currency_merged

Unnamed: 0,Date,bitcoin_price,bitcoin_price_24h,ethereum_price,ethereum_price_24h,ripple_price,ripple_price_24h,bitcoin_percent_neg,ethereum_percent_neg,ripple_percent_neg
0,2018-06-13,6375.15,-5.36,477.792,-7.45,0.534743,-6.6,,,
1,2018-06-15,6549.4,1.23,494.053,3.09,0.54451,0.43,,,
2,2018-06-16,6507.32,-1.64,499.021,-0.72,0.532037,-3.44,,,
3,2018-06-17,6545.04,-0.42,499.069,-0.09,0.53535,-2.04,,,
4,2018-06-18,6759.72,3.27,519.823,3.42,0.542901,2.52,,,
5,2018-06-19,6716.51,3.78,516.749,3.93,0.536424,2.32,,,
6,2018-06-20,6739.31,-0.17,532.993,2.64,0.542668,-0.09,,,
7,2018-06-21,6732.97,-0.88,527.195,-1.94,0.532064,-1.74,0.23,0.15,0.14
8,2018-06-22,6173.73,-8.25,474.704,-10.12,0.498125,-6.52,0.21,0.03,0.07
9,2018-06-23,6079.35,-7.99,464.279,-8.99,0.482938,-7.48,0.2,0.04,0.07


In [11]:
bitcoin_price_data = go.Scatter(x=currency_merged['Date'],
                              y=currency_merged['bitcoin_price_24h'],
                              name='Bitcoin Price')
bitcoin_sentiment_data = go.Scatter(x=currency_merged['Date'],
                              y=currency_merged['bitcoin_percent_neg']*100,
                              name='Bitcoin Percent Negative Sentiment')
data = [bitcoin_price_data,bitcoin_sentiment_data]
layout = go.Layout(width=1000,height=500,
    title='Daily Price \ Negative Sentiment Variation for Bitcoin',
    xaxis=dict(title='Date',titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='Price(USD) / Negative Sentiment',titlefont=dict( family='Courier New, monospace',size=18,color='#7f7f7f'))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [12]:
ethereum_price_data = go.Scatter(x=currency_merged['Date'],
                              y=currency_merged['ethereum_price_24h'],
                              name='Ethereum Price')
ethereum_sentiment_data = go.Scatter(x=currency_merged['Date'],
                              y=currency_merged['ethereum_percent_neg']*100,
                              name='Ethereum Percent Negative Sentiment')
data = [ethereum_price_data,ethereum_sentiment_data]
layout = go.Layout(width=1000,height=500,
    title='Daily Price \ Negative Sentiment Variation for Ethereum',
    xaxis=dict(title='Date',titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='Price(USD) / Negative Sentiment',titlefont=dict( family='Courier New, monospace',size=18,color='#7f7f7f'))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

In [14]:
ripple_price_data = go.Scatter(x=currency_merged['Date'],
                              y=currency_merged['ripple_price_24h'],
                              name='Ripple Price')
ripple_sentiment_data = go.Scatter(x=currency_merged['Date'],
                              y=currency_merged['ripple_percent_neg']*100,
                              name='Ripple Percent Negative Sentiment')
data = [ripple_price_data,ripple_sentiment_data]
layout = go.Layout(width=1000,height=500,
    title='Daily Price \ Negative Sentiment Variation for Ripple',
    xaxis=dict(title='Date',titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')),
    yaxis=dict(title='Price(USD) / Negative Sentiment',titlefont=dict( family='Courier New, monospace',size=18,color='#7f7f7f'))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)