# Data-X Project

In [1]:
import csv
import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords;

## Sentiment Analysis for Cryptocurrency Hashtag on Twitter

We will use the VADER (Valence Aware Dictionary and sEntiment Reasoner) lexicon to analyze the sentiment of Cryptocurrency-related tweets. VADER is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media which is great for our usage.

### 1.Read the files into pandas dataframe 



In [2]:
tweets = pd.read_csv("bitcoin.csv")[['Author', 'Text', 'Retweet_Count', 'Timestamp']]
tweets.head()

Unnamed: 0,Author,Text,Retweet_Count,Timestamp
0,ewin74698177,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:25:41
1,thegustians,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:10:20
2,frg_x,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:07:04
3,TechniCraftCZ,RT @CryptoPepes: CryptoPepes just soft launche...,93,2018-10-27 23:02:48
4,victordorofeev1,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:01:47


In [3]:
pol = pd.read_table('vader_lexicon.txt', header=None).set_index(0).iloc[:,0]
d = {'polarity': pol}
sent = pd.DataFrame(data = d)
sent.head()

Unnamed: 0_level_0,polarity
0,Unnamed: 1_level_1
$:,-1.5
%),-0.4
%-),-1.5
&-:,-0.4
&:,-0.7


### 2. Data cleaning function: 
Remove urls, mentions, punctuations and stopwords

In [4]:
class CleanText():
    
    def __init__(self, input_text):
        self.input_text = input_text
    
    def remove(self):
        remove_mention = re.sub(r'@\w+', '', self.input_text)
        remove_url = re.sub(r'http.?://[^\s]+[\s]?', '', remove_mention)
        # By compressing the underscore, the emoji is kept as one word
        remove_emoji = remove_url.replace('_','')
        remove_punctuation = re.sub('[^A-Za-z0-9_\s]', '', remove_emoji)
        lowercase = remove_punctuation.lower()
        remove_n = re.sub('[\n\r]', '', lowercase)
        
        return remove_n.replace('rt', '')


In [5]:
clean_tweet = []
for tweet in tweets['Text']:
    clean_tweet.append(CleanText(tweet).remove())

In [6]:
clean_tweets = tweets.drop(['Text'], axis = 1)
clean_tweets['Clean_Text'] = clean_tweet
clean_tweets.head()

Unnamed: 0,Author,Retweet_Count,Timestamp,Clean_Text
0,ewin74698177,49,2018-10-27 23:25:41,take a peek would you like to test it soon ...
1,thegustians,49,2018-10-27 23:10:20,take a peek would you like to test it soon ...
2,frg_x,49,2018-10-27 23:07:04,take a peek would you like to test it soon ...
3,TechniCraftCZ,93,2018-10-27 23:02:48,cryptopepes just soft launched on ethereum m...
4,victordorofeev1,49,2018-10-27 23:01:47,take a peek would you like to test it soon ...


### 3. Tidy format

We can see that there are a lot of duplicate texts, which are the retweets. We will remove them and then perform sentiment analysis. 

Convert the tweets into what's called a tidy format to make the sentiments easier to calculate

In [33]:
d = list(clean_tweets['Clean_Text'].unique())
unique = pd.DataFrame(d)
unique.columns = ['Clean_Text']
unique.head()

Unnamed: 0,Clean_Text
0,take a peek would you like to test it soon ...
1,cryptopepes just soft launched on ethereum m...
2,fiiicoin will be debut in coinegg on at 1300...
3,cryptocurrency price analysis for bitsend bsd ...
4,sold zrxbtc binance price 000012225 time 2...


In [8]:
tidy = pd.DataFrame(unique['Clean_Text'].str.split(expand = True).stack().reset_index(level=1,))
tidy.columns = ['num', 'word']
tidy.head()

Unnamed: 0,num,word
0,0,take
0,1,a
0,2,peek
0,3,would
0,4,you


In [9]:
sent.reset_index(level=0, inplace=True)

In [10]:
sent.columns = ['word', 'polarity']

In [11]:
polarity = pd.merge(tidy.reset_index(level = -1), sent, on = 'word').groupby('index')[['polarity']].sum()
polarity.head()

Unnamed: 0_level_0,polarity
index,Unnamed: 1_level_1
0,5.8
1,0.5
6,1.3
10,0.8
11,0.8


In [12]:
clean_tweets['polarity'] = polarity['polarity']
clean_tweets['polarity'] = clean_tweets['polarity'].replace(np.float64('nan'), 0)

In [13]:
np.mean(clean_tweets['polarity'])
#positive sentiment(> 0.05) in the past one week 

0.10191458026509584

### 4. Most negative and most positive tweets

In [14]:
print('Most negative tweets:')
for t in clean_tweets.sort_values('polarity').head()['Clean_Text']:
    print('\n  ', t)

Most negative tweets:

   bitcoin market price update xbtgbp  last price 499800  24hour low 490400 high 500000 volume 1708945 xbt

   xembuy at koineks and sell at livecoin ratio 100buy at koineks and sell at yobit ratio 1427buy at 

   liquidated bitmex long on xbtusd 639400   920  0 of avg  20181026 153540bitcoin

     xbt bitmex btc bitcoinas the whole world await next btc 000 action i spotted 3 different patterns in play right now on

     take a peek  would you like to test it soon and win some xlr1 like and retween this tweet2


In [15]:
print('Most positive tweets:')
for t in clean_tweets.sort_values('polarity', ascending=False).head()['Clean_Text']:
    print('\n  ', t)

Most positive tweets:

     were here today to raise awareness about bitcoin and  dropped by for an interview with our cfocatch us until 1400

   xrpbuy at bitexen and sell at poloniex ratio 100bitcoin arbitrage arbitraj arbingtool 

   btc 648896 up 020 000 in the last hour bitcoin bitsma

   bchbuy at bitstamp and sell at yobit ratio 479buy at koinim and sell at yobit ratio 149buy at 

     ubex  will be listed on lbanktime 1600 oct 22 gmt8buy buy buyubex ai btc bitcoinnews binance alt alt 


### Tweets that had the most number of retweets

In [52]:
print(clean_tweets.sort_values('Retweet_Count', ascending = False)['Clean_Text'].unique()[0:5])

[ '   2000 stake amp primedice giveaway 11  250 lucky winners will receive 4 and 2 very lucky winners will receive 500  ent'
 '  i wish i had kept my 1700 btc  006 instead of selling them at 030 now that theyre 800 bitcoin'
 '  cryptopepes just soft launched on ethereum mainnet  cryptopepes rat race staed     cryptomini'
 '  send the ravens  lucre token presale will be launched on oct 15th at 000000 utc lucre lcr tokensale ico cryptocu'
 '  these are my predictions for peak prices in 2019bitcoin 85000vergecurrency 175ethereum 4000litecoin 110']
