# Data-X Project

In [63]:
import csv
import re
import numpy as np
import pandas as pd 
import seaborn as sns
import gensim
import spacy
import nltk
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
ps = nltk.PorterStemmer()
import matplotlib.pyplot as plt
from matplotlib import pyplot
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package wordnet to /srv/app/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Sentiment Analysis for Cryptocurrency Hashtag on Twitter

We will use the VADER (Valence Aware Dictionary and sEntiment Reasoner) lexicon to analyze the sentiment of Cryptocurrency-related tweets. 

VADER is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media which is great for our usage. 

You can find more about this at https://github.com/cjhutto/vaderSentiment

### 1.Read the files into pandas dataframe 



In [75]:
### Tweets 1 consists of all the cryptocurrency related tweets from Oct 20 to Oct 27, 2018. 
### Tweets 2 consists of all the cryptocurrency related tweets from Nov 1 to Nov 9, 2018

In [76]:
tweets1 = pd.read_csv("bitcoin.csv")[['Author', 'Text', 'Retweet_Count', 'Timestamp']]
tweets1.head()

Unnamed: 0,Author,Text,Retweet_Count,Timestamp
0,ewin74698177,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:25:41
1,thegustians,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:10:20
2,frg_x,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:07:04
3,TechniCraftCZ,RT @CryptoPepes: CryptoPepes just soft launche...,93,2018-10-27 23:02:48
4,victordorofeev1,RT @SolarisCoin: Take a peek @ https://t.co/W8...,49,2018-10-27 23:01:47


In [77]:
tweets2 = pd.read_csv('bitcointweets.csv')[['Author', 'Text', 'Retweet_Count', 'Timestamp']]
tweets2.head()

Unnamed: 0,Author,Text,Retweet_Count,Timestamp
0,Sharp_or_Parish,Booooommmmmmm!\r\n\r\n4th POD HAMMER BOMB to t...,0,2018-11-09 06:13:30
1,T45Investments,5.00 GMT Update!\r\n#trading #futures #commodi...,0,2018-11-09 06:03:09
2,ArbingTool,#DASH\r\nBuy at #Koinim and sell at #YoBit. Ra...,0,2018-11-09 06:01:30
3,ArbingTool,#ETH\r\nBuy at #Bitstamp and sell at #Bittrex....,0,2018-11-09 06:01:21
4,ArbingTool,#XRP\r\nBuy at #Bitstamp and sell at #Bitfinex...,0,2018-11-09 06:01:12


### 2. Data cleaning function: 
Remove urls, mentions, punctuations and stopwords

In [78]:
class CleanText():
    
    def __init__(self, input_text):
        self.input_text = input_text
    
    def remove(self):
        remove_mention = re.sub(r'@\w+', '', self.input_text)
        remove_url = re.sub(r'http.?://[^\s]+[\s]?', '', remove_mention)
        # By compressing the underscore, the emoji is kept as one word
        remove_emoji = remove_url.replace('_','')
        remove_punctuation = re.sub('[^A-Za-z0-9_\s]', '', remove_emoji)
        lowercase = remove_punctuation.lower()
        remove_n = re.sub('[\n\r]', '', lowercase)
        remove_num = re.sub('[[:digit:]]', '', remove_n)
        
        return remove_num.replace('rt', '')

In [79]:
clean_tweet1 = []
for tweet in tweets1['Text']:
    clean_tweet1.append(CleanText(tweet).remove())
    
clean_tweets1 = tweets1.drop(['Text'], axis = 1)
clean_tweets1['Clean_Text'] = clean_tweet1
clean_tweets1.head()

clean_tweet2 = []
for tweet in tweets2['Text']:
    clean_tweet2.append(CleanText(tweet).remove())
    
clean_tweets2 = tweets2.drop(['Text'], axis = 1)
clean_tweets2['Clean_Text'] = clean_tweet2
clean_tweets2.head()

Unnamed: 0,Author,Retweet_Count,Timestamp,Clean_Text
0,Sharp_or_Parish,0,2018-11-09 06:13:30,booooommmmmmm4th pod hammer bomb to the window...
1,T45Investments,0,2018-11-09 06:03:09,500 gmt updatetrading futures commodities euru...
2,ArbingTool,0,2018-11-09 06:01:30,dashbuy at koinim and sell at yobit ratio 116b...
3,ArbingTool,0,2018-11-09 06:01:21,ethbuy at bitstamp and sell at bittrex ratio 0...
4,ArbingTool,0,2018-11-09 06:01:12,xrpbuy at bitstamp and sell at bitfinex ratio ...


In [95]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

clean_tweets1['tokenized_clean_text'] = clean_tweets1['Clean_Text'].apply(lambda x: tokenize(x.lower()))
clean_tweets1.head()

Unnamed: 0,Author,Retweet_Count,Timestamp,Clean_Text,tokenized_clean_text,compound
0,ewin74698177,49,2018-10-27 23:25:41,take a peek would you like to test it soon ...,"[, take, a, peek, would, you, like, to, test, ...",0.8316
1,thegustians,49,2018-10-27 23:10:20,take a peek would you like to test it soon ...,"[, take, a, peek, would, you, like, to, test, ...",0.8316
2,frg_x,49,2018-10-27 23:07:04,take a peek would you like to test it soon ...,"[, take, a, peek, would, you, like, to, test, ...",0.8316
3,TechniCraftCZ,93,2018-10-27 23:02:48,cryptopepes just soft launched on ethereum m...,"[, cryptopepes, just, soft, launched, on, ethe...",0.128
4,victordorofeev1,49,2018-10-27 23:01:47,take a peek would you like to test it soon ...,"[, take, a, peek, would, you, like, to, test, ...",0.8316


### 3. Sentiment analysis

We will look at the compound score for each sentence.

In [96]:
analyzer = SentimentIntensityAnalyzer()

score1 = []
for sentence in clean_tweets1['Clean_Text']:
    score1.append(analyzer.polarity_scores(sentence))

sentiment1 = []
for each in score1:
    sentiment1.append(each['compound'])
clean_tweets1['compound'] = sentiment1

score2 = []
for sentence in clean_tweets2['Clean_Text']:
    score2.append(analyzer.polarity_scores(sentence))

sentiment2 = []
for each in score2:
    sentiment2.append(each['compound'])
clean_tweets2['compound'] = sentiment2

The threshold for positive comment is >= 0.05, for neutral comment is > - 0.05 and for negative comment is <= -0.05.

We will classify the tweets using this threshold. 

In [97]:
comment_class = []
for score in clean_tweets1['compound']:
    if score <= -0.05:
        comment_class.append('Negative')
    elif score >= 0.05:
        comment_class.append('Positive')
    else:
        comment_class.append('Neutral')
clean_tweets1['type'] = comment_class

comment_class = []
for score in clean_tweets2['compound']:
    if score <= -0.05:
        comment_class.append('Negative')
    elif score >= 0.05:
        comment_class.append('Positive')
    else:
        comment_class.append('Neutral')
clean_tweets2['type'] = comment_class

### 4. Compute length of each tweets and write both of them to csv file

In [98]:
clean_tweets1['text_len'] = [len(text) for text in clean_tweets1['Clean_Text']]
x1 = clean_tweets1[['compound', 'text_len']]
x1.to_csv('x1.csv')

In [99]:
clean_tweets2['text_len'] = [len(text) for text in clean_tweets2['Clean_Text']]
x2 = clean_tweets2[['compound', 'text_len']]
x2.to_csv('x2.csv')

In [100]:
x1

Unnamed: 0,compound,text_len
0,0.8316,92
1,0.8316,92
2,0.8316,92
3,0.1280,95
4,0.8316,92
5,0.0000,122
6,0.0000,100
7,0.0000,95
8,0.8316,92
9,0.8316,92


In [101]:
x2

Unnamed: 0,compound,text_len
0,-0.4939,89
1,0.0000,98
2,0.0000,98
3,0.0000,100
4,0.0000,100
5,0.0000,102
6,0.0000,97
7,-0.2263,111
8,0.0000,77
9,0.0000,82


### 5. Statistical Analysis

In [None]:
comp_mean1 = np.mean(clean_tweets1['compound'])
comp_mean1
#slightly positive sentiment on average

In [None]:
comp_mean2 = np.mean(clean_tweets2['compound'])
comp_mean2

In [None]:
sd1 = np.std(clean_tweets1['compound'])
sd1

In [None]:
sd2 = np.std(clean_tweets2['compound'])
sd2

### Since R studio has better packages for statistical analysis, I will perform the second part of my analysis in R and please refer to the R file. 