Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Initialize Spark

In [2]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark.sql import SparkSession
import random

spark = SparkSession.builder.appName("YourTest").master("local[2]").config('spark.ui.port', random.randrange(4000,5000)).getOrCreate()

Initialize Tweepy

In [4]:
# Install Libraries
!pip install textblob
!pip install tweepy
!pip install pycountry
!pip install langdetect
!pip install twython
!pip install autocorrect

Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 5.2 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (PEP 517) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681845 sha256=f00a60fe2927c191659cd9d9f0dcd869c1bebf4f0e85a18e034ec4f29e2a61a6
  Stored in directory: /root/.cache/pip/wheels/0e/06/e8/7ee176e95ea9a8a8c3b3afcb1869f20adbd42413d4611c6eb4
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 4.7 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdete

In [5]:
from textblob import TextBlob
import sys
import tweepy
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import nltk
import pycountry
import re
import string
import json
import socket
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [24]:
tweets_raw = spark.read.csv("/content/drive/MyDrive/tweets.csv",sep=';',header=True,inferSchema=True,multiLine=True).drop("id").drop("user").drop("fullname").drop("url").drop("replies").drop("retweets").dropna(how="any")

Import Datasets

In [7]:
btc_prices_raw = spark.read.csv("/content/drive/MyDrive/BTCUSD_daily.csv",sep=',',header=True,inferSchema=True).drop("unix").drop("symbol").drop("high").drop("low").drop("Volume BTC").drop("Volume USD").dropna(how="any")

In [8]:
btc_prices_raw.printSchema()
btc_prices_raw.show(10)


root
 |-- date: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)

+-------------------+--------+--------+
|               date|    open|   close|
+-------------------+--------+--------+
|2022-03-31 00:00:00|47086.07|47173.36|
|2022-03-30 00:00:00|47459.03|47068.08|
|2022-03-29 00:00:00|47152.38|47459.03|
|2022-03-28 00:00:00|46854.96|47152.38|
|2022-03-27 00:00:00|44553.24|46864.39|
|2022-03-26 00:00:00|44340.49|44535.65|
|2022-03-25 00:00:00|44025.99| 44320.6|
|2022-03-24 00:00:00|42912.21|44025.99|
|2022-03-23 00:00:00|42393.62|42925.41|
|2022-03-22 00:00:00|41018.36|42393.41|
+-------------------+--------+--------+
only showing top 10 rows



In [9]:
tweets_raw.printSchema()
#tweets_raw.show(20)

root
 |-- timestamp: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- text: string (nullable = true)



Convert BTC prices to percent changes

In [77]:
def compute_percent_change(row):
  date = row[0].date().strftime("%Y-%m-%d")
  percent_change = (row[2] - row[1]) * 100 / row[1]
  return (date, "%.2f" % percent_change)

btc_daily_changes = btc_prices_raw.rdd \
                                  .map(compute_percent_change) \
                                  .toDF(["date","percent_change"])

In [78]:
btc_daily_changes.createOrReplaceTempView("btc_daily_changes")
btc_daily_changes.printSchema()
btc_daily_changes.show(10)

root
 |-- date: string (nullable = true)
 |-- percent_change: string (nullable = true)

+----------+--------------+
|      date|percent_change|
+----------+--------------+
|2022-03-31|          0.19|
|2022-03-30|         -0.82|
|2022-03-29|          0.65|
|2022-03-28|          0.63|
|2022-03-27|          5.19|
|2022-03-26|          0.44|
|2022-03-25|          0.67|
|2022-03-24|          2.60|
|2022-03-23|          1.25|
|2022-03-22|          3.35|
+----------+--------------+
only showing top 10 rows



In [12]:
import string
from pyspark.sql.functions import *
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.corpus import words
import re
from autocorrect import Speller
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [66]:
tokenizer = RegexpTokenizer(r"[\w']+") # tokenizer using regular expressions
spell = Speller(lang='en')
lemmatizer = WordNetLemmatizer() 
words = set(nltk.corpus.words.words())

def simple_tokenize(s):
    return re.findall(r"[a-z]+(?:'[a-z]+)?",s.lower())

def get_wordnet_pos(word):
    '''
    get_wordnet_pos(word) Maps POS tag for word (i.e. noun, adjective etc.) to be used in 
    lemmatization function
    get_wordnet_pos: String -> String
    '''
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def regex_clean(col):
    col = re.sub(r'http\S+', '', col)
    col = re.sub(r'[^a-zA-Z\s]', '', col, flags=re.UNICODE)
    col = re.sub(r'\b[a-zA-Z]\b', '', col)
    col = re.sub(r'\n', '', col)
    col = re.sub(r'\r', '', col)
    col = re.sub(r' +', ' ', col)
    col = re.sub(r'&amp', '', col)
    return col

def filter_minimum_like(x):
  try:
    likes = int(x[1])
  except:
    return False

  if likes < 100:
    return False
  else:
    return True

tweets_rdd = tweets_raw.rdd
tweets_rdd = tweets_rdd.filter(filter_minimum_like)
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], str(x[2]).lower()))
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], regex_clean(x[2])))
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], tokenizer.tokenize(x[2])))
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], [word for word in x[2] if word not in (stopwords.words('english'))]))
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], [lemmatizer.lemmatize(y, get_wordnet_pos(y)) for y in x[2]]))
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], [w for w in x[2] if len(w) > 2]))
tweets_rdd = tweets_rdd.map(lambda x : (x[0], x[1], [w for w in x[2] if w.lower() in words or not w.isalpha()]))
## Questionable, please invesitgae
tweets_rdd = tweets_rdd.filter(lambda x : len(x[2]) >= 3)
tweets_rdd = tweets_rdd.map(lambda x : (x[0], " ".join(x[2])))

In [54]:
tweets_rdd.take(10)

[('2019-05-27 08:13:06+00', 'price hit new high whats drive hypnotic rally'),
 ('2019-05-02 17:36:29+00', 'cab taxi ride ride'),
 ('2019-05-27 01:37:37+00', 'may pump bet season'),
 ('2019-05-26 20:57:45+00', 'hit high sudden parabolic swing'),
 ('2019-05-26 19:58:37+00',
  'really gaslighting entire thread question never settle know lot try debate criticize learn improve without jerk'),
 ('2019-05-27 01:57:40+00',
  'someone check seem come back vengeance ever since block twitter'),
 ('2019-05-27 00:02:39+00', 'keep eye prize bear get'),
 ('2019-05-27 11:33:39+00', 'new release hour ago development developer'),
 ('2019-05-27 03:29:34+00', 'thanks ruin target guy'),
 ('2019-05-27 11:18:11+00', 'bought dip breakout rally')]

In [67]:
tweets = tweets_rdd.toDF(["date", "text"])
tweets.printSchema()

root
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)



In [48]:
hello = tweets.collect()



Row(date='2019-05-27 01:37:37+00', likes='141', text='may pump bet season')

Convert tweets to scores

In [68]:
def compute_score(row):
  
  try:
    score = SentimentIntensityAnalyzer().polarity_scores(row[1])["compound"]
    if score == 0.0:
      return []
  except:
    return []

  try:
    date = datetime.strptime("{}00".format(row[0]), "%Y-%m-%d %H:%M:%S%z") \
              .date().strftime("%Y-%m-%d")
  except:
    return []

  try:
    custom_score = "%.5f" % score
  except:
    return []

  return [(date, custom_score)]

tweet_daily_scores = tweets.rdd \
                        .flatMap(compute_score) \
                        .toDF(["date","custom_score"]) \
                        .cache()

In [69]:
tweet_daily_scores.createOrReplaceTempView("tweet_daily_scores")
tweet_daily_scores.printSchema()
tweet_daily_scores.show(10)

root
 |-- date: string (nullable = true)
 |-- custom_score: string (nullable = true)

+----------+------------+
|      date|custom_score|
+----------+------------+
|2019-05-26|     0.32610|
|2019-05-27|    -0.44040|
|2019-05-27|     0.51060|
|2019-05-27|    -0.22630|
|2019-05-27|     0.42150|
|2019-05-25|     0.79640|
|2019-05-25|     0.91000|
|2019-05-03|     0.84420|
|2019-05-11|    -0.52160|
|2019-05-15|     0.79640|
+----------+------------+
only showing top 10 rows



Combine BTC daily changes with tweet scores

In [79]:
results = btc_daily_changes.join(
      tweet_daily_scores.groupBy("date").agg({"custom_score": "mean"}),
      on="date"
    ).withColumnRenamed("avg(custom_score)","average_sentiment") \
    .withColumnRenamed("percent_change","btc_percent_change") \
    .cache()
results.show(100)

+----------+------------------+-------------------+
|      date|btc_percent_change|  average_sentiment|
+----------+------------------+-------------------+
|2019-11-23|              0.61| 0.3383588235294118|
|2019-11-22|             -4.29|0.14146783625731005|
|2019-11-21|             -5.86| 0.2574916201117318|
|2019-11-20|             -0.44|0.37080331125827803|
|2019-11-19|             -0.68| 0.2568555555555556|
|2019-11-18|             -3.87| 0.3198695312500001|
|2019-11-17|              0.16| 0.3225184466019417|
|2019-11-16|              0.30|0.25103564356435637|
|2019-11-15|             -1.95|0.32181127819548855|
|2019-11-14|             -1.47| 0.3597206349206349|
|2019-11-13|             -0.70| 0.2978183333333334|
|2019-11-12|              1.07| 0.3562176923076923|
|2019-11-11|             -3.43| 0.3385616438356163|
|2019-11-10|              2.47| 0.2744973214285715|
|2019-11-09|              0.59| 0.2206500000000001|
|2019-11-08|             -4.80|0.33054761904761915|
|2019-11-07|

In [80]:
results_count = results.count()
print("Number of results:", results_count)
true_positive_count = results.filter("(btc_percent_change < 0 and average_sentiment < 0) or (btc_percent_change >= 0 and average_sentiment >= 0)").count()
print("Number of true positives:", true_positive_count)
print("Number of true negatives:", results_count - true_positive_count)
print("Accuracy: {}%".format("%.2f" % (true_positive_count * 100 / results_count)))

Number of results: 987
Number of true positives: 645
Number of true negatives: 342
Accuracy: 65.35%


In [None]:
from tweepy.streaming import StreamListener

class TweetsListener(StreamListener):
    def __init__(self, csocket):
        self.client_socket = csocket
    # we override the on_data() function in StreamListener
    def on_data(self, data):
        try:
            message = json.loads( data )
            print( message['text'].encode('utf-8') )
            self.client_socket.send( message['text'].encode('utf-8') )
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    def if_error(self, status):
        print(status)
        return True

In [None]:
def send_tweets(c_socket):
    auth = tweepy.auth.OAuthHandler(consumerKey, consumerSecret)
    auth.set_access_token(accessToken, accessTokenSecret)
    
    twitter_stream = tweepy.Stream(auth, TweetsListener(c_socket))
    twitter_stream.filter(track=['bitcoin'])

Dump tweets into .csv

In [None]:
# import nltk
# nltk.download('vader_lexicon')

# keyword = "bitcoin"
# tweets = tweepy.Cursor(api.search, q=keyword, wait_on_rate_limit=True).items()

# f = open("tweets.csv", "a")

# # Write header line
# # f.write("compound,created_at,retweet_count\n")

# for tweet in tweets:
#   if tweet.retweet_count < 100:
#     continue

#   score = SentimentIntensityAnalyzer().polarity_scores(tweet.text)
#   if score['compound'] == 0.0:
#     continue
  
#   f.write("{},{},{}\n".format(score['compound'], tweet.created_at, tweet.retweet_count))

# f.close()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


TweepError: ignored

Sentiment Analysis

In [None]:
import nltk
nltk.download('vader_lexicon')

#Sentiment Analysis
def percentage(part,whole):
 return 100 * float(part)/float(whole)

keyword = "bitcoin since:2020-04-02"
noOfTweet = 50
tweets = tweepy.Cursor(api.search, q=keyword).items(noOfTweet)
positive = 0
negative = 0
neutral = 0
polarity = 0
tweet_list = []
neutral_list = []
negative_list = []
positive_list = []

for tweet in tweets: 
  tweet_list.append(tweet.text)
  analysis = TextBlob(tweet.text)
  score = SentimentIntensityAnalyzer().polarity_scores(tweet.text)
  print(tweet.favorite_count) # favorite_count does not work
  print(tweet.retweet_count)
  print(score)
  neg = score['neg']
  neu = score['neu']
  pos = score['pos']
  comp = score['compound']
  polarity += analysis.sentiment.polarity
 
  if neg > pos:
    negative_list.append(tweet.text)
    negative += 1
  elif pos > neg:
    positive_list.append(tweet.text)
    positive += 1
  elif pos == neg:
    neutral_list.append(tweet.text)
    neutral += 1

positive = percentage(positive, noOfTweet)
negative = percentage(negative, noOfTweet)
neutral = percentage(neutral, noOfTweet)
polarity = percentage(polarity, noOfTweet)
positive = format(positive, '.1f')
negative = format(negative, '.1f')
neutral = format(neutral, '.1f')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
0
775
{'neg': 0.0, 'neu': 0.728, 'pos': 0.272, 'compound': 0.6808}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 0.776, 'pos': 0.224, 'compound': 0.5994}
0
156
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 0.352, 'pos': 0.648, 'compound': 0.9688}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
8594
{'neg': 0.0, 'neu': 0.651, 'pos': 0.349, 'compound': 0.9153}
0
389
{'neg': 0.061, 'neu': 0.647, 'pos': 0.292, 'compound': 0.7974}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 0.726, 'pos': 0.274, 'compound': 0.5267}
0
0
{'neg': 0.0, 'neu': 0.902, 'pos': 0.098, 'compound': 0.0772}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0

In [None]:
import nltk
nltk.download('vader_lexicon')

#Sentiment Analysis
def percentage(part,whole):
 return 100 * float(part)/float(whole)

keyword = "bitcoin"
noOfTweet = 50
# tweets = [" ".join(['today', 'isnt', 'lotst', 'worst'])]
tweets = ["today is the worst day"]
positive = 0
negative = 0
neutral = 0
polarity = 0
tweet_list = []
neutral_list = []
negative_list = []
positive_list = []

for tweet in tweets: 
  print(tweet)
  tweet_list.append(tweet)
  analysis = TextBlob(tweet)
  score = SentimentIntensityAnalyzer().polarity_scores(tweet)
  print(score)
  neg = score['neg']
  neu = score['neu']
  pos = score['pos']
  comp = score['compound']
  polarity += analysis.sentiment.polarity
 
  if neg > pos:
    negative_list.append(tweet)
    negative += 1
  elif pos > neg:
    positive_list.append(tweet)
    positive += 1
  elif pos == neg:
    neutral_list.append(tweet)
    neutral += 1

positive = percentage(positive, noOfTweet)
negative = percentage(negative, noOfTweet)
neutral = percentage(neutral, noOfTweet)
polarity = percentage(polarity, noOfTweet)
positive = format(positive, '.1f')
negative = format(negative, '.1f')
neutral = format(neutral, '.1f')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
today is the worst day
{'neg': 0.506, 'neu': 0.494, 'pos': 0.0, 'compound': -0.6249}


Spark Version

In [None]:
import nltk
nltk.download('vader_lexicon')

#Sentiment Analysis
def percentage(part,whole):
 return 100 * float(part)/float(whole)

keyword = "bitcoin since:2020-04-02"
noOfTweet = 50
tweets = tweepy.Cursor(api.search, q=keyword).items(noOfTweet)
positive = 0
negative = 0
neutral = 0
polarity = 0
tweet_list = []
neutral_list = []
negative_list = []
positive_list = []

for tweet in tweets: 
  tweet_list.append(tweet.text)
  analysis = TextBlob(tweet.text)
  score = SentimentIntensityAnalyzer().polarity_scores(tweet.text)
  print(tweet.created_at)
  print(score)
  neg = score['neg']
  neu = score['neu']
  pos = score['pos']
  comp = score['compound']
  polarity += analysis.sentiment.polarity
 
  if neg > pos:
    negative_list.append(tweet.text)
    negative += 1
  elif pos > neg:
    positive_list.append(tweet.text)
    positive += 1
  elif pos == neg:
    neutral_list.append(tweet.text)
    neutral += 1

positive = percentage(positive, noOfTweet)
negative = percentage(negative, noOfTweet)
neutral = percentage(neutral, noOfTweet)
polarity = percentage(polarity, noOfTweet)
positive = format(positive, '.1f')
negative = format(negative, '.1f')
neutral = format(neutral, '.1f')

In [None]:
#Number of Tweets (Total, Positive, Negative, Neutral)
tweet_list = pd.DataFrame(tweet_list)
neutral_list = pd.DataFrame(neutral_list)
negative_list = pd.DataFrame(negative_list)
positive_list = pd.DataFrame(positive_list)
print("total number: ",len(tweet_list))
print("positive number: ",len(positive_list))
print("negative number: ", len(negative_list))
print("neutral number: ",len(neutral_list))

total number:  50
positive number:  15
negative number:  10
neutral number:  25


In [None]:
tweet_list

Unnamed: 0,0
0,RT @Jayecane: I badly need to send 8 people mo...
1,RT @Jayecane: I badly need to send 8 people mo...
2,@MichellePhan @CashApp $RBaiZa #Bitcoin #bitco...
3,RT @Jayecane: I badly need to send 8 people mo...
4,RT @Coinimparator: #Coinimparator ve ailesi yi...
5,RT @Jayecane: I badly need to send 8 people mo...
6,"Bitcoin and Gold have gone up a bit, #xrp has ..."
7,RT @loopstarter: Take the power of the collect...
8,The strongest bullish signal has broken out fo...
9,Now that I agree with #Bitcoin you can’t chang...


In [None]:
api.search()

TweepError: ignored

In [None]:
for tweet in tweets:
 print(tweet.text)

RT @PolandYielder: TUSD is coming to Yield App🚀🚀🚀💎💎💎
Earn up to 14% p.a. 💪💪💪🪙🪙🪙📈📈📈
https://t.co/F1OBcHuym3
@YieldApp @YieldAppPoland #yield…
RT @Stray_cats_BSV: #StrayCats =
@Biarritz82 +
@LCarrion80 aka @BlogueroDigital;
quienes además
de compartir vida, compartimos
pasiones com…
RT @ParaBorsaCrypto: Endeksler ve #Bitcoin 'de hareketlilik başladı.
RT @AirDropTR_EN: 🔥HEDIYE  ZAMANI 🌟

3 KİŞİYE 250'₺ TOPLAM 750'₺

 RT yap

Takip et

Süre Cumartesi 21.00 ⏳kadar

Bol şans dostlar.

#BTC …
RT @gladstein: “Decentralizing both finance and the internet would offer a long-overdue counterweight to the very concentrated power and we…
RT @LeCoinBit: Instead of predicting, try ENGAGING.

The world is depressing when you’re not part of it. Try person… https://t.co/Ynv4zxHwf6
RT @zam_zach711: ALL PRICES ARE BACK TO $50.00 STARTING TODAY (MARCH 10-)!
#GospelMusic #BTC #crytocurrency #Coinbase #cashapp #ETH #Crypto…
🇺🇸 USD: $40,826
🇪🇺 EUR: €36,689 
🇬🇧 GBP: £30,976
🇨🇦 CAD: $51,605
🇦🇺 AUD: $55,365
🇯🇵 JP