Mount Google Drive

In [1]:
# Mount Google Drive at /content/drive; the dataset cells below read their CSV
# files from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Initialize Spark

In [2]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark
!wget -q https://student.cs.uwaterloo.ca/~cs451/content/cs431/sql-data.tgz
!tar -xzf sql-data.tgz

In [3]:
import os
import random

# Point the environment at the JDK and at the Spark distribution unpacked
# under /content before findspark is initialised.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.8-bin-hadoop2.7",
})

import findspark
findspark.init()

# Imported only after findspark.init() has run.
from pyspark.sql import SparkSession

# Local two-thread session; the UI port is drawn at random from [4000, 5000).
ui_port = random.randrange(4000, 5000)
spark = (
    SparkSession.builder
    .appName("YourTest")
    .master("local[2]")
    .config('spark.ui.port', ui_port)
    .getOrCreate()
)

Initialize Tweepy

In [4]:
# Install Libraries
!pip install textblob
!pip install tweepy
!pip install pycountry
!pip install langdetect
!pip install twython



In [5]:
from textblob import TextBlob
import sys
import tweepy
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import nltk
import pycountry
import re
import string
import json
import socket
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.stem import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
# Build the OAuth handler and the tweepy API client used by the search cells below.
# NOTE(review): consumerKey, consumerSecret, accessToken and accessTokenSecret are
# not defined in any visible cell (execution counts jump from In [5] to In [7]) —
# presumably they came from a removed credentials cell; confirm they are supplied
# (ideally from environment variables, not literals) before running.
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)

Import Datasets

In [24]:
# Load the BTC/USD CSV from Drive, discard every column except the ones left
# after the drop below, and remove rows that contain any null.
btc_prices_raw = (
    spark.read
    .csv("/content/drive/MyDrive/BTCUSD_daily.csv", sep=',', header=True, inferSchema=True)
    .drop("unix", "symbol", "high", "low", "Volume BTC", "Volume USD")
    .dropna(how="any")
)

In [25]:
# Inspect the loaded price frame: inferred schema, then the first 10 rows.
btc_prices_raw.printSchema()
btc_prices_raw.show(10)

root
 |-- date: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- close: double (nullable = true)

+-------------------+--------+--------+
|               date|    open|   close|
+-------------------+--------+--------+
|2022-03-31 00:00:00|47086.07|47173.36|
|2022-03-30 00:00:00|47459.03|47068.08|
|2022-03-29 00:00:00|47152.38|47459.03|
|2022-03-28 00:00:00|46854.96|47152.38|
|2022-03-27 00:00:00|44553.24|46864.39|
|2022-03-26 00:00:00|44340.49|44535.65|
|2022-03-25 00:00:00|44025.99| 44320.6|
|2022-03-24 00:00:00|42912.21|44025.99|
|2022-03-23 00:00:00|42393.62|42925.41|
|2022-03-22 00:00:00|41018.36|42393.41|
+-------------------+--------+--------+
only showing top 10 rows



In [8]:
# Load the semicolon-separated tweets CSV from Drive (fields may span several
# lines, hence multiLine), drop the unused columns and remove rows with any null.
tweets_raw = (
    spark.read
    .csv("/content/drive/MyDrive/tweets.csv", sep=';', header=True, inferSchema=True, multiLine=True)
    .drop("id", "user", "fullname", "url", "replies", "retweets")
    .dropna(how="any")
)

In [9]:
# Inspect the loaded tweets frame: inferred schema, then the first 10 rows.
tweets_raw.printSchema()
tweets_raw.show(10)

root
 |-- timestamp: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- text: string (nullable = true)

+--------------------+-----+-------------------------------------+
|           timestamp|likes|                                 text|
+--------------------+-----+-------------------------------------+
|2019-05-27 11:49:...|    0|                 È appena uscito u...|
|2019-05-27 11:49:...|    0|                 Cardano: Digitize...|
|2019-05-27 11:49:...|    2|                 Another Test twee...|
|2019-05-27 11:49:...|    0|                 Current Crypto Pr...|
|2019-05-27 11:49:...|    0|                 Spiv (Nosar Baz):...|
|2019-05-27 11:49:...|    0|                 #btc inceldiği ye...|
|2019-05-27 11:49:...|    0|                 @nwoodfine We hav...|
|2019-05-27 11:49:...|    0|                 @pedronauck como ...|
|2019-05-27 11:49:...|    0|ブラジルはまぁ置いといてもドイツは...|
|2019-05-27 11:49:...|    0|                 CHANGE IS COMING....|
+--------------------+-----+-------------------------------------+
only showing top 10 rows

Convert BTC prices to percent changes

In [30]:
def compute_percent_change(row):
  """Convert one price row into a (date, percent change) pair of strings.

  Args:
    row: indexable record where row[0] provides .date(), row[1] is the opening
      price and row[2] is the closing price.

  Returns:
    A tuple of the date formatted as YYYY-MM-DD and the open-to-close change in
    percent, formatted with two decimals.
  """
  day = row[0].date().strftime("%Y-%m-%d")
  open_price, close_price = row[1], row[2]
  delta_pct = (close_price - open_price) * 100 / open_price
  return (day, "%.2f" % delta_pct)

# Map every price row to a (date, percent_change) pair and name the columns.
btc_daily_changes = (
    btc_prices_raw.rdd
    .map(compute_percent_change)
    .toDF(["date", "percent_change"])
)

In [31]:
# Register the frame as the temp view "btc_daily_changes", then show its schema
# and first 10 rows.
btc_daily_changes.createOrReplaceTempView("btc_daily_changes")
btc_daily_changes.printSchema()
btc_daily_changes.show(10)

root
 |-- date: string (nullable = true)
 |-- percent_change: string (nullable = true)

+----------+--------------+
|      date|percent_change|
+----------+--------------+
|2022-03-31|          0.19|
|2022-03-30|         -0.82|
|2022-03-29|          0.65|
|2022-03-28|          0.63|
|2022-03-27|          5.19|
|2022-03-26|          0.44|
|2022-03-25|          0.67|
|2022-03-24|          2.60|
|2022-03-23|          1.25|
|2022-03-22|          3.35|
+----------+--------------+
only showing top 10 rows



Convert tweets to scores

In [21]:
# Tweets with fewer likes than this are ignored by compute_score.
min_likes = 100

# VADER analyzer shared by compute_score; created on first use (see below).
_sentiment_analyzer = None


def _get_sentiment_analyzer():
  """Return the shared SentimentIntensityAnalyzer, building it on first use."""
  global _sentiment_analyzer
  if _sentiment_analyzer is None:
    _sentiment_analyzer = SentimentIntensityAnalyzer()
  return _sentiment_analyzer


def compute_score(row):
  """Score one tweet row for use with flatMap.

  Args:
    row: indexable record where row[0] is the timestamp text, row[1] the like
      count (string or number) and row[2] the tweet text.

  Returns:
    An empty list when the row is skipped (like count not an integer, fewer
    than min_likes likes, or a compound score of exactly 0.0); otherwise a
    one-element list holding (date as YYYY-MM-DD, likes as int, compound score
    formatted with five decimals).

  Raises:
    ValueError: if the timestamp of a row that passed the filters does not
      match the expected format.
  """
  try:
    likes = int(row[1])
  except (TypeError, ValueError):
    # Like count missing or not numeric: the row is unusable, so drop it.
    # Only conversion failures are caught, so unrelated errors still surface.
    return []

  if likes < min_likes:
    return []

  # Reuse one analyzer instead of constructing a new one for every row.
  score = _get_sentiment_analyzer().polarity_scores(row[2])["compound"]
  if score == 0.0:
    return []

  # "00" is appended before parsing with %z — presumably the source timestamps
  # end in a two-digit UTC offset such as "+00"; verify against the CSV.
  date = datetime.strptime("{}00".format(row[0]), "%Y-%m-%d %H:%M:%S%z") \
              .date().strftime("%Y-%m-%d")

  return [(date, likes, "%.5f" % score)]

# Score every tweet row (rows compute_score rejects contribute nothing), name
# the columns and cache the result for the cells that follow.
tweet_daily_scores = (
    tweets_raw.rdd
    .flatMap(compute_score)
    .toDF(["date", "likes", "score"])
    .cache()
)

In [22]:
# Register the frame as the temp view "tweet_daily_scores", then show its
# schema and first 10 rows.
tweet_daily_scores.createOrReplaceTempView("tweet_daily_scores")
tweet_daily_scores.printSchema()
tweet_daily_scores.show(10)

root
 |-- date: string (nullable = true)
 |-- likes: long (nullable = true)
 |-- score: string (nullable = true)

+----------+-----+--------+
|      date|likes|   score|
+----------+-----+--------+
|2019-05-27|  141|-0.69560|
|2019-05-26|  715|-0.82180|
|2019-05-27| 1476|-0.27320|
|2019-05-27|  382| 0.51060|
|2019-05-27|  300| 0.22630|
|2019-05-27|  632| 0.02580|
|2019-05-27|  108| 0.42150|
|2019-05-25|  376| 0.79640|
|2019-05-25|  203| 0.94450|
|2019-05-03| 1870| 0.80380|
+----------+-----+--------+
only showing top 10 rows



Combine BTC daily changes with tweet scores

In [42]:
# Average the tweet scores per date, join them onto the daily BTC changes by
# date, give both measures descriptive column names and cache the result.
daily_sentiment = tweet_daily_scores.groupBy("date").agg({"score": "mean"})
results = (
    btc_daily_changes
    .join(daily_sentiment, on="date")
    .withColumnRenamed("avg(score)", "average_sentiment")
    .withColumnRenamed("percent_change", "btc_percent_change")
    .cache()
)
results.show(10)

+----------+------------------+-------------------+
|      date|btc_percent_change|  average_sentiment|
+----------+------------------+-------------------+
|2019-11-23|              0.61|            0.30866|
|2019-11-22|             -4.29| 0.1684896174863388|
|2019-11-21|             -5.86|0.25843722222222226|
|2019-11-20|             -0.44|0.35864370860927136|
|2019-11-19|             -0.68|0.27039924242424257|
|2019-11-18|             -3.87| 0.2539301369863015|
|2019-11-17|              0.16|0.29695877192982445|
|2019-11-16|              0.30| 0.2516770642201835|
|2019-11-15|             -1.95| 0.2905542857142857|
|2019-11-14|             -1.47| 0.3743210526315788|
+----------+------------------+-------------------+
only showing top 10 rows



In [49]:
# Count the days on which the sign of the BTC move agrees with the sign of the
# average tweet sentiment.
results_count = results.count()
print("Number of results:", results_count)

# btc_percent_change holds formatted strings, so it is cast to DOUBLE explicitly
# instead of relying on implicit string-to-number coercion when compared with 0.
# The variable keeps its original name, but it counts agreement in either
# direction (both negative or both non-negative), i.e. correct predictions.
true_positive_count = results.filter(
    "(CAST(btc_percent_change AS DOUBLE) < 0 and average_sentiment < 0) "
    "or (CAST(btc_percent_change AS DOUBLE) >= 0 and average_sentiment >= 0)"
).count()
print("Number of correct predictions:", true_positive_count)
# The remainder are days where the signs disagree — mismatches, not true negatives.
print("Number of incorrect predictions:", results_count - true_positive_count)

if results_count:
  print("Accuracy: {}%".format("%.2f" % (true_positive_count * 100 / results_count)))
else:
  # An empty join would otherwise divide by zero.
  print("Accuracy: n/a (no results)")

Number of results: 995
Number of true positives: 643
Number of true negatives: 352
Accuracy: 64.62%


In [None]:
from tweepy.streaming import StreamListener

class TweetsListener(StreamListener):
    """Stream listener that forwards the text of each received tweet to a socket.

    Args:
        csocket: connected socket; each tweet's text is written to it as UTF-8 bytes.
    """

    def __init__(self, csocket):
        # Run the base initializer so whatever state StreamListener sets up exists.
        super().__init__()
        self.client_socket = csocket

    # we override the on_data() function in StreamListener
    def on_data(self, data):
        """Decode one raw stream message and relay its 'text' field to the socket.

        Failures (invalid JSON, a message without 'text', socket errors) are
        printed and the stream keeps running.

        Returns:
            True in every case, so the stream is never stopped from here.
        """
        try:
            message = json.loads( data )
            payload = message['text'].encode('utf-8')
            print( payload )
            # sendall keeps writing until the whole payload is out; send may
            # transmit only part of it.
            self.client_socket.sendall( payload )
        except Exception as e:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit can still stop the notebook cell.
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream and keep it running."""
        print(status)
        return True

    # The handler was originally named if_error; the old name is kept as an
    # alias so existing references keep working.
    if_error = on_error

In [None]:
def send_tweets(c_socket):
    """Stream tweets matching 'bitcoin' and relay them to c_socket.

    Authenticates with the notebook-level credential variables, wraps c_socket
    in a TweetsListener and starts a filtered stream on the 'bitcoin' track.

    Args:
        c_socket: socket handed to TweetsListener as the destination.
    """
    oauth = tweepy.auth.OAuthHandler(consumerKey, consumerSecret)
    oauth.set_access_token(accessToken, accessTokenSecret)

    listener = TweetsListener(c_socket)
    tweepy.Stream(oauth, listener).filter(track=['bitcoin'])

Dump tweets into .csv

In [None]:
# import nltk
# nltk.download('vader_lexicon')

# keyword = "bitcoin"
# tweets = tweepy.Cursor(api.search, q=keyword, wait_on_rate_limit=True).items()

# f = open("tweets.csv", "a")

# # Write header line
# # f.write("compound,created_at,retweet_count\n")

# for tweet in tweets:
#   if tweet.retweet_count < 100:
#     continue

#   score = SentimentIntensityAnalyzer().polarity_scores(tweet.text)
#   if score['compound'] == 0.0:
#     continue
  
#   f.write("{},{},{}\n".format(score['compound'], tweet.created_at, tweet.retweet_count))

# f.close()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


TweepError: ignored

Sentiment Analysis

In [None]:
import nltk
nltk.download('vader_lexicon')

#Sentiment Analysis
def percentage(part,whole):
  """Return part as a percentage of whole.

  Both arguments are converted with float(). A whole of zero yields 0.0 instead
  of raising ZeroDivisionError, so an empty sample reports 0%.
  """
  part_value = float(part)
  whole_value = float(whole)
  if whole_value == 0:
    return 0.0
  return 100 * part_value / whole_value

keyword = "bitcoin since:2020-04-02"
noOfTweet = 50
tweets = tweepy.Cursor(api.search, q=keyword).items(noOfTweet)
positive = 0
negative = 0
neutral = 0
polarity = 0
tweet_list = []
neutral_list = []
negative_list = []
positive_list = []

# Build the VADER analyzer once instead of once per tweet.
analyzer = SentimentIntensityAnalyzer()

for tweet in tweets:
  tweet_list.append(tweet.text)
  analysis = TextBlob(tweet.text)
  score = analyzer.polarity_scores(tweet.text)
  print(tweet.favorite_count) # favorite_count does not work
  print(tweet.retweet_count)
  print(score)
  neg = score['neg']
  neu = score['neu']
  pos = score['pos']
  comp = score['compound']
  polarity += analysis.sentiment.polarity

  # Bucket the tweet by whichever of VADER's neg/pos shares is larger.
  if neg > pos:
    negative_list.append(tweet.text)
    negative += 1
  elif pos > neg:
    positive_list.append(tweet.text)
    positive += 1
  else:
    neutral_list.append(tweet.text)
    neutral += 1

# Percentages are taken over the tweets actually processed: the search may
# return fewer than noOfTweet items, which would otherwise understate every
# share. With no tweets at all, every share is reported as 0.
tweet_count = len(tweet_list)
positive = percentage(positive, tweet_count) if tweet_count else 0.0
negative = percentage(negative, tweet_count) if tweet_count else 0.0
neutral = percentage(neutral, tweet_count) if tweet_count else 0.0
polarity = percentage(polarity, tweet_count) if tweet_count else 0.0
positive = format(positive, '.1f')
negative = format(negative, '.1f')
neutral = format(neutral, '.1f')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
0
775
{'neg': 0.0, 'neu': 0.728, 'pos': 0.272, 'compound': 0.6808}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 0.776, 'pos': 0.224, 'compound': 0.5994}
0
156
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 0.352, 'pos': 0.648, 'compound': 0.9688}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
8594
{'neg': 0.0, 'neu': 0.651, 'pos': 0.349, 'compound': 0.9153}
0
389
{'neg': 0.061, 'neu': 0.647, 'pos': 0.292, 'compound': 0.7974}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 0.726, 'pos': 0.274, 'compound': 0.5267}
0
0
{'neg': 0.0, 'neu': 0.902, 'pos': 0.098, 'compound': 0.0772}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0
0
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
0

In [None]:
import nltk
nltk.download('vader_lexicon')

#Sentiment Analysis
def percentage(part,whole):
  """Return part as a percentage of whole.

  Both arguments are converted with float(). A whole of zero yields 0.0 instead
  of raising ZeroDivisionError, so an empty sample reports 0%.
  """
  part_value = float(part)
  whole_value = float(whole)
  if whole_value == 0:
    return 0.0
  return 100 * part_value / whole_value

keyword = "bitcoin"
noOfTweet = 50
# tweets = [" ".join(['today', 'isnt', 'lotst', 'worst'])]
tweets = ["today is the worst day"]
positive = 0
negative = 0
neutral = 0
polarity = 0
tweet_list = []
neutral_list = []
negative_list = []
positive_list = []

# Build the VADER analyzer once instead of once per text.
analyzer = SentimentIntensityAnalyzer()

for tweet in tweets:
  print(tweet)
  tweet_list.append(tweet)
  analysis = TextBlob(tweet)
  score = analyzer.polarity_scores(tweet)
  print(score)
  neg = score['neg']
  neu = score['neu']
  pos = score['pos']
  comp = score['compound']
  polarity += analysis.sentiment.polarity

  # Bucket the text by whichever of VADER's neg/pos shares is larger.
  if neg > pos:
    negative_list.append(tweet)
    negative += 1
  elif pos > neg:
    positive_list.append(tweet)
    positive += 1
  else:
    neutral_list.append(tweet)
    neutral += 1

# Percentages are taken over the texts actually processed. This cell scores a
# hand-written sample, so dividing by noOfTweet (50) would report a single
# negative text as 2% instead of 100%.
tweet_count = len(tweet_list)
positive = percentage(positive, tweet_count) if tweet_count else 0.0
negative = percentage(negative, tweet_count) if tweet_count else 0.0
neutral = percentage(neutral, tweet_count) if tweet_count else 0.0
polarity = percentage(polarity, tweet_count) if tweet_count else 0.0
positive = format(positive, '.1f')
negative = format(negative, '.1f')
neutral = format(neutral, '.1f')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
today is the worst day
{'neg': 0.506, 'neu': 0.494, 'pos': 0.0, 'compound': -0.6249}


Spark Version

In [None]:
import nltk
nltk.download('vader_lexicon')

#Sentiment Analysis
def percentage(part,whole):
  """Return part as a percentage of whole.

  Both arguments are converted with float(). A whole of zero yields 0.0 instead
  of raising ZeroDivisionError, so an empty sample reports 0%.
  """
  part_value = float(part)
  whole_value = float(whole)
  if whole_value == 0:
    return 0.0
  return 100 * part_value / whole_value

keyword = "bitcoin since:2020-04-02"
noOfTweet = 50
tweets = tweepy.Cursor(api.search, q=keyword).items(noOfTweet)
positive = 0
negative = 0
neutral = 0
polarity = 0
tweet_list = []
neutral_list = []
negative_list = []
positive_list = []

# Build the VADER analyzer once instead of once per tweet.
analyzer = SentimentIntensityAnalyzer()

for tweet in tweets:
  tweet_list.append(tweet.text)
  analysis = TextBlob(tweet.text)
  score = analyzer.polarity_scores(tweet.text)
  print(tweet.created_at)
  print(score)
  neg = score['neg']
  neu = score['neu']
  pos = score['pos']
  comp = score['compound']
  polarity += analysis.sentiment.polarity

  # Bucket the tweet by whichever of VADER's neg/pos shares is larger.
  if neg > pos:
    negative_list.append(tweet.text)
    negative += 1
  elif pos > neg:
    positive_list.append(tweet.text)
    positive += 1
  else:
    neutral_list.append(tweet.text)
    neutral += 1

# Percentages are taken over the tweets actually processed: the search may
# return fewer than noOfTweet items, which would otherwise understate every
# share. With no tweets at all, every share is reported as 0.
tweet_count = len(tweet_list)
positive = percentage(positive, tweet_count) if tweet_count else 0.0
negative = percentage(negative, tweet_count) if tweet_count else 0.0
neutral = percentage(neutral, tweet_count) if tweet_count else 0.0
polarity = percentage(polarity, tweet_count) if tweet_count else 0.0
positive = format(positive, '.1f')
negative = format(negative, '.1f')
neutral = format(neutral, '.1f')

In [None]:
#Number of Tweets (Total, Positive, Negative, Neutral)
tweet_list = pd.DataFrame(tweet_list)
neutral_list = pd.DataFrame(neutral_list)
negative_list = pd.DataFrame(negative_list)
positive_list = pd.DataFrame(positive_list)
print("total number: ",len(tweet_list))
print("positive number: ",len(positive_list))
print("negative number: ", len(negative_list))
print("neutral number: ",len(neutral_list))

total number:  50
positive number:  15
negative number:  10
neutral number:  25


In [None]:
# Display tweet_list as this cell's output.
tweet_list

Unnamed: 0,0
0,RT @Jayecane: I badly need to send 8 people mo...
1,RT @Jayecane: I badly need to send 8 people mo...
2,@MichellePhan @CashApp $RBaiZa #Bitcoin #bitco...
3,RT @Jayecane: I badly need to send 8 people mo...
4,RT @Coinimparator: #Coinimparator ve ailesi yi...
5,RT @Jayecane: I badly need to send 8 people mo...
6,"Bitcoin and Gold have gone up a bit, #xrp has ..."
7,RT @loopstarter: Take the power of the collect...
8,The strongest bullish signal has broken out fo...
9,Now that I agree with #Bitcoin you can’t chang...


In [None]:
# NOTE(review): search is called here without a query, and the recorded output
# of this cell is a TweepError — looks like a leftover experiment; confirm
# whether the cell is still needed.
api.search()

TweepError: ignored

In [None]:
# Print the text of every item yielded by `tweets`.
for tweet in tweets:
 print(tweet.text)

RT @PolandYielder: TUSD is coming to Yield App🚀🚀🚀💎💎💎
Earn up to 14% p.a. 💪💪💪🪙🪙🪙📈📈📈
https://t.co/F1OBcHuym3
@YieldApp @YieldAppPoland #yield…
RT @Stray_cats_BSV: #StrayCats =
@Biarritz82 +
@LCarrion80 aka @BlogueroDigital;
quienes además
de compartir vida, compartimos
pasiones com…
RT @ParaBorsaCrypto: Endeksler ve #Bitcoin 'de hareketlilik başladı.
RT @AirDropTR_EN: 🔥HEDIYE  ZAMANI 🌟

3 KİŞİYE 250'₺ TOPLAM 750'₺

 RT yap

Takip et

Süre Cumartesi 21.00 ⏳kadar

Bol şans dostlar.

#BTC …
RT @gladstein: “Decentralizing both finance and the internet would offer a long-overdue counterweight to the very concentrated power and we…
RT @LeCoinBit: Instead of predicting, try ENGAGING.

The world is depressing when you’re not part of it. Try person… https://t.co/Ynv4zxHwf6
RT @zam_zach711: ALL PRICES ARE BACK TO $50.00 STARTING TODAY (MARCH 10-)!
#GospelMusic #BTC #crytocurrency #Coinbase #cashapp #ETH #Crypto…
🇺🇸 USD: $40,826
🇪🇺 EUR: €36,689 
🇬🇧 GBP: £30,976
🇨🇦 CAD: $51,605
🇦🇺 AUD: $55,365
🇯🇵 JP