In [1]:
import wget
import tweepy
from tweepy import TweepError
import json
from datetime import timedelta, datetime, timezone, date
import pandas as pd
import numpy as np
import sklearn
import nltk
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import string
import re

  from numpy.core.umath_tests import inner1d


### Keys

In [2]:
# Read the Twitter API credentials from a JSON file in the working directory.
key_file = 'keys.json'
with open(key_file) as key_handle:
    keys = json.load(key_handle)

### Setting up tweepy

In [3]:
# Authenticate against the Twitter API with the credentials loaded into `keys`.
auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
auth.set_access_token(keys["access_token"], keys["access_token_secret"])
# wait_on_rate_limit=True: presumably makes tweepy pause rather than fail when
# the rate limit is hit — confirm against the installed tweepy version.
api = tweepy.API(auth, wait_on_rate_limit=True)

### Let's stick with tweets in the past day

In [4]:
# Midnight (naive, local clock) at the start of the current day.
today = datetime.combine(date.today(), datetime.min.time())
# NOTE: despite its name, `week_ago` is exactly one day before `today`.
week_ago = today - timedelta(days=1)

In [5]:
# Start date for the tweet search as a 'YYYY-MM-DD' string (the date part of
# `week_ago`). Formatting the date directly replaces the earlier
# "format a full timestamp, then keep the first 10 characters" approach.
start = week_ago.strftime('%Y-%m-%d')

Collect tweets with the hashtag "bitcoin" posted in the last day (the loop below stops after about 1,500 tweets).

### lower the tweet cap in the loop below (the `i > 1500` check) if you have trouble with tweepy.Cursor

In [6]:
# Progress message announcing the start of the tweet-scraping stage.
print("scraping tweeter data with hashtag bitcoin in the last day with tweepy")

scraping tweeter data with hashtag bitcoin in the last day with tweepy


In [7]:
# Parallel lists: position k in each list describes the k-th collected tweet.
timestamp, user, text, retweet_count = [], [], [], []
i = 0  # running count of collected tweets (stays 0 if the search yields nothing)
tweet_cursor = tweepy.Cursor(api.search, q = "#bitcoin", lang="en", since = start)
for i, tweet in enumerate(tweet_cursor.items(), start=1):
    timestamp.append(tweet.created_at)
    retweet_count.append(tweet.retweet_count)
    text.append(tweet.text)
    user.append(tweet.user.screen_name)
    # Stop once the count exceeds 1500.
    if i > 1500:
        break

In [8]:
# Take the last tweet appended by the scraping loop, label its timestamp as
# UTC, and convert it to whole seconds since the Unix epoch.
last_collected = timestamp[-1].replace(tzinfo=timezone.utc)
start2 = int(round(last_collected.timestamp()))

In [9]:
# Bitcoincharts trades CSV for the bitstampUSD symbol, with `start2` (epoch
# seconds) passed as the `start` query parameter.
rawlink = "http://api.bitcoincharts.com/v1/trades.csv?symbol=bitstampUSD"
link = "".join([rawlink, "&start=", str(int(round(start2)))])

### download price data from Bitcoincharts

In [10]:
# Progress message announcing the price-data download and cleaning stage.
print("downloading and cleaning price data")

downloading and cleaning price data


In [11]:
# Download the trades CSV from `link`. `filename` is whatever wget.download
# returns — presumably the local path of the saved file, since it is handed to
# pd.read_csv in the next cell.
filename = wget.download(link)

-1 / unknown

In [12]:
# Load the downloaded CSV; header=None means the first line is treated as data,
# not as column names.
btcprice = pd.read_csv(filename, header = None)

In [13]:
# Name the three columns positionally. Presumably one row per trade: Unix time
# in seconds, trade price, traded amount — confirm against the Bitcoincharts API.
btcprice.columns = ['unixtime', 'price', 'amount']

In [14]:
# Render each Unix timestamp (seconds since the epoch) as a UTC
# 'YYYY-MM-DD HH:MM:SS' string. Done in one vectorised pandas call instead of
# calling datetime.utcfromtimestamp (deprecated since Python 3.12) once per row.
converted_time = pd.to_datetime(btcprice['unixtime'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

In [15]:
# Assemble the scraped tweet fields into one DataFrame (one row per tweet).
d = {'timestamp': timestamp, 'user': user, 'text' : text, 'retweet' : retweet_count}
df = pd.DataFrame(data = d)
# Keep a copy on disk, written relative to the current working directory.
df.to_csv("most_recent_tweet.csv")

In [16]:
# Attach the formatted timestamps to the full price table, then keep every
# 50th row and renumber the kept rows 0..n-1.
btcprice['timestamp'] = converted_time
btcprice2 = btcprice.iloc[::50, :].reset_index(drop=True)

In [17]:
# Reverse the row order of the tweet table and renumber the rows from 0.
df2 = df.iloc[::-1].reset_index(drop=True)

In [18]:
# Parse the 'YYYY-MM-DD HH:MM:SS' strings back into datetimes in one call.
btcprice2['timestamp'] = pd.to_datetime(btcprice2['timestamp'], format='%Y-%m-%d %H:%M:%S')

In [19]:
def cal_direction(array):
    """Flag, element by element, whether a sequence fell relative to its predecessor.

    Parameters
    ----------
    array : sequence of numbers
        Values in order (e.g. successive prices).

    Returns
    -------
    numpy.ndarray
        Float array of the same length as `array`. Entry k is 0.0 when
        array[k] < array[k - 1] and 1.0 otherwise; the first entry is
        always 1.0 because it has no predecessor.
    """
    direction = np.ones(len(array))
    # np.diff yields array[k + 1] - array[k]; a negative step marks a drop at k + 1.
    went_down = np.diff(array) < 0
    direction[1:][went_down] = 0
    return direction

In [20]:
# Add a 'direction' column computed by cal_direction from the sampled prices:
# 0.0 where the price is lower than in the previous sampled row, 1.0 otherwise.
btcprice2 = btcprice2.assign(direction = cal_direction(btcprice2['price'].values))

In [21]:
# Progress message announcing the tweet/price merge stage.
print("merging tweet data with price data by timestamp")

merging tweet data with price data by timestamp


In [22]:
# Label each tweet with the direction of the first sampled price row (in table
# order) whose timestamp is strictly later than the tweet's. Tweets with no
# later price row keep the initial value 0.
price_times = btcprice2['timestamp']
price_directions = btcprice2['direction'].values
direction_tweet = np.zeros(len(df2))
for x, tweet_time in enumerate(df2['timestamp']):
    # One vectorised comparison per tweet replaces the row-by-row .loc scan
    # over the price table; the match chosen is the same.
    is_later = (price_times > tweet_time).values
    if is_later.any():
        # argmax on a boolean array returns the position of the first True.
        direction_tweet[x] = price_directions[is_later.argmax()]

### natural language processing, clean and stem the tweet text, and use tfidf vectorizer to vectorize the text data

In [23]:
# Progress message announcing the text-cleaning stage.
print("clean the tweet")

clean the tweet


In [24]:
# English stop-word list and stemmer shared by clean_text.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand — confirm in the target environment.
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
def clean_text(text):
    """Turn one tweet into a list of stemmed tokens.

    Steps: drop every character listed in string.punctuation, lower-case the
    rest, split on runs of non-word characters, discard stop words and empty
    tokens, and stem what remains.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split produces '' at the edges when the text starts or ends with a
    # separator; skip those so the empty string never becomes a feature.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

# Fit a TF-IDF model on the tweet text, using clean_text as the analyzer.
tfidf_vec = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vec.fit_transform(df2['text'])

# Assigning `.columns` on the sparse matrix never reached the DataFrame, so
# the names are passed to the DataFrame constructor instead.
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever the installed version provides.
get_names = getattr(tfidf_vec, 'get_feature_names_out', None) or tfidf_vec.get_feature_names
# The 'tfidf_' prefix keeps every label a string and prevents a token such as
# 'retweet' from colliding with the 'retweet' count column added later.
x_counts_tfidf = pd.DataFrame(
    x_tfidf.toarray(),
    columns=['tfidf_' + str(name) for name in get_names()],
)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [25]:
# Feature matrix: the retweet count placed side by side (axis=1) with the
# TF-IDF columns, aligned on the row index.
x_feature = pd.concat([df2[['retweet']], x_counts_tfidf], axis = 1)

In [26]:
# First 80% of the rows (and their labels) are used for model building; the
# remaining rows are kept aside as x_est. Positional slicing is used because
# x_feature is built from frames with a fresh 0..n-1 index.
holdout_start = int(round(0.8*len(x_feature)))
x_feature2 = x_feature.iloc[:holdout_start, :]
direction_tweet2 = direction_tweet[:int(round(0.8*len(direction_tweet)))]
x_est = x_feature.iloc[holdout_start:, :]

In [27]:
# Number of x_feature2 rows used for training: 80%, rounded to the nearest integer.
train_size = int(round(0.8*len(x_feature2)))

In [28]:
# Ordered (non-shuffled) split: the first `train_size` rows train the model,
# the rest of x_feature2 / direction_tweet2 is the test set.
x_train, x_test = x_feature2.iloc[:train_size, :], x_feature2.iloc[train_size:, :]
y_train, y_test = direction_tweet2[:train_size], direction_tweet2[train_size:]

In [29]:
#x_train, x_test, y_train, y_test = train_test_split(x_feature, direction_tweet, test_size=0.2)

### training a random forest model

In [30]:
# Progress message announcing the model-training stage.
print("training the model")

training the model


In [31]:
# Random forest of 50 trees, each at most 20 levels deep, trained on all CPU
# cores (n_jobs=-1). random_state is fixed so that re-running the notebook on
# the same data gives the same model and the same reported metrics.
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1, random_state=42)
rf_model = rf.fit(x_train, y_train)

In [32]:
y_pred = rf_model.predict(x_test)
# Report precision/recall for whichever class the model predicts most often
# (ties go to class 0). The else branch previously assigned to a misspelled
# name, leaving `label` undefined or stale whenever class 1 was the majority.
if sum(y_pred == 0) >= sum(y_pred == 1):
    label = 0
else:
    label = 1
precision, recall, fscore, support = score(y_test, y_pred, pos_label= label, average='binary')
# Accuracy is the share of test rows whose prediction equals the true label.
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.383 / Recall: 0.872 / Accuracy: 0.45


In [33]:
# Predict a direction for every held-out tweet and report the majority class
# together with its share of the predictions (ties count as downward).
y_est = rf_model.predict(x_est)
p1 = sum(y_est == 1)
p0 = sum(y_est == 0)
if p1 > p0:
    trend_message = "the random forest model detects an upward trend based on conversations on tweet with a probability of " + str(p1/len(y_est))
else:
    trend_message = "the random forest model detects an downward trend based on conversations on tweet with a probability of " + str(p0/len(y_est))
print(trend_message)

the random forest model detects an downward trend based on conversations on tweet with a probability of 0.8433333333333334
