In [1]:
import numpy as np
import pandas as pd
import os
import pickle

In [2]:
cache_file = '../cache/posts/twitter/data.pickle'
dataset_path = '../datasets/original/tweets.csv'

data_twitter = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data_twitter = pickle.load(f)
else:
    data_twitter = pd.read_csv(
        filepath_or_buffer=dataset_path,
        usecols=['timestamp', 'likes', 'retweets', 'text\r'],
        sep=';',
        on_bad_lines='skip',
        lineterminator='\n',
        low_memory=False,
        nrows=1_000_000,
    ).rename(columns={
        'timestamp': 'date',
        'text\r': 'text',
    })

    data_twitter['date'] = pd.to_datetime(data_twitter['date']).dt.date

    # data_twitter = data_twitter[data_twitter['likes'] > 0]

    data_twitter['interaction'] = data_twitter['likes'] + data_twitter['retweets']

    data_twitter = data_twitter.drop(
        ['likes', 'retweets'],
        axis=1,
    ).reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data_twitter, f)

data_twitter

Unnamed: 0,date,text,interaction
0,2019-05-27,È appena uscito un nuovo video! LES CRYPTOMONN...,0
1,2019-05-27,Cardano: Digitize Currencies; EOS https://t.co...,0
2,2019-05-27,Another Test tweet that wasn't caught in the s...,3
3,2019-05-27,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,0
4,2019-05-27,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,0
...,...,...,...
999995,2019-05-21,Dutch Man Arrested Over $2.2 Million Bitcoin M...,0
999996,2019-05-21,This is how crypto can do some of it's best wo...,56
999997,2019-05-21,クレイグ・ライト氏、\nBitcoinホワイトペーパー\n著作権を登録 ✍️\n💸BSV高騰...,0
999998,2019-05-21,Bitcoin doesn't have inherent value. \nAltcoin...,0


In [4]:
cache_file = '../cache/posts/reddit/data.pickle'
dataset_path = '../datasets/original/bitcoin_reddit_all.csv'

data_reddit = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data_reddit = pickle.load(f)
else:
    data_reddit = pd.read_csv(
        filepath_or_buffer=dataset_path,
        usecols=['date', 'score', 'body'],
        sep=',',
        on_bad_lines='skip',
        lineterminator='\n',
        low_memory=False,
        nrows=1_000_000,
    ).rename(columns={
        'score': 'interaction',
        'body': 'text',
    })

    data_reddit['date'] = pd.to_datetime(data_reddit['date']).dt.date

    data_reddit = data_reddit[data_reddit['interaction'] > 0]

    data_reddit['interaction'] = data_reddit['interaction'].astype(int)

    data_reddit = data_reddit.reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data_reddit, f)

data_reddit

Unnamed: 0,date,interaction,text
0,2014-06-26,162,How do you feel about Bitcoin? I don't really ...
1,2014-05-17,75,[The guy who blew a huge portion of his and hi...
2,2014-04-30,13,This was the bitcoin hat guy that ACTUALLY del...
3,2014-07-07,19,I'm sure this is *good* for bitcoin
4,2014-12-26,54,"I thought it was a pun on bitcoin, but I suppo..."
...,...,...,...
890263,2015-02-14,13,"For most users, the values they had about bitc..."
890264,2015-02-02,31,"Hey guys, Joe from Bonafide here. If you have ..."
890265,2015-02-23,13,Blockchain.Info is a terrible app and a great ...
890266,2015-02-13,14,"Ok, I've been lost on this paycoin thing, now ..."


In [6]:
data = pd.concat([data_twitter, data_reddit], axis=0)

data = data.sort_values('date').reset_index(drop=True)

data

Unnamed: 0,date,text,interaction
0,2009-01-11,Running bitcoin\r,20012
1,2009-05-08,"Interesting, it uses IRC as a high level proto...",1
2,2009-05-08,"No, that's not how bitcoin works, check out th...",2
3,2009-05-08,No - the richest person will be the one with t...,2
4,2009-09-24,Some reddit thought on Bitcoin [here](http://w...,1
...,...,...,...
1890263,2019-05-27,@_choicedelhi Join me on https://t.co/kdO3qkiV...,0
1890264,2019-05-27,"@duganist @KasteelCrypto @BambouClub Yes, but ...",0
1890265,2019-05-27,"Cointelegraph: Bitcoin Collector software, whi...",0
1890266,2019-05-27,Bitcoin、Litecoin、Monero、DASHなどを完全匿名で現金で個人間で簡単に...,0


In [7]:
data = data.dropna().reset_index(drop=True)

data.isna().sum()

date           0
text           0
interaction    0
dtype: int64

In [8]:
from datetime import timedelta

current_day = data.date.min()
min_index = 0
count_tweets = 0

n = 100

for t in data.itertuples():
    index = t.Index
    date = t.date

    if current_day == date:
        count_tweets += 1
        continue

    if count_tweets < n:
        min_index = index + 1

    current_day += timedelta(days=1)
    count_tweets = 1

data = data.loc[min_index:]

data.reset_index(drop=True, inplace=True)

data

Unnamed: 0,date,text,interaction
0,2019-05-26,There is a popular belief that Satoshi Nakamot...,40
1,2019-05-26,Generally speaking;\n\n#Crypto &amp; #Bitcoin ...,223
2,2019-05-26,Me saving money and getting free cryptos using...,68
3,2019-05-26,"$218,000 Bitcoin during the next bull run. Jan...",44
4,2019-05-26,Trading remains light on FreiExchange. Bitcoin...,0
...,...,...,...
7793,2019-05-27,@_choicedelhi Join me on https://t.co/kdO3qkiV...,0
7794,2019-05-27,"@duganist @KasteelCrypto @BambouClub Yes, but ...",0
7795,2019-05-27,"Cointelegraph: Bitcoin Collector software, whi...",0
7796,2019-05-27,Bitcoin、Litecoin、Monero、DASHなどを完全匿名で現金で個人間で簡単に...,0


In [10]:
import spacy
import re

cache_file = '../cache/posts/english/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    nlp = spacy.load('en_core_web_sm')

    data['text'] = data['text'].apply(lambda t:
        re.sub(
            pattern=r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
            repl='',
            string=t,
            flags=re.MULTILINE,
        )
    )
    data['text'] = data['text'].apply(lambda t:
        re.sub(
            pattern=r'\$\w+[,]|\@\w+|[,]\@\w+',
            repl='',
            string=t
        )
    )
    data['text'] = data['text'].apply(lambda t:
        t.replace(
            '#', ''
        ).replace(
            '\n', ''
        ).replace(
            '\r', ''
        ).replace(
            '\b', ''
        ).replace(
            '\t', ''
        ).replace(
            '\\', ' '
        ).replace(
            '/', ' '
        ).replace(
            '|', ' '
        )
    )
    data['text'] = data['text'].apply(lambda t:
        re.sub(
            pattern=r'  +',
            repl=' ',
            string=t
        )
    )


    def lang_detect(doc):
        return doc.lang_


    data['lang'] = list(nlp.pipe(data['text'], batch_size=100, n_process=8))
    data['lang'] = data['lang'].apply(lang_detect)

    data = data[data['lang'] == 'en']
    # data = data[data['interaction'] > 0]
    df_grouped = data.groupby('date')

    dfs_to_concat = []

    for date, group in df_grouped:
        if len(group) >= 100:
            df_top_100 = group.nlargest(100, 'interaction')
            dfs_to_concat.append(df_top_100)

    data = pd.concat(
        dfs_to_concat,
    ).sort_values(
        ['date', 'interaction'],
        ascending=[True, False],
    ).drop(
        'lang',
        axis=1,
    ).reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data

Unnamed: 0,date,text,interaction
0,2019-05-26,Bitcoin new yearly highs. 000 is the next stop.,1832
1,2019-05-26,Bitcoin Price Slays Hits 12-Month High in Sudd...,1051
2,2019-05-26,My 8yr old just told me that she wants to be a...,1049
3,2019-05-26,Fidelity Is Really In Love With Bitcoin: Texas...,893
4,2019-05-26,"No, you're really not. This is gaslighting bul...",785
...,...,...,...
195,2019-05-27,HERKES Grafikleri yakından TAKİP etsin.GELECEĞ...,120
196,2019-05-27,"Borsanın ne demek olduğu,BELENSAY Profilinde S...",120
197,2019-05-27,"MAAZERET üreteceksen,FİNANSAL ANALİZ yapma..Gİ...",120
198,2019-05-27,"Today's bitcoin spot market is “significantly""...",119


In [11]:
from datetime import datetime

def check_skipped_days():
    x = 0
    arr = np.array([])
    day_before = datetime.now()
    index_before = 0
    min_index = 0

    for t in data.itertuples():
        index = t[0]
        d = t[1]
        diff = 0

        if x == 0:
            day_before, index_before = d, index
            x+=1
            continue

        diff = (d - day_before).days

        if diff > 1:
            arr = np.append(arr, '{} => {} : {} days'.format(index_before, index, diff))
            min_index = index_before

        day_before, index_before = d, index

    return arr, min_index + 1

In [12]:
skipped_days, min_index = check_skipped_days()

print(skipped_days)
print(len(skipped_days))

[]
0


In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def analyze_sentiments():
    sia = SentimentIntensityAnalyzer()

    return data['text'].apply(lambda t: sia.polarity_scores(t)['compound'])

In [15]:
cache_file = '../cache/posts/sentiment/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data['sentiment'] = analyze_sentiments()

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data

Unnamed: 0,date,text,interaction,sentiment
0,2019-05-26,Bitcoin new yearly highs. 000 is the next stop.,1832,-0.2960
1,2019-05-26,Bitcoin Price Slays Hits 12-Month High in Sudd...,1051,0.0000
2,2019-05-26,My 8yr old just told me that she wants to be a...,1049,0.7250
3,2019-05-26,Fidelity Is Really In Love With Bitcoin: Texas...,893,0.6682
4,2019-05-26,"No, you're really not. This is gaslighting bul...",785,-0.3991
...,...,...,...,...
195,2019-05-27,HERKES Grafikleri yakından TAKİP etsin.GELECEĞ...,120,0.0000
196,2019-05-27,"Borsanın ne demek olduğu,BELENSAY Profilinde S...",120,0.0000
197,2019-05-27,"MAAZERET üreteceksen,FİNANSAL ANALİZ yapma..Gİ...",120,0.0000
198,2019-05-27,"Today's bitcoin spot market is “significantly""...",119,0.1263


In [21]:
data = data.groupby('date')['sentiment'].mean().reset_index()

data.columns = ['date', 'sentiment']

data

Unnamed: 0,date,sentiment
0,2019-05-26,0.231749
1,2019-05-27,0.147698


In [23]:
data.to_csv('../datasets/consolidated/posts.csv', index=False)