In [2]:
import numpy as np
import pandas as pd
import os
import pickle

In [4]:
cache_file = '../cache/posts/twitter/data.pickle'
dataset_path = '../datasets/original/tweets.csv'

data_twitter = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data_twitter = pickle.load(f)
else:
    data_twitter = pd.read_csv(
        filepath_or_buffer=dataset_path,
        usecols=['timestamp', 'likes', 'retweets', 'text\r'],
        sep=';',
        on_bad_lines='skip',
        lineterminator='\n',
        low_memory=False,
        # nrows=1_000_000,
    ).rename(columns={
        'timestamp': 'date',
        'text\r': 'text',
    })

    data_twitter['date'] = pd.to_datetime(data_twitter['date']).dt.date

    data_twitter = data_twitter[data_twitter['likes'] > 0]

    data_twitter['interaction'] = data_twitter['likes'] + \
        data_twitter['retweets']

    data_twitter = data_twitter.drop(
        ['likes', 'retweets'],
        axis=1,
    ).reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data_twitter, f)

data_twitter

Unnamed: 0,date,text,interaction
0,2019-05-27,Another Test tweet that wasn't caught in the s...,3
1,2019-05-27,One of the useful articles of Stefan; here is ...,16
2,2019-05-21,"BTC IS STILL GOING STRONG!!\n\nThus, we are gi...",165
3,2019-05-22,BestMixer has been seized by the Dutch Police ...,9
4,2019-05-27,Invested my Life Savings into Bitcoin and Ethe...,2
...,...,...,...
1798378,2019-11-23,We are super happy to announce that TZC is add...,121
1798379,2019-11-18,Registration is now open for the biggest globa...,1727
1798380,2019-11-23,$200 #PayPal or #Bitcoin #giveaway \n1) RT thi...,44
1798381,2019-11-23,Happy #FibonacciDay \n\nA while back I created...,19


In [5]:
cache_file = '../cache/posts/reddit/data.pickle'
dataset_path = '../datasets/original/bitcoin_reddit_all.csv'

data_reddit = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data_reddit = pickle.load(f)
else:
    data_reddit = pd.read_csv(
        filepath_or_buffer=dataset_path,
        usecols=['date', 'score', 'body'],
        sep=',',
        on_bad_lines='skip',
        lineterminator='\n',
        low_memory=False,
        # nrows=1_000_000,
    ).rename(columns={
        'score': 'interaction',
        'body': 'text',
    })

    data_reddit['date'] = pd.to_datetime(data_reddit['date']).dt.date

    data_reddit = data_reddit[data_reddit['interaction'] > 0]

    data_reddit['interaction'] = data_reddit['interaction'].astype(int)

    data_reddit = data_reddit.reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data_reddit, f)

data_reddit


Unnamed: 0,date,interaction,text
0,2014-06-26,162,How do you feel about Bitcoin? I don't really ...
1,2014-05-17,75,[The guy who blew a huge portion of his and hi...
2,2014-04-30,13,This was the bitcoin hat guy that ACTUALLY del...
3,2014-07-07,19,I'm sure this is *good* for bitcoin
4,2014-12-26,54,"I thought it was a pun on bitcoin, but I suppo..."
...,...,...,...
3760338,2019-03-10,14,I find it interesting how so many got caught u...
3760339,2019-03-22,14,Link solves some of the most persistent and pe...
3760340,2019-03-17,14,&gt;Aaron.M 11:19 AM \n&gt; \n&gt;damn \n&g...
3760341,2019-03-19,14,No because that wouldnt make sense. Roger Ver ...


In [4]:
cache_file = '../cache/posts/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data = pd.concat([data_twitter, data_reddit], axis=0)

    data = data.sort_values('date').reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data


Unnamed: 0,date,text,interaction
0,2009-01-11,Running bitcoin\r,20012
1,2009-01-21,Looking at ways to add more anonymity to bitco...,1936
2,2009-01-27,Thinking about how to reduce CO2 emissions fro...,1319
3,2009-01-29,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,44
4,2009-02-18,Just wrote: Bitcoin: new open source P2P e-cas...,106
...,...,...,...
5558721,2019-12-31,USDT is a “program” that can run on either Bit...,1
5558722,2019-12-31,^^^^AUTOMOD ***The following is a copy of the...,1
5558723,2019-12-31,"It's honestly scary, and really reminds me of ...",4
5558724,2019-12-31,"I kind of made the same mistake at first, but ...",1


In [5]:
data = data.dropna().reset_index(drop=True)

data.isna().sum()


date           0
text           0
interaction    0
dtype: int64

In [6]:
from datetime import timedelta

current_day = data.date.min()
min_index = 0
count_tweets = 0

n = 100

for t in data.itertuples():
    index = t.Index
    date = t.date

    if current_day == date:
        count_tweets += 1
        continue

    if count_tweets < n:
        min_index = index + 1

    current_day += timedelta(days=1)
    count_tweets = 1

data = data.loc[min_index:]

data.reset_index(drop=True, inplace=True)

data


Unnamed: 0,date,text,interaction
0,2013-01-21,The upcoming new version of the bitcoin-qt cli...,2
1,2013-01-21,"I agree, inflammatory remarks and fringe attra...",3
2,2013-01-21,Gotcha. It still seems vastly superior to cred...,1
3,2013-01-21,"very safe, has lots of reviews on the official...",1
4,2013-01-21,http://localbitcoins.com - you may have to exp...,10
...,...,...,...
5518158,2019-12-31,USDT is a “program” that can run on either Bit...,1
5518159,2019-12-31,^^^^AUTOMOD ***The following is a copy of the...,1
5518160,2019-12-31,"It's honestly scary, and really reminds me of ...",4
5518161,2019-12-31,"I kind of made the same mistake at first, but ...",1


In [8]:
import spacy
from spacy.language import Language

from spacy_language_detection import LanguageDetector


def get_lang_detector(nlp, name):
    return LanguageDetector(seed=10)


nlp_model = spacy.load("en_core_web_sm")

try:
    Language.factory("language_detector", func=get_lang_detector)
except ValueError:
    pass

nlp_model.add_pipe('language_detector', last=True)

print(nlp_model(data.loc[0, 'text'])._.language)


def generate_languages_csv(df):
    docs = list(nlp_model.pipe(
        df['text'].tolist(),
        batch_size=5_000,
        n_process=-1,
    ))

    languages = [doc._.language['language'] for doc in docs]

    pd.DataFrame(languages).to_csv(
        '../datasets/consolidated/languages.csv',
        index=False,
        mode='a',
        header=False
    )

{'language': 'en', 'score': 0.9999959507320948}


In [10]:
data = data.loc[0:10_000]

import re

cache_file = '../cache/posts/english/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data.loc[:,'text'] = data['text'].apply(lambda t:
        re.sub(
            pattern=r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
            repl='',
            string=t,
            flags=re.MULTILINE,
        )
    )
    data.loc[:,'text'] = data['text'].apply(lambda t:
        re.sub(
            pattern=r'\$\w+[,]|\@\w+|[,]\@\w+',
            repl='',
            string=t
        )
    )
    data.loc[:,'text'] = data['text'].apply(lambda t:
        t.replace(
            '#', ''
        ).replace(
            '\n', ''
        ).replace(
            '\r', ''
        ).replace(
            '\b', ''
        ).replace(
            '\t', ''
        ).replace(
            '\\', ' '
        ).replace(
            '/', ' '
        ).replace(
            '|', ' '
        )
    )
    data.loc[:,'text'] = data['text'].apply(lambda t:
        re.sub(
            pattern=r'  +',
            repl=' ',
            string=t
        )
    )

    steps = 10_000

    for i in range(0, len(data['date']), steps):
        generate_languages_csv(data[i:i+steps])

    data.loc[:, 'lang'] = pd.read_csv(
        filepath_or_buffer='../datasets/consolidated/languages.csv',
        lineterminator='\n',
        header=None,
        names=['lang']
    ).iloc[:, 0]

    df_grouped = data.groupby('date')

    dfs_to_concat = []

    for date, group in df_grouped:
        if len(group) >= 100:
            df_top_100 = group.nlargest(100, 'interaction')
            dfs_to_concat.append(df_top_100)

    data = pd.concat(
        dfs_to_concat,
    ).sort_values(
        ['date', 'interaction'],
        ascending=[True, False],
    ).drop(
        'lang',
        axis=1,
    ).reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

In [11]:
from datetime import datetime


def check_skipped_days():
    x = 0
    arr = np.array([])
    day_before = datetime.now()
    index_before = 0
    min_index = 0

    for t in data.itertuples():
        index = t[0]
        d = t[1]
        diff = 0

        if x == 0:
            day_before, index_before = d, index
            x += 1
            continue

        diff = (d - day_before).days

        if diff > 1:
            arr = np.append(arr, '{} => {} : {} days'.format(
                index_before, index, diff))
            min_index = index_before

        day_before, index_before = d, index

    return arr, min_index + 1

In [12]:
skipped_days, min_index = check_skipped_days()

print(skipped_days)
print(len(skipped_days))


[]
0


In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def analyze_sentiments():
    sia = SentimentIntensityAnalyzer()

    return data['text'].apply(lambda t: sia.polarity_scores(t)['compound'])

In [14]:
cache_file = '../cache/posts/sentiment/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data['sentiment'] = analyze_sentiments()

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data

Unnamed: 0,date,text,interaction,sentiment
0,2013-01-21,I don't know why he didn't accept Bitcoin from...,37,0.0813
1,2013-01-21,Still couldn't hurt to make Bitcoin an option.,19,0.4168
2,2013-01-21,r bitcoin is mostly for news related to Bitco...,19,0.8919
3,2013-01-21,"Honestly I felt this way for a while, mostly o...",13,0.9327
4,2013-01-21,"- you may have to expand your search radius, ...",10,0.1307
...,...,...,...,...
4195,2013-03-03,I'm sure there could always be the underground...,2,0.9144
4196,2013-03-03,"Ask yourself ""What are the main benefits of us...",2,0.8074
4197,2013-03-03,"Done eating, asked for the cheque, told them I...",2,0.6037
4198,2013-03-03,Great overview.Nice to see bitcoin all around.,2,0.6249


In [15]:
data = data.groupby('date')['sentiment'].mean().reset_index()

data.columns = ['date', 'sentiment']

data

Unnamed: 0,date,sentiment
0,2013-01-21,0.296672
1,2013-01-22,0.357815
2,2013-01-23,0.391366
3,2013-01-24,0.196592
4,2013-01-25,0.259927
5,2013-01-26,0.160243
6,2013-01-27,0.265702
7,2013-01-28,0.269674
8,2013-01-29,0.229464
9,2013-01-30,0.230275


In [16]:
data.to_csv('../datasets/consolidated/posts.csv', index=False)