In [1]:
import numpy as np
import pandas as pd
import os
import pickle

In [29]:
cache_file = '../cache/posts/twitter/data.pickle'
dataset_path = '../datasets/original/tweets.csv'

data_twitter = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data_twitter = pickle.load(f)
else:
    data_twitter = pd.read_csv(
        filepath_or_buffer=dataset_path,
        usecols=['timestamp', 'likes', 'retweets', 'text\r'],
        sep=';',
        on_bad_lines='skip',
        lineterminator='\n',
        low_memory=False,
    ).rename(columns={
        'timestamp': 'date',
        'text\r': 'text',
    })

    data_twitter['date'] = pd.to_datetime(data_twitter['date']).dt.date

    data_twitter = data_twitter[data_twitter['likes'] > 0]

    data_twitter['interaction'] = data_twitter['likes'] + data_twitter['retweets']

    data_twitter = data_twitter.drop(
        ['likes', 'retweets'],
        axis=1,
    ).reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data_twitter, f)

data_twitter

Unnamed: 0,date,text,interaction
0,2019-05-27,Another Test tweet that wasn't caught in the s...,3
1,2019-05-27,One of the useful articles of Stefan; here is ...,16
2,2019-05-21,"BTC IS STILL GOING STRONG!!\n\nThus, we are gi...",165
3,2019-05-22,BestMixer has been seized by the Dutch Police ...,9
4,2019-05-27,Invested my Life Savings into Bitcoin and Ethe...,2
...,...,...,...
1798378,2019-11-23,We are super happy to announce that TZC is add...,121
1798379,2019-11-18,Registration is now open for the biggest globa...,1727
1798380,2019-11-23,$200 #PayPal or #Bitcoin #giveaway \n1) RT thi...,44
1798381,2019-11-23,Happy #FibonacciDay \n\nA while back I created...,19


In [30]:
cache_file = '../cache/posts/reddit/data.pickle'
dataset_path = '../datasets/original/bitcoin_reddit_all.csv'

data_reddit = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data_reddit = pickle.load(f)
else:
    data_reddit = pd.read_csv(
        filepath_or_buffer=dataset_path,
        usecols=['date', 'score', 'body'],
        sep=',',
        on_bad_lines='skip',
        lineterminator='\n',
        low_memory=False,
    ).rename(columns={
        'score': 'interaction',
        'body': 'text',
    })

    data_reddit['date'] = pd.to_datetime(data_reddit['date']).dt.date

    data_reddit = data_reddit[data_reddit['interaction'] > 0]

    data_reddit['interaction'] = data_reddit['interaction'].astype(int)

    data_reddit = data_reddit.reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data_reddit, f)

data_reddit

Unnamed: 0,date,interaction,text
0,2014-06-26,162,How do you feel about Bitcoin? I don't really ...
1,2014-05-17,75,[The guy who blew a huge portion of his and hi...
2,2014-04-30,13,This was the bitcoin hat guy that ACTUALLY del...
3,2014-07-07,19,I'm sure this is *good* for bitcoin
4,2014-12-26,54,"I thought it was a pun on bitcoin, but I suppo..."
...,...,...,...
3760338,2019-03-10,14,I find it interesting how so many got caught u...
3760339,2019-03-22,14,Link solves some of the most persistent and pe...
3760340,2019-03-17,14,&gt;Aaron.M 11:19 AM \n&gt; \n&gt;damn \n&g...
3760341,2019-03-19,14,No because that wouldnt make sense. Roger Ver ...


In [2]:
cache_file = '../cache/posts/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data = pd.concat([data_twitter, data_reddit], axis=0)

    data = data.sort_values('date').reset_index(drop=True)

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data

Unnamed: 0,date,text,interaction
0,2009-01-11,Running bitcoin\r,20012
1,2009-01-21,Looking at ways to add more anonymity to bitco...,1936
2,2009-01-27,Thinking about how to reduce CO2 emissions fro...,1319
3,2009-01-29,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,44
4,2009-02-18,Just wrote: Bitcoin: new open source P2P e-cas...,106
...,...,...,...
5558721,2019-12-31,USDT is a “program” that can run on either Bit...,1
5558722,2019-12-31,^^^^AUTOMOD ***The following is a copy of the...,1
5558723,2019-12-31,"It's honestly scary, and really reminds me of ...",4
5558724,2019-12-31,"I kind of made the same mistake at first, but ...",1


In [3]:
data = data.dropna().reset_index(drop=True)

data.isna().sum()

date           0
text           0
interaction    0
dtype: int64

In [4]:
def verify_min_posts(df, min_posts):
    date_counts = df.groupby('date').size()

    dates_under_min_posts = date_counts[date_counts < min_posts].index

    if len(dates_under_min_posts) == 0:
        return df

    min_date = dates_under_min_posts[-1] + pd.Timedelta(days=1)

    filtered_df = df[df['date'] >= min_date]

    return filtered_df.reset_index(drop=True)

In [5]:
data = verify_min_posts(data, 400).copy()

data

Unnamed: 0,date,text,interaction
0,2013-10-14,The reason they don't make driving illegal is ...,5
1,2013-10-14,i'm proposing several working escrows to drive...,1
2,2013-10-14,I would say this drives miner incentive away u...,1
3,2013-10-14,I didn't say go and buy drugs with your Bitcoi...,2
4,2013-10-14,&gt; The price of the bitcoin is solely govern...,5
...,...,...,...
5389005,2019-12-31,USDT is a “program” that can run on either Bit...,1
5389006,2019-12-31,^^^^AUTOMOD ***The following is a copy of the...,1
5389007,2019-12-31,"It's honestly scary, and really reminds me of ...",4
5389008,2019-12-31,"I kind of made the same mistake at first, but ...",1


In [6]:
def check_dates_have_at_least_n_rows(df, n):
    count_by_date = df.groupby('date').size()

    if not (count_by_date >= n).all():
        dates_with_errors = count_by_date.loc[count_by_date < n]
        print(f'The following dates have less than {n} rows:')
        print(dates_with_errors)
    else:
        print(f'All dates have at least {n} rows.')

In [7]:
check_dates_have_at_least_n_rows(data, 400)

All dates have at least 400 rows.


In [8]:
def group_top_interaction(df, top):
    df_grouped = df.groupby('date')

    dfs_to_concat = []

    for date, group in df_grouped:
        if len(group) >= top:
            df_top = group.nlargest(top, 'interaction')
            dfs_to_concat.append(df_top)

    return dfs_to_concat

In [9]:
data = pd.concat(
    group_top_interaction(data, 400),
).sort_values(
    ['date', 'interaction'],
    ascending=[True, False],
).reset_index(drop=True)

data

Unnamed: 0,date,text,interaction
0,2013-10-14,Exploiting vulnerabilities in existing softwar...,84
1,2013-10-14,"""We need anonymous messaging — anonymous for t...",55
2,2013-10-14,"""We've got a lot of servers here. If we just ...",34
3,2013-10-14,precisely. \n\nNobody gives a fuck if loser n...,33
4,2013-10-14,Both Europe and the US seem determined to driv...,31
...,...,...,...
907995,2019-12-31,I'm going to disagree a little with Justin her...,2
907996,2019-12-31,Which coin would that be\n\nOh wownero \n\nA m...,2
907997,2019-12-31,The lighting network does not meaningfully inc...,2
907998,2019-12-31,There’s nowhere centralized to store what you’...,2


In [10]:
import spacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector


def get_lang_detector(nlp, name):
    return LanguageDetector(seed=1)


nlp_model = spacy.load("en_core_web_sm")

try:
    Language.factory("language_detector", func=get_lang_detector)
except ValueError:
    pass

nlp_model.add_pipe('language_detector', last=True)

print(nlp_model(data.loc[0, 'text'])._.language)


def generate_languages_csv(df, csv_path):
    docs = list(nlp_model.pipe(
        df['text'].tolist(),
        batch_size=1_000,
        n_process=1,
    ))

    languages = [doc._.language['language'] for doc in docs]

    pd.DataFrame(languages).to_csv(
        csv_path,
        index=False,
        mode='a',
        header=False,
    )

{'language': 'en', 'score': 0.9999976375846074}


In [11]:
import re


def process_texts_and_get_english_top_100(df, languages_csv_path):
    df.loc[:, 'text'] = df['text'].apply(lambda t:
        re.sub(
            pattern=r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
            repl='',
            string=t,
            flags=re.MULTILINE,
        )
    )
    df.loc[:, 'text'] = df['text'].apply(lambda t:
        re.sub(
            pattern=r'\$\w+[,]|\@\w+|[,]\@\w+',
            repl='',
            string=t
        )
    )
    df.loc[:, 'text'] = df['text'].apply(lambda t:
        t.replace(
            '#', ''
        ).replace(
            '\n', ''
        ).replace(
            '\r', ''
        ).replace(
            '\b', ''
        ).replace(
            '\t', ''
        ).replace(
            '\\', ' '
        ).replace(
            '/', ' '
        ).replace(
            '|', ' '
        )
    )
    df.loc[:, 'text'] = df['text'].apply(lambda t:
        re.sub(
            pattern=r'  +',
            repl=' ',
            string=t
        )
    )

    steps = 10_000

    for i in range(0, len(df['date']), steps):
        generate_languages_csv(df[i:i+steps], languages_csv_path)

    df.loc[:, 'lang'] = pd.read_csv(
        filepath_or_buffer=languages_csv_path,
        lineterminator='\n',
        header=None,
        names=['lang']
    ).iloc[:, 0]

    df = df.loc[df['lang'] == 'en']

    return pd.concat(
        group_top_interaction(df, 100),
    ).sort_values(
        ['date', 'interaction'],
        ascending=[True, False],
    ).drop(
        'lang',
        axis=1,
    ).reset_index(drop=True)

In [12]:
cache_file = '../cache/posts/english/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data = process_texts_and_get_english_top_100(data, '../datasets/consolidated/languages.csv')

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data

Unnamed: 0,date,text,interaction
0,2013-10-14,Exploiting vulnerabilities in existing softwar...,84
1,2013-10-14,"""We need anonymous messaging — anonymous for t...",55
2,2013-10-14,"""We've got a lot of servers here. If we just s...",34
3,2013-10-14,precisely. Nobody gives a fuck if loser nation...,33
4,2013-10-14,Both Europe and the US seem determined to driv...,31
...,...,...,...
226995,2019-12-31,No sir it was legit IRS agent reminding about ...,9
226996,2019-12-31,"Oh, I just remember 3 years ago bitcoin core c...",9
226997,2019-12-31,"Dude, buy Bitcoin and Apple stock, trust me.",9
226998,2019-12-31,"Likewise, I've been in Bitcoin since 2011. Som...",9


In [13]:
def check_skipped_dates(df):
    groups = df.groupby('date')

    prev_date = None
    prev_index = None

    total = 0

    for date, group in groups:
        if prev_date is not None and (date - prev_date).days > 2:
            total += 1
            days_diff = (date - prev_date).days
            print(f'Indexes {prev_index} ({prev_date}) and {group.index[0]} '\
                  f'({date}) have difference of {days_diff} days.')

        prev_date = date
        prev_index = group.index[-1]

    print(f'Total errors: {total}')

In [14]:
check_skipped_dates(data)

Total errors: 0


In [15]:
def check_dates_have_exactly_n_rows(df, n):
    count_by_date = data.groupby('date').size()

    if not count_by_date.eq(n).all():
        dates_with_errors = count_by_date.loc[~count_by_date.eq(n)]
        print(f'The following dates have a different number of rows than {n}:')
        print(dates_with_errors)
    else:
        print(f'All dates have exactly {n} rows.')

In [16]:
check_dates_have_exactly_n_rows(data, 100)

All dates have exactly 100 rows.


In [17]:
from pysentimiento import create_analyzer

def analyze_sentiments(df):
    analyzer = create_analyzer(task="sentiment", lang="en")

    scores = df['text'].apply(lambda t: analyzer.predict(t).probas)

    return pd.DataFrame(list(scores)).loc[:, ['POS', 'NEU', 'NEG']]


In [18]:
cache_file = '../cache/posts/sentiment/pysentimiento/data.pickle'

sentiments = None

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        sentiments = pickle.load(f)
else:
    sentiments = analyze_sentiments(data).rename(columns={
        'POS': 'positive',
        'NEU': 'neutral',
        'NEG': 'negative',
    })

    with open(cache_file, 'wb') as f:
        pickle.dump(sentiments, f)

sentiments

Unnamed: 0,positive,neutral,negative
0,0.021167,0.544287,0.434546
1,0.081718,0.888788,0.029495
2,0.516477,0.428243,0.055280
3,0.002647,0.044041,0.953312
4,0.895787,0.102738,0.001475
...,...,...,...
226995,0.003085,0.045623,0.951292
226996,0.201755,0.788976,0.009269
226997,0.582603,0.409905,0.007493
226998,0.232485,0.723999,0.043516


In [19]:
cache_file = '../cache/posts/sentiment/data.pickle'

if os.path.isfile(cache_file):
    with open(cache_file, 'rb') as f:
        data = pickle.load(f)
else:
    data = pd.concat([data, sentiments], axis=1)

    with open(cache_file, 'wb') as f:
        pickle.dump(data, f)

data

Unnamed: 0,date,text,interaction,positive,neutral,negative
0,2013-10-14,Exploiting vulnerabilities in existing softwar...,84,0.021167,0.544287,0.434546
1,2013-10-14,"""We need anonymous messaging — anonymous for t...",55,0.081718,0.888788,0.029495
2,2013-10-14,"""We've got a lot of servers here. If we just s...",34,0.516477,0.428243,0.055280
3,2013-10-14,precisely. Nobody gives a fuck if loser nation...,33,0.002647,0.044041,0.953312
4,2013-10-14,Both Europe and the US seem determined to driv...,31,0.895787,0.102738,0.001475
...,...,...,...,...,...,...
226995,2019-12-31,No sir it was legit IRS agent reminding about ...,9,0.003085,0.045623,0.951292
226996,2019-12-31,"Oh, I just remember 3 years ago bitcoin core c...",9,0.201755,0.788976,0.009269
226997,2019-12-31,"Dude, buy Bitcoin and Apple stock, trust me.",9,0.582603,0.409905,0.007493
226998,2019-12-31,"Likewise, I've been in Bitcoin since 2011. Som...",9,0.232485,0.723999,0.043516


In [20]:
def get_average_sentiments_by_date(df):
    df = df.groupby('date')[['positive', 'neutral', 'negative']].mean().reset_index()

    df.columns = ['date', 'positive', 'neutral', 'negative']

    return df

In [21]:
data = get_average_sentiments_by_date(data)

data

Unnamed: 0,date,positive,neutral,negative
0,2013-10-14,0.224461,0.501282,0.274257
1,2013-10-15,0.288634,0.496198,0.215168
2,2013-10-16,0.257223,0.437274,0.305503
3,2013-10-17,0.235050,0.551573,0.213377
4,2013-10-18,0.231190,0.506552,0.262257
...,...,...,...,...
2265,2019-12-27,0.226849,0.446411,0.326740
2266,2019-12-28,0.169157,0.471205,0.359638
2267,2019-12-29,0.197365,0.523340,0.279295
2268,2019-12-30,0.170356,0.481577,0.348067


In [22]:
data.to_csv('../datasets/consolidated/posts.csv', index=False)

In [23]:
cache_file = '../cache/posts/data.pickle'

prev_min_date = data['date'].min()

with open(cache_file, 'rb') as f:
    data = pickle.load(f)

data = data.loc[data['date'] < prev_min_date]

data

Unnamed: 0,date,text,interaction
0,2009-01-11,Running bitcoin\r,20012
1,2009-01-21,Looking at ways to add more anonymity to bitco...,1936
2,2009-01-27,Thinking about how to reduce CO2 emissions fro...,1319
3,2009-01-29,From: Satoshi Nakamoto - 2009-01-11 22:32 Bitc...,44
4,2009-02-18,Just wrote: Bitcoin: new open source P2P e-cas...,106
...,...,...,...
169711,2013-10-13,If anyone is interested in trying an alternati...,1
169712,2013-10-13,The best thing for Bitcoin would be for a coun...,6
169713,2013-10-13,Are you using Bitcoin-qt? If so the encrypt an...,2
169714,2013-10-13,"given that it's a random *economic* professor,...",2


In [24]:
data = data.dropna().reset_index(drop=True)

data.isna().sum()

date           0
text           0
interaction    0
dtype: int64

In [25]:
data = verify_min_posts(data, 108).copy()

data

Unnamed: 0,date,text,interaction
0,2013-01-21,There might be some truth to that if that doll...,1
1,2013-01-21,The upcoming new version of the bitcoin-qt cli...,2
2,2013-01-21,"I agree, inflammatory remarks and fringe attra...",3
3,2013-01-21,Gotcha. It still seems vastly superior to cred...,1
4,2013-01-21,"very safe, has lots of reviews on the official...",1
...,...,...,...
129149,2013-10-13,If anyone is interested in trying an alternati...,1
129150,2013-10-13,The best thing for Bitcoin would be for a coun...,6
129151,2013-10-13,Are you using Bitcoin-qt? If so the encrypt an...,2
129152,2013-10-13,"given that it's a random *economic* professor,...",2


In [26]:
check_dates_have_at_least_n_rows(data, 108)

All dates have at least 108 rows.


In [27]:
check_skipped_dates(data)

Total errors: 0


In [28]:
data = pd.concat(
    group_top_interaction(data, 108),
).sort_values(
    ['date', 'interaction'],
    ascending=[True, False],
).reset_index(drop=True)

data

Unnamed: 0,date,text,interaction
0,2013-01-21,I don't know why he didn't accept Bitcoin from...,37
1,2013-01-21,Still couldn't hurt to make Bitcoin an option.,19
2,2013-01-21,/r/bitcoin is mostly for news related to Bitco...,19
3,2013-01-21,"Honestly I felt this way for a while, mostly o...",13
4,2013-01-21,http://localbitcoins.com - you may have to exp...,10
...,...,...,...
28723,2013-10-13,Really looking forward to seeing this exchange...,4
28724,2013-10-13,"Ideally, the best currency would be backed by ...",4
28725,2013-10-13,Great video about bitcoin that never mentions ...,4
28726,2013-10-13,"Hi mhuzaifa,\n\nI'm a Group Buy Coordinator on...",4


In [29]:
data = process_texts_and_get_english_top_100(data, '../datasets/consolidated/languages_2.csv')

data

Unnamed: 0,date,text,interaction
0,2013-01-21,I don't know why he didn't accept Bitcoin from...,37
1,2013-01-21,Still couldn't hurt to make Bitcoin an option.,19
2,2013-01-21,r bitcoin is mostly for news related to Bitco...,19
3,2013-01-21,"Honestly I felt this way for a while, mostly o...",13
4,2013-01-21,"- you may have to expand your search radius, ...",10
...,...,...,...
26595,2013-10-13,There are 14 CNY exchanges. Look at this inste...,4
26596,2013-10-13,It's not so much that nobody wants him to talk...,4
26597,2013-10-13,To drive bitcoin adoption further into the mai...,4
26598,2013-10-13,Isn't this also the problem though... Just las...,4


In [30]:
check_skipped_dates(data)

Total errors: 0


In [31]:
check_dates_have_exactly_n_rows(data, 100)

All dates have exactly 100 rows.


In [166]:
sentiments = analyze_sentiments(data).rename(columns={
    'POS': 'positive',
    'NEU': 'neutral',
    'NEG': 'negative',
})

sentiments

Unnamed: 0,positive,neutral,negative
0,0.035833,0.885713,0.078454
1,0.728674,0.268735,0.002591
2,0.002426,0.052522,0.945052
3,0.022364,0.456335,0.521301
4,0.277993,0.710762,0.011245
...,...,...,...
26595,0.125455,0.830115,0.044430
26596,0.021547,0.467572,0.510881
26597,0.672569,0.321997,0.005434
26598,0.013567,0.686811,0.299621


In [32]:
data = pd.concat([data, sentiments], axis=1)

data

Unnamed: 0,date,text,interaction,positive,neutral,negative
0,2013-01-21,I don't know why he didn't accept Bitcoin from...,37.0,0.021167,0.544287,0.434546
1,2013-01-21,Still couldn't hurt to make Bitcoin an option.,19.0,0.081718,0.888788,0.029495
2,2013-01-21,r bitcoin is mostly for news related to Bitco...,19.0,0.516477,0.428243,0.055280
3,2013-01-21,"Honestly I felt this way for a while, mostly o...",13.0,0.002647,0.044041,0.953312
4,2013-01-21,"- you may have to expand your search radius, ...",10.0,0.895787,0.102738,0.001475
...,...,...,...,...,...,...
226995,,,,0.003085,0.045623,0.951292
226996,,,,0.201755,0.788976,0.009269
226997,,,,0.582603,0.409905,0.007493
226998,,,,0.232485,0.723999,0.043516


In [33]:
data = get_average_sentiments_by_date(data)

data

Unnamed: 0,date,positive,neutral,negative
0,2013-01-21,0.224461,0.501282,0.274257
1,2013-01-22,0.288634,0.496198,0.215168
2,2013-01-23,0.257223,0.437274,0.305503
3,2013-01-24,0.235050,0.551573,0.213377
4,2013-01-25,0.231190,0.506552,0.262257
...,...,...,...,...
261,2013-10-09,0.278437,0.422556,0.299007
262,2013-10-10,0.208409,0.474589,0.317001
263,2013-10-11,0.219250,0.467341,0.313409
264,2013-10-12,0.275753,0.404974,0.319273


In [34]:
data.to_csv(
    '../datasets/consolidated/posts.csv',
    index=False,
    mode='a',
    header=None,
)

In [35]:
data = pd.read_csv(
    filepath_or_buffer='../datasets/consolidated/posts.csv',
    lineterminator='\n',
)

data['date'] = pd.to_datetime(data['date']).dt.date

data = data.sort_values('date').reset_index(drop=True)

data

Unnamed: 0,date,positive,neutral,negative
0,2013-01-21,0.224461,0.501282,0.274257
1,2013-01-22,0.288634,0.496198,0.215168
2,2013-01-23,0.257223,0.437274,0.305503
3,2013-01-24,0.235050,0.551573,0.213377
4,2013-01-25,0.231190,0.506552,0.262257
...,...,...,...,...
2531,2019-12-27,0.226849,0.446411,0.326740
2532,2019-12-28,0.169157,0.471205,0.359638
2533,2019-12-29,0.197365,0.523340,0.279295
2534,2019-12-30,0.170356,0.481577,0.348067


In [36]:
data.to_csv('../datasets/consolidated/posts.csv', index=False)