In [31]:
import os
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string
import nltk
import ssl
import spacy
from sklearn.feature_extraction.text import CountVectorizer

'''
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
'''

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOP_WORDS = stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

In [32]:
print(word_tokenize("hello     hi test. oi. oi"))


['hello', 'hi', 'test', '.', 'oi', '.', 'oi']


# Prices


In [25]:
df_SP = pd.read_csv("./Raw/SPX.csv")
df_BP = pd.read_csv("./Raw/BTC-USD.csv")

df_SP['Date'] = pd.to_datetime(df_SP['Date'])
df_BP['Date'] = pd.to_datetime(df_BP['Date'])

df_SP = df_SP.rename(columns={
    'Open': 'Open_S',
    'High': 'High_S',
    'Low': 'Low_S',
    'Close': 'Close_S',
    'Adj Close': 'Adj Close_S',
    'Volume': 'Volume_S'
})

df_BP = df_BP.rename(columns={
    'Open': 'Open_B',
    'High': 'High_B',
    'Low': 'Low_B',
    'Close': 'Close_B',
    'Adj Close': 'Adj Close_B',
    'Volume': 'Volume_B'
})

merged_prices = pd.merge(df_SP, df_BP, on='Date')

file_path = os.path.join('./Processed', 'combined_prices.csv')

merged_prices.to_csv(file_path, index=False)

# Sentiment 

### Bitcoin

In [26]:
df_BS = pd.read_csv("./Raw/BitcoinSent.csv")

#removed non-date data in the 'Date' header
df_BS['Date'] = pd.to_datetime(df_BS['Date'], errors='coerce')
df_BS = df_BS[df_BS['Date'].notna()]

df_BS.rename(columns={'text': 'body'}, inplace=True)
df_BS.drop(columns=['Sentiment'], inplace=True)

#sort in ascending date
df_BS = df_BS.sort_values(by='Date', ascending=True)
df_BS = df_BS.reset_index(drop=True)

#testing
df_BS.tail

<bound method NDFrame.tail of                Date                                               body
0        2014-09-18  LIVE: Profit = $414.71 (1.31 %). BUY B75.77 @ ...
1        2014-09-18  Current price: 418.77$ $BTCUSD $btc #bitcoin 2...
2        2014-09-18  1 #BTC (#Bitcoin) quotes:\n$423.60/$424.80 #Bi...
3        2014-09-18  In the last 10 mins, there were arb opps spann...
4        2014-09-18  Be judicious, buy your Bitcoins at https://Bit...
...             ...                                                ...
19344014 2019-11-23  €400 million investment in Blockchain and AI t...
19344015 2019-11-23  BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin Outlo...
19344016 2019-11-23  BTC\n\n長期的目線\n\n現在のトライアングル収束までに要した期間と人々の関心から、\...
19344017 2019-11-23  SPECIAL DEAL TO ANYONE HAS CASH APP OR BITCOIN...
19344018 2019-11-23  $BTC - an update on the longer term view for B...

[19344019 rows x 2 columns]>

Sampling

In [33]:
sampled_BS = df_BS.groupby('Date').apply(lambda x: x.sample(n=min(len(x), 10))).reset_index(drop=True)
'''
print(sampled_BS.tail)

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

sampled_BS['language'] = sampled_BS['body'].apply(detect_language)
sampled_BS = sampled_BS[sampled_BS['language'] == 'en'].copy()
sampled_BS.drop(columns=['language'], inplace=True)
sampled_BS = sampled_BS.reset_index(drop=True)

print(sampled_BS.tail)
'''

print(sampled_BS.tail)

<bound method NDFrame.tail of             Date                                               body
0     2014-09-18  One Bitcoin now worth $418.00@bitstamp. High $...
1     2014-09-18  Harga #Bitcoin Per Tanggal 19-09-2014 07:00:06...
2     2014-09-18  Current price: 450.98$ $BTCUSD $btc #bitcoin 2...
3     2014-09-18  2014年9月18日 13:00:09\nbtc_jpy\n直近[last]:50900円\...
4     2014-09-18  LIVE: Profit = $309.50 (3.14 %). BUY B22.71 @ ...
...          ...                                                ...
18925 2019-11-23  Fibre2Fashion - BTC gives start to sustainable...
18926 2019-11-23  安心・安全で確実な月利20％のビットコインレンディングで資産を増やそう！#ビットコイン #B...
18927 2019-11-23  @silverguru22 It’s not needed YET because as o...
18928 2019-11-23  Full Stack #PHP #Developer West Drayton &amp; ...
18929 2019-11-23  let's join soon and get a surprise, do not reg...

[18930 rows x 2 columns]>


In [35]:
test = sampled_BS

token = word_tokenize(test.loc[0,'body'])
token

['One',
 'Bitcoin',
 'now',
 'worth',
 '$',
 '418.00',
 '@',
 'bitstamp',
 '.',
 'High',
 '$',
 '453.94',
 '.',
 'Low',
 '$',
 '407.94',
 '.',
 'Market',
 'Cap',
 '$',
 '5.553',
 'Billion',
 '#',
 'bitcoin']

### Stock

In [29]:
df_SS = pd.read_csv("./Raw/StockSent.csv")

df_SS['post_date'] = pd.to_datetime(df_SS['post_date'], unit='s')
df_SS['post_date'] = df_SS['post_date'].dt.strftime('%Y-%m-%d')

df_SS.rename(columns={'post_date': 'Date'}, inplace=True)
df_SS.drop(columns=['tweet_id', 'writer', 'comment_num', 'retweet_num', 'like_num'], inplace=True)

df_SS.tail

<bound method NDFrame.tail of                Date                                               body
0        2015-01-01  lx21 made $10,008  on $AAPL -Check it out! htt...
1        2015-01-01  Insanity of today weirdo massive selling. $aap...
2        2015-01-01  S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...
3        2015-01-01  $GM $TSLA: Volkswagen Pushes 2014 Record Recal...
4        2015-01-01  Swing Trading: Up To 8.91% Return In 14 Days h...
...             ...                                                ...
3717959  2019-12-31  That $SPY $SPX puuump in the last hour was the...
3717960  2019-12-31  In 2020 I may start Tweeting out positive news...
3717961  2019-12-31  Patiently Waiting for the no twitter sitter tw...
3717962  2019-12-31  I don't discriminate. I own both $aapl and $ms...
3717963  2019-12-31  $AAPL #patent 10,522,475 Vertical interconnect...

[3717964 rows x 2 columns]>