In [29]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string
import nltk
import ssl
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import string

import en_core_web_md
text_to_nlp = spacy.load('en_core_web_md')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

INVISIBLE_CHARS = re.compile(r'[\u200B\u200C\u200D\u200E\u200F\uFEFF\u00A0]')
URLS = re.compile(r'https?://\S+|www\.\S+')
EMOJIS = re.compile("["
                    u"\U0001F600-\U0001F64F"
                    u"\U0001F300-\U0001F5FF"
                    u"\U0001F680-\U0001F6FF"
                    u"\U0001F1E0-\U0001F1FF"
                    u"\U00002700-\U000027BF"
                    u"\U000024C2-\U0001F251"
                    "]+", flags=re.UNICODE)

REMOVE_PUNCT_DIGITS = str.maketrans('', '', string.punctuation + string.digits)
CUSTOM_STOPWORDS = {
    '|', '=', '1', '5', '2018', 'usd', 'price', 'exchange',
    '€', '24', 'utc', 'en', 'high', 'low', 'volume'
}
BITCOIN_ALIASES = {'btc', 'bit', 'bitcoin', 'bitcoins', 'bit coin'}
OTHER_CRYPTO = {
    'eth', 'ethereum', 'xrp', 'bch', 'ltc', 'etc', 'ada', 'doge', 'shiba',
    'polkadot', 'dot', 'bnb', 'solana', 'trx', 'eos', 'neo', 'iota', 'monero',
    'dash', 'zec', 'vechain', 'theta', 'stellar', 'xlm', 'avax', 'algo',
    'matic', 'near', 'icp', 'aptos', 'apt', 'kaspa', 'kas'
}

STOP_WORDS = stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Prices


In [2]:
df_SP = pd.read_csv("./Raw/SPX.csv")
df_BP = pd.read_csv("./Raw/BTC-USD.csv")

df_SP['Date'] = pd.to_datetime(df_SP['Date'])
df_BP['Date'] = pd.to_datetime(df_BP['Date'])

df_SP = df_SP.rename(columns={
    'Open': 'Open_S',
    'High': 'High_S',
    'Low': 'Low_S',
    'Close': 'Close_S',
    'Adj Close': 'Adj Close_S',
    'Volume': 'Volume_S'
})

df_BP = df_BP.rename(columns={
    'Open': 'Open_B',
    'High': 'High_B',
    'Low': 'Low_B',
    'Close': 'Close_B',
    'Adj Close': 'Adj Close_B',
    'Volume': 'Volume_B'
})

merged_prices = pd.merge(df_SP, df_BP, on='Date')

file_path = os.path.join('./Processed', 'combined_prices.csv')

merged_prices.to_csv(file_path, index=False)

In [30]:
def clean_text(text):
    if not isinstance(text, str) or not text:
        return ''
    text = unicodedata.normalize('NFKC', text)
    text = INVISIBLE_CHARS.sub('', text)
    text = URLS.sub('', text)
    text = EMOJIS.sub('', text)
    text = text.lower()
    text = text.translate(REMOVE_PUNCT_DIGITS)
    tokens = text.split()

    has_bitcoin = False
    clean_tokens = []
    
    for token in tokens:
        if token in OTHER_CRYPTO or token in CUSTOM_STOPWORDS:
            continue
        
      
        if token in BITCOIN_ALIASES:
            if has_bitcoin:
                continue 
            else:
                clean_tokens.append('bitcoin')
                has_bitcoin = True
        else:
            clean_tokens.append(token)

    return ' '.join(clean_tokens)

def tokenize(text):
    clean_tokens = []
    for token in text_to_nlp(text):
        if (not token.is_stop) and (token.lemma_ != '-PRON-') and (not token.is_punct):
            clean_tokens.append(token.lemma_)
    return clean_tokens

def process_lang_data(text):
  cleaned_text = []
  punctuation = string.punctuation
  our_stopwords = stopwords.words('english')
  lemmatizer = WordNetLemmatizer()

  for token in word_tokenize(text):
    if token not in punctuation and token not in our_stopwords:
      clipped_token = lemmatizer.lemmatize(token)
      cleaned_text.append(clipped_token)

    return cleaned_text

# Sentiment 

### Bitcoin

In [4]:
df_BS = pd.read_csv("./Raw/BitcoinSent.csv")

#clean dataframe
df_BS['Date'] = pd.to_datetime(df_BS['Date'], errors='coerce')
df_BS = df_BS[df_BS['Date'].notna()]
df_BS = df_BS[df_BS['text'].notna()]
df_BS = df_BS[df_BS['text'].apply(lambda x: isinstance(x, str))]
df_BS = df_BS[df_BS['Sentiment'].notna()]
df_BS = df_BS[df_BS['Sentiment'].isin(['Positive', 'Negative'])]
df_BS['Sentiment'] = df_BS['Sentiment'].map({'Positive': 1, 'Negative': 0})

df_BS.rename(columns={'text': 'body'}, inplace=True)

#sort in ascending date
df_BS = df_BS.sort_values(by='Date', ascending=True)
df_BS = df_BS.reset_index(drop=True)

#testing
df_BS.tail

<bound method NDFrame.tail of                Date                                               body  \
0        2014-09-18  #RDD / #BTC on the exchanges:\nCryptsy: 0.0000...   
1        2014-09-18  Current price: 418.77$ $BTCUSD $btc #bitcoin 2...   
2        2014-09-18  1 #BTC (#Bitcoin) quotes:\n$423.60/$424.80 #Bi...   
3        2014-09-18  In the last 10 mins, there were arb opps spann...   
4        2014-09-18  Be judicious, buy your Bitcoins at https://Bit...   
...             ...                                                ...   
18452496 2019-11-23  €400 million investment in Blockchain and AI t...   
18452497 2019-11-23  BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin Outlo...   
18452498 2019-11-23  BTC\n\n長期的目線\n\n現在のトライアングル収束までに要した期間と人々の関心から、\...   
18452499 2019-11-23  SPECIAL DEAL TO ANYONE HAS CASH APP OR BITCOIN...   
18452500 2019-11-23  $BTC - an update on the longer term view for B...   

          Sentiment  
0                 0  
1                 0  
2              

In [31]:
#FOR VISUALIZATION - OTHER FILE
mask = (df_BS['Date'] >= '2018-01-07') & (df_BS['Date'] <= '2018-02-06')

visu_words = df_BS.loc[mask]
visu_words.drop(columns='Sentiment', inplace=True)
visu_words = visu_words.reset_index(drop=True)

visu_words = visu_words.groupby('Date').apply(lambda x: x.sample(n=min(len(x), 800))).reset_index(drop=True)

visu_words['body'] = visu_words['body'].apply(clean_text)
visu_words['body'] = visu_words['body'].apply(tokenize)

visu_words.tail
visu_words.to_pickle('visu_words.pkl')

In [6]:
#sample dataframe
sampled_BS = df_BS.groupby('Date').apply(lambda x: x.sample(n=min(len(x), 5))).reset_index(drop=True)

#clean every text
sampled_BS['body'] = sampled_BS['body'].apply(clean_text)

#apply tokenizer
sampled_BS['body'] = sampled_BS['body'].apply(tokenize)

'''
print(sampled_BS.tail)

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

sampled_BS['language'] = sampled_BS['body'].apply(detect_language)
sampled_BS = sampled_BS[sampled_BS['language'] == 'en'].copy()
sampled_BS.drop(columns=['language'], inplace=True)
sampled_BS = sampled_BS.reset_index(drop=True)

print(sampled_BS.tail)
'''

sampled_BS.tail

<bound method NDFrame.tail of            Date                                               body  Sentiment
0    2014-09-18  [10, min, arb, opp, span, 36, exchange, pair, ...          0
1    2014-09-18  [live, profit, 30950, 314, buy, b2271, 43189, ...          0
2    2014-09-18  [45099, 0500, utc, 24h, range, 44357, 46284, v...          0
3    2014-09-18  [bitcoineurrate, €, 329, €, 3327, buy, €, 329,...          0
4    2014-09-18  [10, min, arb, opp, span, 39, exchange, pair, ...          0
...         ...                                                ...        ...
9455 2019-11-23                        [mnsairdrop, monnosairdrop]          1
9456 2019-11-23  [bitcoin, whale, move, 47000, bitcoin, 338, mi...          1
9457 2019-11-23  [price, 1, ltc, usd, 00, change, 1000, price, ...          1
9458 2019-11-23  [think, agree, vitaliks, comment, mountain, ma...          1
9459 2019-11-23  [15, m, volume, alert, phb, current, volume, 2...          1

[9460 rows x 3 columns]>

In [7]:
#join dates and merge text
test = sampled_BS
test['body_str'] = test['body'].apply(lambda tokens: ' '.join(tokens))

test = test.groupby('Date').agg({
    'body': list,
    'body_str': lambda texts: ' '.join(texts),
    'Sentiment': 'mean'
}).reset_index()


file_path = os.path.join('./Processed', 'btc_nlp_test.csv')
test.to_csv(file_path, index=False)



### Stock

In [8]:
df_SS = pd.read_csv("./Raw/StockSent.csv")

df_SS['post_date'] = pd.to_datetime(df_SS['post_date'], unit='s')
df_SS['post_date'] = df_SS['post_date'].dt.strftime('%Y-%m-%d')
df_SS.rename(columns={'post_date': 'Date'}, inplace=True)

df_SS.drop(columns=['tweet_id', 'writer', 'comment_num', 'retweet_num', 'like_num'], inplace=True)

df_SS.tail

<bound method NDFrame.tail of                Date                                               body
0        2015-01-01  lx21 made $10,008  on $AAPL -Check it out! htt...
1        2015-01-01  Insanity of today weirdo massive selling. $aap...
2        2015-01-01  S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...
3        2015-01-01  $GM $TSLA: Volkswagen Pushes 2014 Record Recal...
4        2015-01-01  Swing Trading: Up To 8.91% Return In 14 Days h...
...             ...                                                ...
3717959  2019-12-31  That $SPY $SPX puuump in the last hour was the...
3717960  2019-12-31  In 2020 I may start Tweeting out positive news...
3717961  2019-12-31  Patiently Waiting for the no twitter sitter tw...
3717962  2019-12-31  I don't discriminate. I own both $aapl and $ms...
3717963  2019-12-31  $AAPL #patent 10,522,475 Vertical interconnect...

[3717964 rows x 2 columns]>

In [9]:
#sample dataframe
sampled_SS = df_SS.groupby('Date').apply(lambda x: x.sample(n=min(len(x), 5))).reset_index(drop=True)

#clean every text
sampled_SS['body'] = sampled_SS['body'].apply(clean_text)

#apply tokenizer
sampled_SS['body'] = sampled_SS['body'].apply(tokenize)

sampled_SS.tail

<bound method NDFrame.tail of             Date                                               body
0     2015-01-01  [memo, elonmusk, not, chestnut, roast, open, f...
1     2015-01-01  [timcook, hope, resolution, increase, buyback,...
2     2015-01-01  [close, year, 176, big, gainer, yhoo, peix, kn...
3     2015-01-01  [jeff, bezos, lose, 74, billion, amazon, bad, ...
4     2015-01-01              [myth, risk, option, cmg, tsla, bidu]
...          ...                                                ...
9125  2019-12-31  [fwiw, high, performance, vehicle, sua, high, ...
9126  2019-12-31  [good, 6, month, s, 1st, 6, month, awfulan, in...
9127  2019-12-31        [warn, not, believe, come, fwed, mouthtsla]
9128  2019-12-31  [ron, baron, legendary, investor, 30b, managem...
9129  2019-12-31  [nio, tsla, snap, spce, jnug, acb, attach, pos...

[9130 rows x 2 columns]>

In [10]:
#join dates and merge text
testS = sampled_SS
testS['body_str'] = testS['body'].apply(lambda tokens: ' '.join(tokens))

testS = testS.groupby('Date').agg({
    'body': list,
    'body_str': lambda texts: ' '.join(texts),
}).reset_index()


file_path = os.path.join('./Processed', 'stock_nlp_test.csv')
testS.to_csv(file_path, index=False)