In [115]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string
import nltk
import ssl
import spacy
from sklearn.feature_extraction.text import CountVectorizer

import en_core_web_md
text_to_nlp = spacy.load('en_core_web_md')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

invisible_chars_pattern = re.compile(
    '[\u200B\u200C\u200D\u200E\u200F\uFEFF\u00A0]'
)

STOP_WORDS = stopwords.words('english')

from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Prices


In [116]:
df_SP = pd.read_csv("./Raw/SPX.csv")
df_BP = pd.read_csv("./Raw/BTC-USD.csv")

df_SP['Date'] = pd.to_datetime(df_SP['Date'])
df_BP['Date'] = pd.to_datetime(df_BP['Date'])

df_SP = df_SP.rename(columns={
    'Open': 'Open_S',
    'High': 'High_S',
    'Low': 'Low_S',
    'Close': 'Close_S',
    'Adj Close': 'Adj Close_S',
    'Volume': 'Volume_S'
})

df_BP = df_BP.rename(columns={
    'Open': 'Open_B',
    'High': 'High_B',
    'Low': 'Low_B',
    'Close': 'Close_B',
    'Adj Close': 'Adj Close_B',
    'Volume': 'Volume_B'
})

merged_prices = pd.merge(df_SP, df_BP, on='Date')

file_path = os.path.join('./Processed', 'combined_prices.csv')

merged_prices.to_csv(file_path, index=False)

In [117]:
def clean_text(text):
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize('NFKC', text)
    text = invisible_chars_pattern.sub('', text)
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize(text):
    clean_tokens = []
    for token in text_to_nlp(text):
        if (not token.is_stop) and (token.lemma_ != '-PRON-') and (not token.is_punct):
            clean_tokens.append(token.lemma_)
    return clean_tokens

def process_lang_data(text):
  cleaned_text = []
  punctuation = string.punctuation
  our_stopwords = stopwords.words('english')
  lemmatizer = WordNetLemmatizer()

  for token in word_tokenize(text):
    if token not in punctuation and token not in our_stopwords:
      clipped_token = lemmatizer.lemmatize(token)
      cleaned_text.append(clipped_token)

  return cleaned_text

# Sentiment 

### Bitcoin

In [118]:
df_BS = pd.read_csv("./Raw/BitcoinSent.csv")

#clean dataframe
df_BS['Date'] = pd.to_datetime(df_BS['Date'], errors='coerce')
df_BS = df_BS[df_BS['Date'].notna()]
df_BS = df_BS[df_BS['text'].notna()]
df_BS = df_BS[df_BS['text'].apply(lambda x: isinstance(x, str))]
df_BS = df_BS[df_BS['Sentiment'].notna()]
df_BS = df_BS[df_BS['Sentiment'].isin(['Positive', 'Negative'])]
df_BS['Sentiment'] = df_BS['Sentiment'].map({'Positive': 1, 'Negative': 0})

df_BS.rename(columns={'text': 'body'}, inplace=True)

#sort in ascending date
df_BS = df_BS.sort_values(by='Date', ascending=True)
df_BS = df_BS.reset_index(drop=True)

#testing
df_BS.tail

<bound method NDFrame.tail of                Date                                               body  \
0        2014-09-18  #RDD / #BTC on the exchanges:\nCryptsy: 0.0000...   
1        2014-09-18  Current price: 418.77$ $BTCUSD $btc #bitcoin 2...   
2        2014-09-18  1 #BTC (#Bitcoin) quotes:\n$423.60/$424.80 #Bi...   
3        2014-09-18  In the last 10 mins, there were arb opps spann...   
4        2014-09-18  Be judicious, buy your Bitcoins at https://Bit...   
...             ...                                                ...   
18452496 2019-11-23  €400 million investment in Blockchain and AI t...   
18452497 2019-11-23  BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin Outlo...   
18452498 2019-11-23  BTC\n\n長期的目線\n\n現在のトライアングル収束までに要した期間と人々の関心から、\...   
18452499 2019-11-23  SPECIAL DEAL TO ANYONE HAS CASH APP OR BITCOIN...   
18452500 2019-11-23  $BTC - an update on the longer term view for B...   

          Sentiment  
0                 0  
1                 0  
2              

In [119]:
#sample dataframe
sampled_BS = df_BS.groupby('Date').apply(lambda x: x.sample(n=min(len(x), 4))).reset_index(drop=True)

#clean every text
sampled_BS['body'] = sampled_BS['body'].apply(clean_text)

#apply tokenizer
sampled_BS['body'] = sampled_BS['body'].apply(tokenize)

'''
print(sampled_BS.tail)

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

sampled_BS['language'] = sampled_BS['body'].apply(detect_language)
sampled_BS = sampled_BS[sampled_BS['language'] == 'en'].copy()
sampled_BS.drop(columns=['language'], inplace=True)
sampled_BS = sampled_BS.reset_index(drop=True)

print(sampled_BS.tail)
'''

sampled_BS.tail

<bound method NDFrame.tail of            Date                                               body  Sentiment
0    2014-09-18  [current, price, 447.92, $, $, BTCUSD, $, btc,...          0
1    2014-09-18  [Bitcoin, trade, @bleutrade, $, 450.00, @btcec...          0
2    2014-09-18  [2014年9月18日, 18:00:09, btc_jpy, 直近[last]:49720...          0
3    2014-09-18  [rdd, BTC, exchange, Cryptsy, 0.00000014, Bitt...          0
4    2014-09-19  [current, price, 307.21, €, $, BTCEUR, $, btc,...          0
...         ...                                                ...        ...
7563 2019-11-22  [BTC, 6000$〜7000, $, BTCにおいて、6000$、7000$は重要な意味...          0
7564 2019-11-23  [@extstock, want, receive, free, 100, BTB, cry...          1
7565 2019-11-23  [Bitcoin, Dives, month, Low, China, Crackdown,...          1
7566 2019-11-23  [discover, easily, maximise, trade, enter, 📈, ...          1
7567 2019-11-23  [whale, dump, $, btc, need, buy, hope, btc, 3,...          1

[7568 rows x 3 columns]>

In [120]:
#join dates and merge text
test = sampled_BS
test['body_str'] = test['body'].apply(lambda tokens: ' '.join(tokens))

test = test.groupby('Date').agg({
    'body': list,
    'body_str': lambda texts: ' '.join(texts),
    'Sentiment': 'mean'
}).reset_index()


file_path = os.path.join('./Processed', 'btc_nlp_test.csv')
test.to_csv(file_path, index=False)



### Stock

In [121]:
df_SS = pd.read_csv("./Raw/StockSent.csv")

df_SS['post_date'] = pd.to_datetime(df_SS['post_date'], unit='s')
df_SS['post_date'] = df_SS['post_date'].dt.strftime('%Y-%m-%d')

df_SS.rename(columns={'post_date': 'Date'}, inplace=True)
df_SS.drop(columns=['tweet_id', 'writer', 'comment_num', 'retweet_num', 'like_num'], inplace=True)

df_SS.tail

<bound method NDFrame.tail of                Date                                               body
0        2015-01-01  lx21 made $10,008  on $AAPL -Check it out! htt...
1        2015-01-01  Insanity of today weirdo massive selling. $aap...
2        2015-01-01  S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...
3        2015-01-01  $GM $TSLA: Volkswagen Pushes 2014 Record Recal...
4        2015-01-01  Swing Trading: Up To 8.91% Return In 14 Days h...
...             ...                                                ...
3717959  2019-12-31  That $SPY $SPX puuump in the last hour was the...
3717960  2019-12-31  In 2020 I may start Tweeting out positive news...
3717961  2019-12-31  Patiently Waiting for the no twitter sitter tw...
3717962  2019-12-31  I don't discriminate. I own both $aapl and $ms...
3717963  2019-12-31  $AAPL #patent 10,522,475 Vertical interconnect...

[3717964 rows x 2 columns]>