In [83]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import string
import nltk
import spacy
#from sklearn.feature_extraction.text import CountVectorizer
import string

import en_core_web_md
text_to_nlp = spacy.load('en_core_web_md')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# --- Text-cleaning constants -------------------------------------------------

# Zero-width / directional marks, the BOM and the no-break space.
INVISIBLE_CHARS = re.compile(r'[\u200B\u200C\u200D\u200E\u200F\uFEFF\u00A0]')

# http(s) links and bare "www." links, up to the next whitespace.
URLS = re.compile(r'https?://\S+|www\.\S+')

# Emoji and pictographic symbols only.
# Deliberately NO catch-all range across the BMP: a span such as
# U+24C2-U+1F251 also contains CJK, Hangul and many other scripts, so it would
# silently delete real text, while still missing the newer emoji blocks above
# U+1F900.
EMOJIS = re.compile("["
                    "\U0001F000-\U0001FAFF"  # all supplementary-plane symbol/emoji blocks (tiles, flags, emoticons, pictographs, transport, extended-A)
                    "\U000025A0-\U000025FF"  # geometric shapes
                    "\U00002600-\U000026FF"  # miscellaneous symbols
                    "\U00002700-\U000027BF"  # dingbats
                    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                    "\U000024C2"             # circled M
                    "\uFE0F"                 # variation selector-16 (emoji presentation)
                    "]+", flags=re.UNICODE)

# Translation table that deletes (does not replace with a space) every ASCII
# punctuation character and digit.
REMOVE_PUNCT_DIGITS = str.maketrans('', '', string.punctuation + string.digits)

# Tokens dropped by clean_text.
# NOTE(review): the punctuation/digit entries ('|', '=', '1', '5', '2018',
# '24') cannot survive REMOVE_PUNCT_DIGITS, so they only matter to code that
# checks tokens before that table is applied.
CUSTOM_STOPWORDS = {
    '|', '=', '1', '5', '2018', 'usd', 'price', 'exchange',
    '€', '24', 'utc', 'en', 'high', 'low', 'volume'
}

# Spellings that are collapsed to the single token 'bitcoin'.
# Note 'bit coin' spans two tokens, so a token-by-token membership test will
# not hit it; it needs phrase-level matching.
BITCOIN_ALIASES = {'btc', 'bit', 'bitcoin', 'bitcoins', 'bit coin'}

# Names/tickers of other cryptocurrencies, removed from the text.
OTHER_CRYPTO = {
    'eth', 'ethereum', 'xrp', 'bch', 'ltc', 'etc', 'ada', 'doge', 'shiba',
    'polkadot', 'dot', 'bnb', 'solana', 'trx', 'eos', 'neo', 'iota', 'monero',
    'dash', 'zec', 'vechain', 'theta', 'stellar', 'xlm', 'avax', 'algo',
    'matic', 'near', 'icp', 'aptos', 'apt', 'kaspa', 'kas'
}

# Blanket-ignore every warning category from here on.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [84]:
def clean_text(text):
    """Normalise a raw post into a space-separated string of lowercase tokens.

    Steps, in order: NFKC normalisation; removal of invisible characters,
    URLs and emoji; lower-casing; deletion of ASCII punctuation and digits
    (deleted, not replaced by spaces, so 'btc/usd' becomes 'btcusd');
    whitespace tokenisation; token filtering.

    Token filtering:
      * tokens in OTHER_CRYPTO or CUSTOM_STOPWORDS are dropped;
      * any Bitcoin alias (single- or multi-word, e.g. 'btc' or 'bit coin')
        is rewritten to 'bitcoin', and only the first occurrence is kept;
      * everything else is kept unchanged.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not a non-empty ``str`` yields ''.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (possibly '').
    """
    if not isinstance(text, str) or not text:
        return ''

    text = unicodedata.normalize('NFKC', text)
    for pattern in (INVISIBLE_CHARS, URLS, EMOJIS):
        text = pattern.sub('', text)
    tokens = text.lower().translate(REMOVE_PUNCT_DIGITS).split()

    # Aliases such as 'bit coin' span several whitespace tokens, so a plain
    # per-token membership test can never match them. Pre-split them and try
    # the longest phrase first.
    phrases = sorted(
        (tuple(alias.split()) for alias in BITCOIN_ALIASES if len(alias.split()) > 1),
        key=len,
        reverse=True,
    )

    clean_tokens = []
    has_bitcoin = False
    i = 0
    while i < len(tokens):
        phrase = next(
            (p for p in phrases if tuple(tokens[i:i + len(p)]) == p),
            None,
        )
        if phrase is not None:
            # Consume the whole multi-word alias as one Bitcoin mention.
            i += len(phrase)
            is_bitcoin = True
        else:
            token = tokens[i]
            i += 1
            if token in OTHER_CRYPTO or token in CUSTOM_STOPWORDS:
                continue
            is_bitcoin = token in BITCOIN_ALIASES

        if not is_bitcoin:
            clean_tokens.append(token)
        elif not has_bitcoin:
            # Keep only the first Bitcoin mention, in canonical spelling.
            clean_tokens.append('bitcoin')
            has_bitcoin = True

    return ' '.join(clean_tokens)

def tokenize(text):
    """Return the lemmas of the tokens in ``text`` that are worth keeping.

    ``text`` is run through the module-level spaCy pipeline ``text_to_nlp``.
    A token is discarded when it is a stop word, when it is punctuation, or
    when its lemma is the literal string '-PRON-'; the lemma of every other
    token is returned, in document order.
    """
    def _is_noise(tok):
        # Same three exclusion criteria, expressed as a single disjunction.
        return tok.is_stop or tok.lemma_ == '-PRON-' or tok.is_punct

    return [tok.lemma_ for tok in text_to_nlp(text) if not _is_noise(tok)]

# Prices


In [85]:
# Load the raw S&P 500 and Bitcoin price tables.
df_SP2 = pd.read_csv("./Raw/SP500Price.csv")
df_BP2 = pd.read_csv("./Raw/BitcoinPrice.csv")

# Volume and percentage change are not used downstream. Reassign instead of
# mutating in place so the cell reads as a plain pipeline.
df_BP2 = df_BP2.drop(columns=['Vol.', 'Change %'])

# Suffix the OHLC columns so both assets can live in one frame after the merge.
df_SP2 = df_SP2.rename(columns={
    'Open': 'Open_S',
    'Close/Last': 'Close_S',
    'High': 'High_S',
    'Low': 'Low_S',
})

df_BP2 = df_BP2.rename(columns={
    'Price': 'Close_B',
    'Open': 'Open_B',
    'High': 'High_B',
    'Low': 'Low_B',
})

# Strip ',' (presumably a thousands separator) before casting to float.
# The .str accessor only exists on string columns: if read_csv already parsed
# a column as numeric (no separators present) the replace step is skipped
# rather than raising AttributeError.
for col in ['Close_B', 'Open_B', 'High_B', 'Low_B']:
    if not pd.api.types.is_numeric_dtype(df_BP2[col]):
        df_BP2[col] = df_BP2[col].str.replace(',', '', regex=False)
    df_BP2[col] = df_BP2[col].astype(float)

df_SP2['Date'] = pd.to_datetime(df_SP2['Date'])
df_BP2['Date'] = pd.to_datetime(df_BP2['Date'])

# Last date (inclusive) kept in the merged price table.
LAST_PRICE_DATE = '2023-06-22'

# Inner join: only dates present in both tables survive.
merged_prices = (
    pd.merge(df_SP2, df_BP2, on='Date')
    .sort_values('Date')
    .loc[lambda prices: prices['Date'] <= LAST_PRICE_DATE]
)

# Sentiment 

### Bitcoin

In [86]:
# Numeric score assigned to each accepted sentiment label.
SENTIMENT_SCORES = {'Positive': 1, 'Neutral': 0.5, 'Negative': 0}

df_BS = pd.read_csv("./Raw/BitcoinSent.csv")
df_BS['date'] = pd.to_datetime(df_BS['date'], errors='coerce')

# Keep rows with a parseable date, a string text and a recognised label.
# (The isinstance / isin checks already reject missing values, so separate
# notna() filters are not needed.)
is_valid = (
    df_BS['date'].notna()
    & df_BS['text'].apply(lambda x: isinstance(x, str))
    & df_BS['sentiment_label'].isin(list(SENTIMENT_SCORES))
)
# .copy() so the column assignment below writes to an independent frame
# instead of a filtered view of the raw data.
df_BS = df_BS.loc[is_valid].copy()
df_BS['sentiment_label'] = df_BS['sentiment_label'].map(SENTIMENT_SCORES)

df_BS = df_BS.sort_values(by='date', ascending=True)

# Mean sentiment score per calendar day.
avg_sent = df_BS.groupby(df_BS['date'].dt.date)['sentiment_label'].mean().reset_index()
avg_sent = avg_sent.rename(columns={'date': 'Date'})
avg_sent['Date'] = pd.to_datetime(avg_sent['Date'])

# Left join: every price row is kept; days without posts get NaN sentiment.
prices_sent = pd.merge(merged_prices, avg_sent, on='Date', how='left')

# Make sure the output directory exists before writing.
os.makedirs('./Processed', exist_ok=True)
file_path = os.path.join('./Processed', 'combined_prices.csv')
prices_sent.to_csv(file_path, index=False)

In [87]:

# Disabled code for building a visualization sample, kept as a bare
# triple-quoted string instead of comments. Nothing inside it is executed;
# because the string is the cell's only expression, the notebook echoes it
# back as the cell output.
# NOTE(review): if this is ever re-enabled, `visu_words.tail` is written
# without parentheses, so it would not call the method.
'''
#FOR VISUALIZATION - OTHER FILE

visu_words = df_BS
visu_words = visu_words.reset_index(drop=True)

visu_words = visu_words.groupby('date').apply(lambda x: x.sample(n=min(len(x), 3))).reset_index(drop=True)

visu_words['text'] = visu_words['text'].apply(clean_text)
visu_words['text'] = visu_words['text'].apply(tokenize)

visu_words.tail
visu_words.to_pickle('visu_words.pkl')
'''


"\n#FOR VISUALIZATION - OTHER FILE\n\nvisu_words = df_BS\nvisu_words = visu_words.reset_index(drop=True)\n\nvisu_words = visu_words.groupby('date').apply(lambda x: x.sample(n=min(len(x), 3))).reset_index(drop=True)\n\nvisu_words['text'] = visu_words['text'].apply(clean_text)\nvisu_words['text'] = visu_words['text'].apply(tokenize)\n\nvisu_words.tail\nvisu_words.to_pickle('visu_words.pkl')\n"

In [88]:
# Build a small, reproducible NLP sample: at most MAX_POSTS_PER_DAY posts per
# calendar day, cleaned, lemmatised and then aggregated to one row per day.
SAMPLE_SEED = 42
MAX_POSTS_PER_DAY = 5

# Group on the calendar day rather than the raw timestamp, matching the
# per-day grouping (`.dt.date`) used for the average sentiment. If `date`
# carries no time-of-day component this is a no-op.
# NOTE(review): assumes a per-day sample is the intent - confirm.
posts_by_day = df_BS.assign(date=df_BS['date'].dt.normalize())

# Shuffle once with a fixed seed, then keep the first MAX_POSTS_PER_DAY rows
# of each day. This is a uniform sample without replacement per day, copes
# with days that have fewer posts, and keeps the `date` column without
# relying on how groupby.apply treats grouping columns.
sampled_BS = (
    posts_by_day
    .sample(frac=1, random_state=SAMPLE_SEED)
    .groupby('date')
    .head(MAX_POSTS_PER_DAY)
    .sort_values('date', kind='mergesort')  # stable: keeps the shuffled order within a day
    .reset_index(drop=True)
)

# Clean the raw text, then turn it into a list of lemmas.
sampled_BS['text'] = sampled_BS['text'].apply(clean_text).apply(tokenize)

# Space-joined version of each post's lemma list.
sampled_BS['body_str'] = sampled_BS['text'].apply(lambda tokens: ' '.join(tokens))

# One row per day: list of token lists, concatenated text, mean sentiment.
test = sampled_BS.groupby('date').agg({
    'text': list,
    'body_str': lambda texts: ' '.join(texts),
    'sentiment_label': 'mean'
}).reset_index()

# Make sure the output directory exists before writing.
os.makedirs('./Processed', exist_ok=True)
file_path = os.path.join('./Processed', 'btc_nlp_test.csv')
test.to_csv(file_path, index=False)