# Datasets in Daily Time Frequency

In [1]:
import pandas as pd
import pickle



### BTC & ETH (Cryptocurrency), Stock, Forex News Sentiment Data (2020 - 2023)

In [2]:
# Load the three raw news-sentiment dumps (crypto, stock, forex) from the data-collection folder.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -- only use it on
# files produced by this project's own collection step.
with open('../Data_Collection/raw_news_sentiment_BTC_ETH.pkl', 'rb') as crypto_handle:
    raw_crypto_sentiment_data = pickle.load(crypto_handle)

with open('../Data_Collection/raw_news_sentiment_all_stock.pkl', 'rb') as stock_handle:
    raw_stock_sentiment_data = pickle.load(stock_handle)

with open('../Data_Collection/raw_news_sentiment_general_fx.pkl', 'rb') as fx_handle:
    raw_fx_sentiment_data = pickle.load(fx_handle)

In [3]:
import pandas as pd

def json_sentiment_data_to_df(pickled_sentiment_data, data_type="crypto"):
    """
    Converts pickled news sentiment data into a structured pandas dataframe for data cleaning and pre-processing.

    Parameters:
    - pickled_sentiment_data: List of dictionaries. Each dictionary must have a 'data' key.
      For data_type == "crypto", 'data' maps date -> {label -> sentiment dict};
      for any other data_type, 'data' maps date -> sentiment dict.
      Entries whose 'data' is empty are skipped.
    - data_type: A string indicating the type of data in 'pickled_sentiment_data' (defaults to "crypto").
      Only the value "crypto" changes the layout (it adds a 'crypto_label' column).

    Returns:
    - A pandas dataframe sorted by 'date' with a fresh 0..n-1 index. It contains a datetime 'date'
      column, a 'crypto_label' column for crypto data, and one column per key of the sentiment
      dictionaries (in order of first appearance). If no entry carries any data, an empty dataframe
      with only a datetime 'date' column is returned.
    """

    frames = []

    for entry in pickled_sentiment_data:
        daily_data = entry['data']
        if len(daily_data) == 0:
            # nothing to flatten for this entry
            continue

        # flatten the nested dictionaries into one record per output row
        if data_type == "crypto":
            records = [
                {'date': date, 'crypto_label': label, **sentiment}
                for date, vals in daily_data.items()
                for label, sentiment in vals.items()
            ]
        else:
            records = [
                {'date': date, **sentiment}
                for date, sentiment in daily_data.items()
            ]

        if not records:
            # e.g. crypto dates that carry no labels at all
            continue

        frame = pd.DataFrame(records)
        frame['date'] = pd.to_datetime(frame['date'])
        frames.append(frame)

    if not frames:
        # Previously this path crashed on `None.sort_values`; return a well-formed empty frame instead.
        return pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]')})

    # Concatenate once (instead of growing the frame inside the loop) and use a stable sort
    # so that rows sharing a date keep their input order deterministically.
    df = pd.concat(frames, ignore_index=True)
    df = df.sort_values('date', kind='stable').reset_index(drop=True)

    return df

In [5]:
# Flatten each raw sentiment dump into a dataframe.
crypto_sentiment_data_df = json_sentiment_data_to_df(raw_crypto_sentiment_data)
stock_sentiment_data_df = json_sentiment_data_to_df(raw_stock_sentiment_data, data_type="stock")
fx_sentiment_data = json_sentiment_data_to_df(raw_fx_sentiment_data, data_type="fx")

# Make sure every 'date' column is of datetime type.
for sentiment_frame in (fx_sentiment_data, stock_sentiment_data_df, crypto_sentiment_data_df):
    sentiment_frame['date'] = pd.to_datetime(sentiment_frame['date'])

# Columns are renamed by position.
# NOTE(review): this assumes the raw sentiment dictionaries list their keys in the order
# neutral, positive, negative, score -- confirm against the collected data.
fx_sentiment_data.columns = ['date', 'forex_Neutral', 'forex_Positive', 'forex_Negative', 'forex_sentiment_score']
stock_sentiment_data_df.columns = ['date', 'stock_Neutral', 'stock_Positive', 'stock_Negative', 'stock_sentiment_score']
crypto_sentiment_data_df.columns = ['date', 'crypto_label', 'crypto_Neutral', 'crypto_Positive', 'crypto_Negative', 'crypto_sentiment_score']

# Split the crypto data per coin; for each coin compute the article total, rename the score
# column and drop the raw count/label columns.
crypto_count_columns = ['crypto_Neutral', 'crypto_Positive', 'crypto_Negative']
per_coin_frames = {}
for coin_prefix, coin_label in (('btc', 'BTC'), ('eth', 'ETH')):
    coin_df = crypto_sentiment_data_df[crypto_sentiment_data_df['crypto_label'] == coin_label].copy()
    coin_df[f'{coin_prefix}_Total'] = (
        coin_df['crypto_Neutral'] + coin_df['crypto_Positive'] + coin_df['crypto_Negative']
    )
    coin_df = coin_df.rename(columns={'crypto_sentiment_score': f'{coin_prefix}_sentiment_score'})
    coin_df = coin_df.drop(columns=crypto_count_columns + ['crypto_label'])
    per_coin_frames[coin_prefix] = coin_df

btc_sentiment_data_df = per_coin_frames['btc']
eth_sentiment_data_df = per_coin_frames['eth']

# Article totals for forex and stock news.
fx_sentiment_data["forex_total"] = (
    fx_sentiment_data['forex_Neutral'] + fx_sentiment_data['forex_Positive'] + fx_sentiment_data['forex_Negative']
)
stock_sentiment_data_df["stock_total"] = (
    stock_sentiment_data_df['stock_Neutral'] + stock_sentiment_data_df['stock_Positive'] + stock_sentiment_data_df['stock_Negative']
)

# Outer-merge everything on date, fill the gaps with 0, order by date and
# drop the raw count columns that are no longer needed.
df_merged = (
    fx_sentiment_data
    .merge(stock_sentiment_data_df, on='date', how='outer')
    .merge(btc_sentiment_data_df, on='date', how='outer')
    .merge(eth_sentiment_data_df, on='date', how='outer')
    .fillna(0)
    .sort_values('date')
    .reset_index(drop=True)
    .drop(["forex_Neutral", "forex_Positive", "forex_Negative",
           "stock_Neutral", "stock_Positive", "stock_Negative"], axis=1)
)

# Persist the daily sentiment table and show it.
df_merged.to_csv('daily_news_sentiments.csv', index=False)
df_merged

Unnamed: 0,date,forex_sentiment_score,forex_total,stock_sentiment_score,stock_total,btc_sentiment_score,btc_Total,eth_sentiment_score,eth_Total
0,2020-10-30,0.000,0.0,0.000,0.0,1.500,1,0.000,0.0
1,2020-11-13,0.000,0.0,0.000,0.0,1.500,1,0.000,0.0
2,2020-11-20,0.000,0.0,0.000,0.0,0.000,1,0.000,0.0
3,2020-11-27,0.000,0.0,0.000,0.0,0.000,1,0.000,0.0
4,2020-12-01,0.000,0.0,0.000,0.0,0.500,3,1.000,3.0
...,...,...,...,...,...,...,...,...,...
906,2023-05-22,-0.083,72.0,0.321,2335.0,0.364,140,0.085,53.0
907,2023-05-23,-0.245,98.0,0.409,2699.0,0.435,100,0.557,35.0
908,2023-05-24,-0.066,91.0,0.290,2640.0,0.101,104,0.349,43.0
909,2023-05-25,-0.311,106.0,0.286,2760.0,0.181,124,-0.276,38.0


In [6]:
# The sentiment table has been written to CSV; remove the name so it can be reused below.
del df_merged

### Glassnode Data + Daily BTC Price Data (2012 - 2023)

In [7]:
import pprint
# Read the daily Glassnode metrics; the unnamed first CSV column is renamed to 'date'
# and parsed to datetime.
glass_node_df = (
    pd.read_csv('../Data_Collection/Glassnode_24h_Data-2012-2023.csv')
    .rename(columns={'Unnamed: 0': 'date'})
)
glass_node_df['date'] = pd.to_datetime(glass_node_df['date'])

In [8]:
# Load the daily BTC price data and drop the redundant unix timestamp column.
daily_btc_df = pd.read_csv('../Data_Collection/daily_btc_data.csv')
daily_btc_df = daily_btc_df.drop(["unix_timestamp"], axis=1)

# Parse the timestamps BEFORE sorting: sorting the raw strings orders them lexicographically,
# which only matches chronological order for zero-padded ISO-style dates.
daily_btc_df['timestamp'] = pd.to_datetime(daily_btc_df['timestamp'])
daily_btc_df = daily_btc_df.sort_values('timestamp').reset_index(drop=True)

In [9]:
# Keep only the days present in both sources, then replace missing values with 0.
df_merged = (
    glass_node_df
    .merge(daily_btc_df, left_on='date', right_on='timestamp', how='inner')
    .fillna(0)
)

In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Normalising: min-max scale every column except the two date columns.
# Each fitted scaler is kept so the scaling can be undone later.
scalers = {}

columns_to_scale = [name for name in df_merged.columns if name not in ('date', 'timestamp')]
for name in columns_to_scale:
    column_scaler = MinMaxScaler()
    df_merged[name] = column_scaler.fit_transform(df_merged[[name]])
    scalers[name] = column_scaler  # remember the scaler fitted on this column

In [12]:
from statsmodels.tsa.stattools import grangercausalitytests, adfuller  
import numpy as np
def get_grangers_causation(data, y, maxlag=10, test='ssr_chi2test', verbose=False):
    """
    Computes the Granger's causality test between all numeric columns of the data and a specified target column.

    Every numeric column that fails the stationarity check is first-differenced once before testing.
    For each predictor the test is run for lags 1..maxlag and the smallest (rounded) p-value over
    those lags is reported.

    Parameters:
    - data (pd.DataFrame): The input dataframe containing all the series. It is not modified.
    - y (str): The target column name; must be one of the numeric columns of `data`.
    - maxlag (int): The maximum lag to be considered for the Granger's causality test.
    - test (str): The statistical test for Granger's causality (default is 'ssr_chi2test').
    - verbose (bool): Whether to print details during processing.

    Returns:
    - pd.DataFrame: A dataframe indexed by predictor column name with a single 'p_value' column
      holding the minimum p-value (rounded to 4 decimals) across the tested lags.

    Raises:
    - ValueError: If `y` is not a numeric column of `data`.
    """
    # Work on a private copy of the numeric columns so the caller's frame is never touched.
    data = data.select_dtypes(include=[np.number]).copy()

    # Fail fast with a clear message instead of a shape-mismatch error after all the ADF tests ran.
    if y not in data.columns:
        raise ValueError(f"Target column {y!r} is not a numeric column of the data.")

    # Looping through each column in the data
    for col in data.columns:
        # Checking if the series is stationary using Augmented Dickey-Fuller test
        if not check_stationarity(data[col]):

            if verbose: print(f"{col} is not stationary. Differencing...")

            # First-difference the series. The leading NaN this introduces is removed
            # pair-wise by the dropna() right before each Granger test below.
            data[col] = data[col].diff()

    # New df to hold the p-values from the Granger causality tests (one row per predictor)
    predictors = data.columns.difference([y])
    df = pd.DataFrame({'p_value': 0.0}, index=predictors)

    # Looping through each predictor column
    for x in predictors:
        # Granger causality test on the current pair of series (target first, predictor second)
        # NOTE(review): newer statsmodels releases may warn that `verbose` is deprecated -- confirm
        # against the installed version before removing the argument.
        test_result = grangercausalitytests(data[[y, x]].dropna(), maxlag=maxlag, verbose=False)

        p_values = [round(test_result[lag][0][test][1], 4) for lag in range(1, maxlag + 1)]
        if verbose: print(f'Y = {y}, X = {x}, P Values = {p_values}')
        df.loc[x, 'p_value'] = np.min(p_values)

    return df

def check_stationarity(series):
    """
    Decide whether a series is stationary according to the Augmented Dickey-Fuller test.

    Parameters:
    - series (pd.Series): The time series data.

    Returns:
    - bool: True when the ADF p-value is at most 0.05 (treated as stationary), False otherwise.
    """
    # The p-value is the second element of the tuple returned by adfuller.
    adf_p_value = adfuller(series)[1]
    return adf_p_value <= 0.05

# Test every numeric column against the BTC closing price, with lags of up to 7 days.
grangers_result = get_grangers_causation(
    df_merged,
    'close',
    maxlag=7,
    test='ssr_chi2test',
    verbose=True,
)


Hash Rate is not stationary. Differencing...


Active Addresses is not stationary. Differencing...
Supply Last Active 1+ Years Ago is not stationary. Differencing...
Transaction Size (Total) is not stationary. Differencing...


Transaction Count is not stationary. Differencing...
Transaction Rate is not stationary. Differencing...


Stock-to-Flow Deflection is not stationary. Differencing...




Net Unrealized Profit/Loss (NUPL) is not stationary. Differencing...
Exchange Balance (Percent) is not stationary. Differencing...




Exchange Withdrawals is not stationary. Differencing...
Total Addresses is not stationary. Differencing...


Difficulty is not stationary. Differencing...


Total Addresses (USDT) is not stationary. Differencing...
Transfer Volume (USDT) is not stationary. Differencing...
Market Cap is not stationary. Differencing...


Delta Cap is not stationary. Differencing...
Realized Cap is not stationary. Differencing...


Investor Capitalization is not stationary. Differencing...
Balanced Price is not stationary. Differencing...
Realized Price is not stationary. Differencing...


Relative Unrealized Profit is not stationary. Differencing...


Supply Last Active 3y-5y is not stationary. Differencing...
Supply last active 5y-7y is not stationary. Differencing...


Market Cap (USDT) is not stationary. Differencing...
Exchange Inflow Volume (Total) (USDT) is not stationary. Differencing...
Exchange Outflow Volume (Total) (USDT) is not stationary. Differencing...


Circulating Supply is not stationary. Differencing...


Price (ETH) is not stationary. Differencing...
Price (LTC) is not stationary. Differencing...
Price is not stationary. Differencing...




HODL Waves 1y_2y is not stationary. Differencing...


HODL Waves 2y_3y is not stationary. Differencing...


HODL Waves 3y_5y is not stationary. Differencing...
HODL Waves 5y_7y is not stationary. Differencing...


HODL Waves 7y_10y is not stationary. Differencing...
HODL Waves more_10y is not stationary. Differencing...




Realized Cap HODL Waves 1y_2y is not stationary. Differencing...


Realized Cap HODL Waves 2y_3y is not stationary. Differencing...
Realized Cap HODL Waves 3y_5y is not stationary. Differencing...


Realized Cap HODL Waves 5y_7y is not stationary. Differencing...


Realized Cap HODL Waves 7y_10y is not stationary. Differencing...


Stock-to-Flow Ratio price is not stationary. Differencing...
Stock-to-Flow Ratio ratio is not stationary. Differencing...
close is not stationary. Differencing...


high is not stationary. Differencing...
low is not stationary. Differencing...


open is not stationary. Differencing...
Y = close, X = Active Addresses, P Values = [0.6724, 0.8876, 0.6922, 0.1758, 0.2267, 0.0471, 0.0028]
Y = close, X = Balanced Price, P Values = [0.3576, 0.1229, 0.1736, 0.109, 0.1823, 0.0884, 0.2923]




Y = close, X = Block Height, P Values = [0.7182, 0.5903, 0.6445, 0.6809, 0.7838, 0.5241, 0.6388]
Y = close, X = Circulating Supply, P Values = [0.5997, 0.8657, 0.8825, 0.1444, 0.0546, 0.0336, 0.0851]
Y = close, X = Delta Cap, P Values = [0.3125, 0.1236, 0.2905, 0.3045, 0.47, 0.331, 0.6377]
Y = close, X = Difficulty, P Values = [0.0474, 0.1115, 0.1847, 0.0364, 0.059, 0.0013, 0.0]
Y = close, X = Exchange Balance (Percent), P Values = [0.6145, 0.1841, 0.2522, 0.4014, 0.5108, 0.6345, 0.3812]
Y = close, X = Exchange Deposits, P Values = [0.8188, 0.9739, 0.9485, 0.8867, 0.269, 0.3048, 0.1559]




Y = close, X = Exchange Inflow Volume (Total), P Values = [0.3844, 0.3056, 0.3994, 0.5674, 0.5334, 0.653, 0.6996]
Y = close, X = Exchange Inflow Volume (Total) (USDT), P Values = [0.0002, 0.0011, 0.0011, 0.0016, 0.0, 0.0001, 0.0002]
Y = close, X = Exchange Net Position Change, P Values = [0.4395, 0.7365, 0.5273, 0.3706, 0.3183, 0.115, 0.1318]
Y = close, X = Exchange Outflow Volume (Total), P Values = [0.2857, 0.4783, 0.6673, 0.8129, 0.823, 0.8922, 0.471]
Y = close, X = Exchange Outflow Volume (Total) (USDT), P Values = [0.0006, 0.002, 0.0021, 0.0045, 0.0003, 0.0009, 0.0016]
Y = close, X = Exchange Withdrawals, P Values = [0.094, 0.1047, 0.2115, 0.3262, 0.1528, 0.0215, 0.0109]
Y = close, X = HODL Waves 1d_1w, P Values = [0.6431, 0.8828, 0.6844, 0.7987, 0.8827, 0.7951, 0.7369]




Y = close, X = HODL Waves 1m_3m, P Values = [0.5278, 0.641, 0.5175, 0.6732, 0.7877, 0.871, 0.8688]
Y = close, X = HODL Waves 1w_1m, P Values = [0.5976, 0.466, 0.5836, 0.4564, 0.6045, 0.7195, 0.7073]
Y = close, X = HODL Waves 1y_2y, P Values = [0.5485, 0.8345, 0.8981, 0.9595, 0.9872, 0.9664, 0.9795]
Y = close, X = HODL Waves 24h, P Values = [0.6361, 0.6269, 0.6923, 0.5453, 0.5786, 0.6708, 0.733]
Y = close, X = HODL Waves 2y_3y, P Values = [0.7677, 0.7681, 0.5641, 0.6537, 0.6623, 0.7851, 0.8134]
Y = close, X = HODL Waves 3m_6m, P Values = [0.427, 0.2364, 0.3346, 0.3224, 0.4515, 0.4785, 0.3978]




Y = close, X = HODL Waves 3y_5y, P Values = [0.3063, 0.1549, 0.2715, 0.4007, 0.1774, 0.2021, 0.247]
Y = close, X = HODL Waves 5y_7y, P Values = [0.7091, 0.4614, 0.6675, 0.8013, 0.7929, 0.8712, 0.8751]
Y = close, X = HODL Waves 6m_12m, P Values = [0.3597, 0.6551, 0.81, 0.3541, 0.4982, 0.5066, 0.6097]
Y = close, X = HODL Waves 7y_10y, P Values = [0.9314, 0.6984, 0.7465, 0.8643, 0.7336, 0.8149, 0.7516]
Y = close, X = HODL Waves more_10y, P Values = [0.9342, 0.5441, 0.3256, 0.097, 0.1673, 0.1432, 0.228]
Y = close, X = Hash Rate, P Values = [0.066, 0.0213, 0.0433, 0.061, 0.1029, 0.0745, 0.0851]
Y = close, X = Investor Capitalization, P Values = [0.3931, 0.166, 0.3259, 0.3193, 0.4736, 0.3218, 0.6539]




Y = close, X = MVRV Ratio, P Values = [0.3996, 0.6843, 0.8958, 0.9705, 0.9864, 0.9875, 0.9916]
Y = close, X = MVRV Z-Score, P Values = [0.3054, 0.5053, 0.7492, 0.8906, 0.9396, 0.9289, 0.9507]
Y = close, X = Market Cap, P Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Y = close, X = Market Cap (USDT), P Values = [0.6001, 0.8496, 0.8705, 0.1059, 0.0507, 0.0509, 0.1526]
Y = close, X = Net Unrealized Profit/Loss (NUPL), P Values = [0.6861, 0.7678, 0.8349, 0.9343, 0.9726, 0.9877, 0.9875]
Y = close, X = Price, P Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


Y = close, X = Price (ETH), P Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Y = close, X = Price (LTC), P Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Y = close, X = Price Drawdown from ATH, P Values = [0.4718, 0.7429, 0.824, 0.934, 0.9758, 0.9889, 0.9946]
Y = close, X = Realized Cap, P Values = [0.4179, 0.1838, 0.3451, 0.331, 0.482, 0.3222, 0.6641]
Y = close, X = Realized Cap HODL Waves 1d_1w, P Values = [0.4999, 0.7723, 0.5941, 0.731, 0.8015, 0.6353, 0.4257]
Y = close, X = Realized Cap HODL Waves 1m_3m, P Values = [0.8481, 0.8526, 0.3094, 0.4352, 0.5866, 0.6643, 0.6859]
Y = close, X = Realized Cap HODL Waves 1w_1m, P Values = [0.5619, 0.3491, 0.4314, 0.1746, 0.1797, 0.2634, 0.1725]




Y = close, X = Realized Cap HODL Waves 1y_2y, P Values = [0.2298, 0.4721, 0.6582, 0.8471, 0.9382, 0.9656, 0.9714]
Y = close, X = Realized Cap HODL Waves 24h, P Values = [0.8108, 0.5321, 0.6387, 0.386, 0.4885, 0.5053, 0.5564]
Y = close, X = Realized Cap HODL Waves 2y_3y, P Values = [0.772, 0.7916, 0.9226, 0.8797, 0.9476, 0.9782, 0.9862]
Y = close, X = Realized Cap HODL Waves 3m_6m, P Values = [0.6744, 0.5764, 0.2438, 0.2447, 0.3661, 0.3155, 0.2661]
Y = close, X = Realized Cap HODL Waves 3y_5y, P Values = [0.9928, 0.9158, 0.9666, 0.9861, 0.9497, 0.9815, 0.9629]
Y = close, X = Realized Cap HODL Waves 5y_7y, P Values = [0.6454, 0.8643, 0.9276, 0.9664, 0.9891, 0.8678, 0.7524]
Y = close, X = Realized Cap HODL Waves 6m_12m, P Values = [0.3372, 0.6171, 0.6699, 0.4809, 0.5984, 0.4789, 0.58]




Y = close, X = Realized Cap HODL Waves 7y_10y, P Values = [0.9539, 0.6529, 0.2616, 0.0097, 0.0001, 0.0, 0.0001]
Y = close, X = Realized Cap HODL Waves more_10y, P Values = [0.8283, 0.7208, 0.6428, 0.5668, 0.5735, 0.6006, 0.6968]
Y = close, X = Realized Price, P Values = [0.3983, 0.1863, 0.3591, 0.3317, 0.4904, 0.3408, 0.6775]
Y = close, X = Relative Unrealized Profit, P Values = [0.7014, 0.928, 0.9708, 0.9755, 0.9753, 0.9917, 0.9939]
Y = close, X = SOPR, P Values = [0.3794, 0.5876, 0.8499, 0.9167, 0.979, 0.9935, 0.9955]
Y = close, X = Stock-to-Flow Deflection, P Values = [0.8166, 0.9704, 0.9898, 0.9965, 0.9975, 0.9992, 0.9998]
Y = close, X = Stock-to-Flow Ratio daysTillHalving, P Values = [0.278, 0.5505, 0.731, 0.8433, 0.9021, 0.9571, 0.9751]




Y = close, X = Stock-to-Flow Ratio price, P Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Y = close, X = Stock-to-Flow Ratio ratio, P Values = [0.1248, 0.0494, 0.1119, 0.1562, 0.008, 0.0, 0.0]
Y = close, X = Supply Last Active 1+ Years Ago, P Values = [0.3274, 0.6203, 0.3108, 0.4557, 0.5929, 0.4945, 0.5101]
Y = close, X = Supply Last Active 3+ Years Ago, P Values = [0.7546, 0.4807, 0.4721, 0.6165, 0.671, 0.3279, 0.3448]
Y = close, X = Supply Last Active 3y-5y, P Values = [0.2536, 0.0934, 0.176, 0.2748, 0.0896, 0.1018, 0.1267]
Y = close, X = Supply in Smart Contracts, P Values = [0.701, 0.7968, 0.8651, 0.8924, 0.8195, 0.884, 0.9443]
Y = close, X = Supply last active 5y-7y, P Values = [0.6939, 0.4131, 0.6084, 0.7483, 0.7264, 0.8173, 0.8276]




Y = close, X = Total Addresses, P Values = [0.2297, 0.4402, 0.6541, 0.8208, 0.3802, 0.4924, 0.2724]
Y = close, X = Total Addresses (USDT), P Values = [0.2297, 0.4402, 0.6541, 0.8208, 0.3802, 0.4924, 0.2724]
Y = close, X = Transaction Count, P Values = [0.4673, 0.729, 0.7219, 0.4045, 0.5492, 0.2483, 0.0788]
Y = close, X = Transaction Rate, P Values = [0.4673, 0.729, 0.7219, 0.4045, 0.5492, 0.2483, 0.0788]
Y = close, X = Transaction Size (Total), P Values = [0.5547, 0.8002, 0.3373, 0.2857, 0.3918, 0.0507, 0.0127]
Y = close, X = Transfer Volume (USDT), P Values = [0.0102, 0.0333, 0.0571, 0.1013, 0.0022, 0.0037, 0.0054]
Y = close, X = aSOPR, P Values = [0.1502, 0.272, 0.5771, 0.6458, 0.8034, 0.9064, 0.8748]




Y = close, X = high, P Values = [0.6744, 0.2515, 0.8439, 0.2471, 0.0017, 0.0049, 0.0]
Y = close, X = low, P Values = [0.0099, 0.0082, 0.0141, 0.0002, 0.0001, 0.0, 0.0]
Y = close, X = open, P Values = [0.812, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Y = close, X = volume, P Values = [0.3047, 0.584, 0.745, 0.8255, 0.9071, 0.9581, 0.8783]




In [13]:
# Undo the min-max scaling: map every scaled column back to its original units
# using the scaler that was fitted on it.
scaled_columns = [name for name in df_merged.columns if name not in ('date', 'timestamp')]
for name in scaled_columns:
    fitted_scaler = scalers[name]
    df_merged[name] = fitted_scaler.inverse_transform(df_merged[[name]])

In [14]:
# Collect the columns whose minimum Granger p-value is 0.05 or more, plus the
# duplicate 'timestamp' date column, and drop them all from the merged frame.
not_significant = grangers_result['p_value'] >= 0.05
cols_to_remove = grangers_result.loc[not_significant].index.tolist() + ["timestamp"]
df_merged = df_merged.drop(columns=cols_to_remove)

In [15]:
# Display the frame that remains after dropping the non-significant columns.
df_merged

Unnamed: 0,date,Hash Rate,Active Addresses,Transaction Size (Total),Exchange Withdrawals,Difficulty,Transfer Volume (USDT),Market Cap,Exchange Inflow Volume (Total) (USDT),Exchange Outflow Volume (Total) (USDT),...,Price (ETH),Price (LTC),Price,Realized Cap HODL Waves 7y_10y,Stock-to-Flow Ratio price,Stock-to-Flow Ratio ratio,close,high,low,open
0,2012-01-01,8.644451e+12,11236.0,2570570.0,2.0,4.981859e+15,0.000000e+00,4.227734e+07,0.000000e+00,0.000000e+00,...,0.000000,0.000000,5.278399,0.000000,5.278399,2.812318,5.00,5.00,4.58,4.58
1,2012-01-02,8.449615e+12,11711.0,2847973.0,2.0,4.981859e+15,0.000000e+00,4.182319e+07,0.000000e+00,0.000000e+00,...,0.000000,0.000000,5.216780,0.000000,5.216780,2.826331,5.00,5.00,5.00,5.00
2,2012-01-03,9.035402e+12,12892.0,2852636.0,4.0,4.981859e+15,0.000000e+00,3.918951e+07,0.000000e+00,0.000000e+00,...,0.000000,0.000000,4.883428,0.000000,4.883428,2.836625,5.29,5.32,5.14,5.32
3,2012-01-04,8.340603e+12,12313.0,2844382.0,3.0,4.981859e+15,0.000000e+00,4.488123e+07,0.000000e+00,0.000000e+00,...,0.000000,0.000000,5.587490,0.000000,5.587490,2.845461,5.57,5.57,4.93,4.93
4,2012-01-05,7.964820e+12,15079.0,3584138.0,5.0,4.981859e+15,0.000000e+00,5.614132e+07,0.000000e+00,0.000000e+00,...,0.000000,0.000000,6.982925,0.000000,6.982925,2.856593,6.65,6.65,5.72,5.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4132,2023-04-25,2.984295e+20,904996.0,199589484.0,57722.0,2.092182e+23,3.922787e+09,5.484817e+11,6.383354e+08,5.514699e+08,...,1868.337764,91.257827,28336.037970,0.001282,28336.037970,111539.990292,28300.00,28392.00,27197.00,27515.00
4133,2023-04-26,3.146757e+20,891781.0,208924858.0,62472.0,2.092182e+23,4.821793e+09,5.493166e+11,7.531167e+08,8.919745e+08,...,1866.207079,88.176122,28377.978077,0.001278,28377.978077,111626.748654,28428.00,30022.00,27242.00,28306.00
4134,2023-04-27,3.205615e+20,895642.0,202641495.0,58381.0,2.092182e+23,4.427074e+09,5.705654e+11,7.230365e+08,6.770063e+08,...,1909.578131,89.669738,29474.465532,0.001273,29474.465532,111924.641377,29485.00,29887.00,28389.00,28427.00
4135,2023-04-28,3.561016e+20,952849.0,236417172.0,59703.0,2.092182e+23,3.448769e+09,5.683250e+11,5.613031e+08,6.475488e+08,...,1894.035120,89.832748,29357.334314,0.001272,29357.334314,111837.853682,29333.00,29600.00,28922.00,29486.00


In [16]:
# Write the filtered frame to CSV without the positional index.
df_merged.to_csv('glassnode_data.csv', index=False)


### Data From Yahoo Finance: Indices & Commodities (2012 - 2023)

In [17]:
import pandas as pd
# Read the Yahoo Finance indices/commodities CSV and display it.
yf_data_df = pd.read_csv('../Data_Collection/indices_commodoties_2012-2023.csv')
yf_data_df

Unnamed: 0,Date,000001.SS,^AXJO,^DJI,^FTSE,^GDAXI,^GSPC,^IXIC,^N225,^VIX,CL=F,GC=F,NG=F,SI=F,ZW=F
0,2012-01-02,,,,,6075.520020,,,,,,,,,
1,2012-01-03,,4101.200195,12397.379883,5699.899902,6166.569824,1277.060059,2648.719971,,22.969999,102.959999,1599.699951,2.993,29.533001,657.00
2,2012-01-04,2169.389893,4187.799805,12418.419922,5668.500000,6111.549805,1277.300049,2648.360107,8560.110352,22.219999,103.220001,1611.900024,3.096,29.063000,650.00
3,2012-01-05,2148.451904,4142.700195,12415.700195,5624.299805,6095.990234,1281.060059,2669.860107,8488.709961,21.480000,101.809998,1619.400024,2.980,29.264999,629.25
4,2012-01-06,2163.395020,4108.500000,12359.919922,5649.700195,6057.919922,1277.810059,2674.219971,8390.349609,20.629999,101.559998,1616.099976,3.062,28.653000,624.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,2023-05-25,3201.260010,7138.200195,32764.650391,7570.899902,15793.799805,4151.279785,12698.089844,30801.130859,19.139999,71.830002,1943.099976,2.307,22.785999,604.25
2966,2023-05-26,3212.500000,7154.799805,33093.339844,7627.200195,15983.969727,4205.450195,12975.690430,30916.310547,17.950001,72.669998,1944.099976,2.181,23.240000,616.00
2967,2023-05-29,3221.449951,7217.399902,,,15952.730469,,,31233.539062,,,,,,
2968,2023-05-30,3224.209961,7209.299805,33042.781250,7522.100098,15908.910156,4205.520020,13017.429688,31328.160156,17.459999,69.459999,1958.000000,2.327,23.125000,591.00


In [18]:
from sklearn.impute import KNNImputer

# Normalise the date column name, then separate the value columns from it.
yf_data_df = yf_data_df.rename(columns={'Date': 'date'})
cols = [name for name in yf_data_df.columns if name != 'date']
yf_data_sub = yf_data_df[cols]

# Fill the missing values of each row from its 10 nearest neighbouring rows.
imputer = KNNImputer(n_neighbors=10)
imputed_values = imputer.fit_transform(yf_data_sub)

# fit_transform returns a numpy array; wrap it back into a DataFrame with the original labels.
df_imputed = pd.DataFrame(imputed_values, columns=yf_data_sub.columns, index=yf_data_sub.index)

# Re-attach the 'date' column in front of the imputed values and display the result.
date_column = yf_data_df[['date']]
yf_data_df_imputed = pd.concat([date_column, df_imputed], axis=1)
yf_data_df_imputed

Unnamed: 0,date,000001.SS,^AXJO,^DJI,^FTSE,^GDAXI,^GSPC,^IXIC,^N225,^VIX,CL=F,GC=F,NG=F,SI=F,ZW=F
0,2012-01-02,2241.369165,4081.059985,12436.353125,5499.900049,6075.520020,1299.851013,2764.617017,8546.461035,21.723000,90.273000,1607.820007,2.7151,28.512000,648.925
1,2012-01-03,2229.307471,4101.200195,12397.379883,5699.899902,6166.569824,1277.060059,2648.719971,8462.751953,22.969999,102.959999,1599.699951,2.9930,29.533001,657.000
2,2012-01-04,2169.389893,4187.799805,12418.419922,5668.500000,6111.549805,1277.300049,2648.360107,8560.110352,22.219999,103.220001,1611.900024,3.0960,29.063000,650.000
3,2012-01-05,2148.451904,4142.700195,12415.700195,5624.299805,6095.990234,1281.060059,2669.860107,8488.709961,21.480000,101.809998,1619.400024,2.9800,29.264999,629.250
4,2012-01-06,2163.395020,4108.500000,12359.919922,5649.700195,6057.919922,1277.810059,2674.219971,8390.349609,20.629999,101.559998,1616.099976,3.0620,28.653000,624.750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,2023-05-25,3201.260010,7138.200195,32764.650391,7570.899902,15793.799805,4151.279785,12698.089844,30801.130859,19.139999,71.830002,1943.099976,2.3070,22.785999,604.250
2966,2023-05-26,3212.500000,7154.799805,33093.339844,7627.200195,15983.969727,4205.450195,12975.690430,30916.310547,17.950001,72.669998,1944.099976,2.1810,23.240000,616.000
2967,2023-05-29,3221.449951,7217.399902,33820.185352,7621.860010,15952.730469,4264.580981,13184.376074,31233.539062,17.761000,72.430000,1944.940002,2.7429,23.961000,648.350
2968,2023-05-30,3224.209961,7209.299805,33042.781250,7522.100098,15908.910156,4205.520020,13017.429688,31328.160156,17.459999,69.459999,1958.000000,2.3270,23.125000,591.000


### Federal Reserve Economic Data (FRED): Rates (2012 - 2023)

In [19]:
# Read the combined FRED rates file (combined_usa_rates_2012-2023.csv), discard rows
# containing missing values and show the last three rows.
fred_data_df = pd.read_csv('../Data_Collection/combined_usa_rates_2012-2023.csv').dropna()
fred_data_df.tail(3)

Unnamed: 0,DATE,DFF,CPIAUCSL
4136,2023-04-29,4.83,302.918
4137,2023-04-30,4.83,302.918
4138,2023-05-01,4.83,303.294


In [20]:
# Align the FRED date column name with the Yahoo Finance frame, then keep only the
# dates present in both tables.
fred_data_df = fred_data_df.rename(columns={'DATE': 'date'})
merged_df = pd.merge(yf_data_df_imputed, fred_data_df, on='date', how='inner')
merged_df

Unnamed: 0,date,000001.SS,^AXJO,^DJI,^FTSE,^GDAXI,^GSPC,^IXIC,^N225,^VIX,CL=F,GC=F,NG=F,SI=F,ZW=F,DFF,CPIAUCSL
0,2012-01-02,2241.369165,4081.059985,12436.353125,5499.900049,6075.520020,1299.851013,2764.617017,8546.461035,21.723000,90.273000,1607.820007,2.7151,28.512000,648.925,0.04,227.842
1,2012-01-03,2229.307471,4101.200195,12397.379883,5699.899902,6166.569824,1277.060059,2648.719971,8462.751953,22.969999,102.959999,1599.699951,2.9930,29.533001,657.000,0.07,227.842
2,2012-01-04,2169.389893,4187.799805,12418.419922,5668.500000,6111.549805,1277.300049,2648.360107,8560.110352,22.219999,103.220001,1611.900024,3.0960,29.063000,650.000,0.07,227.842
3,2012-01-05,2148.451904,4142.700195,12415.700195,5624.299805,6095.990234,1281.060059,2669.860107,8488.709961,21.480000,101.809998,1619.400024,2.9800,29.264999,629.250,0.07,227.842
4,2012-01-06,2163.395020,4108.500000,12359.919922,5649.700195,6057.919922,1277.810059,2674.219971,8390.349609,20.629999,101.559998,1616.099976,3.0620,28.653000,624.750,0.07,227.842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2943,2023-04-25,3264.870117,7292.430029,33530.828125,7891.100098,15872.129883,4071.629883,11799.160156,28620.070312,18.760000,77.070000,1994.000000,2.3070,24.877001,638.750,4.83,302.918
2944,2023-04-26,3264.100098,7316.299805,33301.871094,7852.600098,15795.730469,4055.989990,11854.349609,28416.470703,18.840000,74.300003,1985.699951,2.1170,24.871000,627.250,4.83,302.918
2945,2023-04-27,3285.879883,7292.700195,33826.160156,7831.600098,15800.450195,4135.350098,12142.240234,28457.679688,17.030001,74.760002,1989.900024,2.3550,24.983000,614.750,4.83,302.918
2946,2023-04-28,3323.270020,7309.200195,34098.160156,7870.600098,15922.379883,4169.479980,12226.580078,28856.439453,15.780000,76.779999,1990.099976,2.4100,24.999001,619.750,4.83,302.918


In [21]:
# Count the missing values per column of the merged frame.
merged_df.isna().sum()

date         0
000001.SS    0
^AXJO        0
^DJI         0
^FTSE        0
^GDAXI       0
^GSPC        0
^IXIC        0
^N225        0
^VIX         0
CL=F         0
GC=F         0
NG=F         0
SI=F         0
ZW=F         0
DFF          0
CPIAUCSL     0
dtype: int64

In [22]:
# Write the merged indices/commodities/rates frame to CSV without the positional index.
merged_df.to_csv('indices_rates_commodoties.csv', index=False)
