In [None]:
!pip install import_ipynb
import import_ipynb

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import statsmodels.api as sm
import matplotlib.ticker as ticker
import model
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from datetime import datetime
from statsmodels.tsa.seasonal import STL
from scipy import signal
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import fftpack
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split
from sklearn import neighbors, linear_model, svm, tree, ensemble
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from collections import Counter
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

In [None]:
def result_show(res_df):
    """Turn nested model scores into one stacked results table.

    Parameters
    ----------
    res_df : sequence
        One entry per experiment. Each entry is an iterable of sub-lists whose
        items are ``(train, test)`` pairs; after flattening there must be
        exactly four pairs, in the order lgb, xgb, rf, knn.

    Returns
    -------
    pandas.DataFrame
        For every experiment a ``train`` row and a ``test`` row, with one
        column per model, stacked vertically in input order.
    """
    model_names = ['lgb', 'xgb', 'rf', 'knn']
    stacked = []

    for experiment in res_df:
        # flatten [[pair], [pair], ...] into a plain list of [train, test] rows
        rows = []
        for per_model in experiment:
            for pair in per_model:
                rows.append(list(pair))

        table = pd.DataFrame(rows, columns=['train', 'test'], index=model_names)
        # transpose so that the models become columns
        stacked.append(table.T)

    return pd.concat(stacked)

In [None]:
def fit_model(x_train, y_train, x_test, y_test, k=5):
    """Fit LightGBM, XGBoost, random-forest and k-NN regressors on one split.

    Each estimator is wrapped in ``model.Model_Fit`` and handed to the matching
    ``model.*_model`` helper together with the train/test data.

    Parameters
    ----------
    x_train, y_train, x_test, y_test
        Train/test features and targets, forwarded unchanged to the helpers.
    k : int, default 5
        Forwarded unchanged to every ``model.*_model`` call
        (presumably the number of cross-validation folds; confirm in ``model``).

    Returns
    -------
    tuple of four one-element lists
        ``(lgb_matrix, xgb_matrix, rf_matrix, knn_matrix)``; each list holds the
        value returned by the corresponding ``model.*_model`` helper.
    """
    # build all four wrappers first, then evaluate them in the same order
    lgb_wrapper = model.Model_Fit(clf=lgb.LGBMRegressor(random_state=42))
    xgb_wrapper = model.Model_Fit(clf=XGBRegressor(objective="reg:squarederror", random_state=42))
    rf_wrapper = model.Model_Fit(clf=ensemble.RandomForestRegressor(random_state=42))
    knn_wrapper = model.Model_Fit(clf=neighbors.KNeighborsRegressor())

    split = (x_train, y_train, x_test, y_test)

    lgb_matrix = [model.lgb_model(*split, lgb_wrapper, k)]
    xgb_matrix = [model.xgb_model(*split, xgb_wrapper, k)]
    rf_matrix = [model.rf_model(*split, rf_wrapper, k)]
    knn_matrix = [model.knn_model(*split, knn_wrapper, k)]

    return lgb_matrix, xgb_matrix, rf_matrix, knn_matrix

In [None]:
def bitcoin_models(cols, df, data_option, look_back=10):
    """Train the four regressors on bitcoin features and tabulate their scores.

    Parameters
    ----------
    cols : iterable of str
        Candidate column names of ``df``. ``'price_usd_num'`` (the target) and
        ``'ethereum_price_usd_num'`` are always removed from the feature set.
    df : pandas.DataFrame
        Source data. Both branches split without shuffling, so the rows are
        assumed to be in chronological order (TODO confirm with the caller).
    data_option : str
        ``'lookback'`` predicts each target from the ``look_back`` preceding
        rows of features (flattened into one vector). Any other value predicts
        each target from the features of the same row.
    look_back : int, default 10
        Window length used by the ``'lookback'`` branch.

    Returns
    -------
    pandas.DataFrame
        Train/test scores per model, as formatted by ``result_show``.

    Raises
    ------
    ValueError
        In the ``'lookback'`` branch, if ``look_back`` is smaller than 1 or
        ``df`` has too few rows to build a single window.
    """
    excluded = {'price_usd_num', 'ethereum_price_usd_num'}
    # Keep the caller's column order (de-duplicated). Building the list from a
    # set difference made the feature order depend on string hashing, which
    # varies between kernel sessions and made the results irreproducible.
    x_features = [name for name in dict.fromkeys(cols) if name not in excluded]

    y = df['price_usd_num']
    X = df[x_features]

    if data_option == 'lookback':
        if look_back < 1:
            raise ValueError("look_back must be at least 1, got {}".format(look_back))
        if len(X) <= look_back:
            raise ValueError(
                "need more than {} rows to build look-back windows, got {}".format(look_back, len(X)))

        # window j holds rows [j - look_back, j) and is paired with target j
        windows = np.array([X.iloc[end - look_back:end].values
                            for end in range(look_back, len(X))])
        targets = y.iloc[look_back:].values

        # flatten every (look_back, n_features) window into a single row
        flat_windows = windows.reshape(windows.shape[0], -1)

        # chronological 80/20 split
        split_at = int(flat_windows.shape[0] * 0.8)
        x_train, x_test = flat_windows[:split_at], flat_windows[split_at:]
        y_train, y_test = targets[:split_at], targets[split_at:]
    else:
        # shuffle=False keeps the split chronological
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False)

    # fit the scaler on the training part only so no test information leaks in
    scaler = MinMaxScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    matrices = fit_model(x_train_scaled, y_train, x_test_scaled, y_test)

    return result_show([list(matrices)])

# 1 Data Preprocessing 

In [None]:
# Load the data set, keep the rows from 2016-01-01 (UTC) onwards and index
# them by their timestamp column.
df = (
    pd.read_csv('./data.csv', index_col=0)
    .loc[lambda frame: frame['timestamp'] >= '2016-01-01 00:00:00+00:00']
    .set_index('timestamp', drop=True)
)
# reduce every timestamp string to its calendar date ("YYYY-MM-DD")
df.index = [
    datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S%z").strftime("%Y-%m-%d")
    for stamp in df.index
]
df

# 2 Exploratory Data Analysis 

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for every numeric column.
df.describe()

In [None]:
# Pairwise relationships between all columns of df.
sns.pairplot(df)

In [None]:
# Annotated heat map of the pairwise correlations between all columns
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# data visualisation: one panel per column, log(feature) on the left axis and
# log(bitcoin price) on the right axis
# NOTE: later cells rely on the DatetimeIndex set here.
df.index = pd.to_datetime(df.index)
features = list(df.columns)
# squeeze=False keeps `axs` two-dimensional even when df has a single column,
# so the indexing below works for any number of features
fig, axs = plt.subplots(nrows=len(features), ncols=1,
                        figsize=(10, 4*len(features)), squeeze=False)

# the price curve is the same on every panel, so take its log only once
log_price = np.log(df['price_usd_num'])

# loop through each feature and chart it
for i, feature in enumerate(features):
    feature_ax = axs[i, 0]

    color = 'tab:blue'
    feature_ax.set_xlabel('Time')
    # the curve is log-transformed, so say so on the axis
    feature_ax.set_ylabel('log({})'.format(feature), color=color)
    feature_ax.plot(df.index, np.log(df[feature]), color=color)
    feature_ax.tick_params(axis='y', labelcolor=color)

    ax2 = feature_ax.twinx()
    # plot the bitcoin price
    color = 'tab:red'
    ax2.set_ylabel('log(Price)', color=color)
    ax2.plot(df.index, log_price, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # one tick per year with the same label format on both axes; configured
    # after both curves are drawn, primary axis first and twin axis last
    for axis in (feature_ax, ax2):
        axis.xaxis.set_major_locator(mdates.YearLocator(month=1, day=1))
        axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-01-01'))

    feature_ax.set_title(
        'Bitcoin {} and Price over Time'.format(feature), fontsize=14)

plt.tight_layout()
plt.show()

# 3 Bitcoin Trends and Seasonal Exploration and Super Seasonal Detection
## 3.1 Bitcoin Trends and Seasonal Exploration

In [None]:
# Part 1: Visualize Bitcoin trends and seasonality
# Additive decomposition of the bitcoin price with a 365-observation period
btc_decomposed = seasonal_decompose(df['price_usd_num'], model='additive', period=365)

fig, axs = plt.subplots(4, 1, figsize=(14,8))

# one panel per component; the name doubles as legend label and panel title
panels = [
    (btc_decomposed.observed, 'Original'),
    (btc_decomposed.trend, 'Trend'),
    (btc_decomposed.seasonal, 'Seasonality'),
    (btc_decomposed.resid, 'Residuals'),
]
for panel_ax, (component, component_name) in zip(axs, panels):
    panel_ax.plot(component, label=component_name)
    panel_ax.legend(loc='upper left')
    panel_ax.set_title(component_name)

# put one major tick per year on the x axis of every panel
for ax in axs:
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-01-01'))


plt.tight_layout()
plt.show()

## 3.2 Bitcoin Super Seasonal Detection

In [None]:
# First pass: STL decomposition of the bitcoin price (period of 365 observations)
stl = STL(df['price_usd_num'], period=365)
result = stl.fit()

# keep the part the first pass could not explain
resid = result.resid

# Second pass: decompose that remainder again with the same period
stl_resid = STL(resid, period=365)
result_resid = stl_resid.fit()

# A seasonal component of the remainder that is not numerically zero is taken
# as a hint of super-seasonality (the printed message says so in Chinese).
# NOTE(review): atol=1e-10 is an absolute tolerance; on a price series this
# check presumably always fires. Confirm whether a scale-aware threshold was intended.
remainder_seasonal_is_zero = np.allclose(result_resid.seasonal, 0, atol=1e-10)
if not remainder_seasonal_is_zero:
    print("可能存在超季节性")

# top panel: first-pass residuals; bottom panel: their seasonal component
fig, (ax1, ax2) = plt.subplots(2, figsize=(8,6))
for panel, series, heading in (
        (ax1, resid, 'Residuals'),
        (ax2, result_resid.seasonal, 'Seasonality in Residuals')):
    panel.plot(series)
    panel.set_title(heading)

plt.tight_layout()
plt.show()

# 4 Causal analysis and visualization
## 4.1 Relational Analysis and Visualization of hash-rate and price 

In [None]:
selected_hashrate_price_columns = ['mean_hash_rate', 'price_usd_num']
hashrate_price_data = df[selected_hashrate_price_columns]

correlation = hashrate_price_data.corr()
print("Bitcoin Hash Rate vs Price Correlation Matrix:")
print(correlation)

# conduct Granger Causality Test
# grangercausalitytests checks whether the SECOND column Granger-causes the
# FIRST one, so the price has to come first to test "hash rate -> price".
# The previous column order tested the opposite direction; p-values quoted in
# the write-up below were produced with that order and need to be refreshed.
model_gct_hp = sm.tsa.stattools.grangercausalitytests(
    hashrate_price_data[['price_usd_num', 'mean_hash_rate']], maxlag=1)

# print the results of the causation test
# each ssr_chi2test entry starts with (chi2 statistic, p-value, ...)
print("Granger Causality Test Results:")
print("H0: mean_hash_rate does NOT Granger-cause price_usd_num")
for lag, outcome in model_gct_hp.items():
    chi2_stat, p_value = outcome[0]['ssr_chi2test'][:2]
    print("Lag:", lag)
    print("ssr chi2 statistic:", chi2_stat)
    print("p-value:", p_value)
    print("-------------------------------------------")
    
plt.scatter(hashrate_price_data['mean_hash_rate'], hashrate_price_data['price_usd_num'])
plt.xlabel('Hash Rate')
plt.ylabel('Price')
plt.title('Bitcoin Hash Rate vs Bitcoin Price')
plt.show()

**Correlation analysis:** The correlation coefficient matrix shows that the correlation coefficient between bitcoin hash rate and price is 0.658008, indicating a moderate positive correlation between them. This means that bitcoin's hash rate and price move in tandem to some extent, but there is not a strong linear relationship.

**Causality analysis:** The results of the Granger causality test show that, at lag order 1, the p-values of the F-test and of the ssr-based chi-square test are both 1.0000, so the null hypothesis of no Granger causality cannot be rejected. In other words, at this lag order there is no statistically significant evidence that the bitcoin hash rate has a causal effect on the price.

**Conclusion:** In summary, according to the analysis results, it can be concluded that there is a certain positive correlation between bitcoin hash rate and price, but it cannot be determined that bitcoin hash rate has a causal effect on price. This means that changes in Bitcoin's hash rate may be related to price fluctuations in some way, but it cannot be relied on alone to predict price changes.

## 4.2 Relational Analysis and Visualization of Active Addresses and Bitcoin Price

In [None]:
selected_active_price_columns = ['active_addr_num', 'price_usd_num']
active_price_data = df[selected_active_price_columns]

correlation_ap = active_price_data.corr()
print("active_addr_num vs Price Correlation Matrix:")
print(correlation_ap)

# grangercausalitytests checks whether the SECOND column Granger-causes the
# FIRST one, so the price has to come first to test "active addresses -> price".
# The previous column order tested the opposite direction; p-values quoted in
# the write-up below were produced with that order and need to be refreshed.
model_gct_ap = sm.tsa.stattools.grangercausalitytests(
    active_price_data[['price_usd_num', 'active_addr_num']], maxlag=1)

# each ssr_chi2test entry starts with (chi2 statistic, p-value, ...)
print("Granger Causality Test Results:")
print("H0: active_addr_num does NOT Granger-cause price_usd_num")
for lag, outcome in model_gct_ap.items():
    chi2_stat, p_value = outcome[0]['ssr_chi2test'][:2]
    print("Lag:", lag)
    print("ssr chi2 statistic:", chi2_stat)
    print("p-value:", p_value)
    print("-------------------------------------------")
    
plt.scatter(active_price_data['active_addr_num'], active_price_data['price_usd_num'])
plt.xlabel('Active Address')
plt.ylabel('Price ')
plt.title('Active Address vs Price')
plt.show()

**Correlation analysis:** The correlation coefficient matrix shows that the correlation coefficient between the number of active addresses and the price of bitcoin is 0.674372, indicating that there is a moderate positive correlation between them.
This means that the number of active addresses and the price of bitcoin change synchronously to some extent, with a certain linear relationship.

**Causality analysis:** The results of the Granger causality test show that, at lag order 1, the p-values of the F-test and of the ssr-based chi-square test are both very small (0.0000), supporting the hypothesis that the number of active bitcoin addresses has a statistically significant causal effect on the price. This means that changes in the number of active Bitcoin addresses can have an impact on the price.

**Conclusion:** In summary, according to the analysis results, it can be concluded that there is a certain positive correlation between the number of active addresses of bitcoin and the price, and the number of active addresses of bitcoin has a causal effect on the price.
More active addresses could mean more user engagement and transaction activity, which could affect the supply, demand and price of bitcoin.

## 4.3 Relational analysis and visualization of Bitcoin-number-of-address-with-balance-10k and Bitcoin price

In [None]:
selected_10k_price_columns = ['addr_with_balance_10k_num', 'price_usd_num']
k10_price_data = df[selected_10k_price_columns]

correlation_k10p = k10_price_data.corr()
print("addr_with_balance_10k_num vs Price Correlation Matrix:")
print(correlation_k10p)

# grangercausalitytests checks whether the SECOND column Granger-causes the
# FIRST one, so the price has to come first to test "10k addresses -> price".
# The previous column order tested the opposite direction; p-values quoted in
# the write-up below were produced with that order and need to be refreshed.
model_gct_k10p = sm.tsa.stattools.grangercausalitytests(
    k10_price_data[['price_usd_num', 'addr_with_balance_10k_num']], maxlag=1)

# each ssr_chi2test entry starts with (chi2 statistic, p-value, ...)
print("Granger Causality Test Results:")
print("H0: addr_with_balance_10k_num does NOT Granger-cause price_usd_num")
for lag, outcome in model_gct_k10p.items():
    chi2_stat, p_value = outcome[0]['ssr_chi2test'][:2]
    print("Lag:", lag)
    print("ssr chi2 statistic:", chi2_stat)
    print("p-value:", p_value)
    print("-------------------------------------------")
    
plt.scatter(k10_price_data['addr_with_balance_10k_num'], k10_price_data['price_usd_num'])
plt.xlabel('balance-10k')
plt.ylabel('Price ')
plt.title('bitcoin-number-of-addresses-with-balance-10k vs Price')
plt.show()

**Correlation analysis:** There is a negative correlation between the number of bitcoin address balances above USD 10,000 and the price of bitcoin, with a correlation coefficient of -0.754. This means that as the number of Bitcoin addresses with a balance of $10,000 or more increases, bitcoin prices tend to fall.

**Causality analysis:** The p-value of the Granger causality test was 0.0045, which is below the usual significance level of 0.05, so the result is statistically significant.
This suggests that the number of Bitcoin addresses with a balance above $10,000 can influence the price of bitcoin to some extent.

**Conclusion:** The number of addresses with a balance of more than $10,000 can predict the price of bitcoin to a certain extent, and affect the price changes of Bitcoin to a certain extent.

## 4.4 Relational analysis and visualization of bitcoin price and ethereum-price
### Visualization of bitcoin price and ethereum-price

In [None]:
# Bitcoin and Ethereum prices over time, drawn on one shared axis
fig, trend_ax = plt.subplots(figsize=(15, 6))
for column, series_label in (('price_usd_num', 'Bitcoin Price'),
                             ('ethereum_price_usd_num', 'Ethereum Price')):
    trend_ax.plot(df.index, df[column], label=series_label)
trend_ax.set_xlabel('Timestamp')
trend_ax.set_ylabel('Price')
trend_ax.legend()
trend_ax.set_title('Bitcoin and Ethereum Price Trend')
plt.xticks(rotation=45)
# one major tick every 120 x-axis units
trend_ax.xaxis.set_major_locator(ticker.MultipleLocator(120))
plt.show()

In [None]:
selected_price_columns = ['price_usd_num', 'ethereum_price_usd_num']
correlation_data = df[selected_price_columns]

correlation = correlation_data.corr()
print("Correlation Matrix:")
print(correlation)

# grangercausalitytests checks whether the SECOND column Granger-causes the
# FIRST one, i.e. here: does the Ethereum price Granger-cause the bitcoin price?
model_gct = sm.tsa.stattools.grangercausalitytests(correlation_data, maxlag=1)

# each ssr_chi2test entry starts with (chi2 statistic, p-value, ...); the old
# output printed these two numbers under "Null/Alternative Hypothesis" labels
print("Granger Causality Test Results:")
print("H0: ethereum_price_usd_num does NOT Granger-cause price_usd_num")
for lag, outcome in model_gct.items():
    chi2_stat, p_value = outcome[0]['ssr_chi2test'][:2]
    print("Lag:", lag)
    print("ssr chi2 statistic:", chi2_stat)
    print("p-value:", p_value)
    print("-------------------------------------------")

In [None]:
# Using Ethereum price as independent variable and bitcoin price as dependent
# variable, the co-integration between prices is examined.
# Cointegration analysis can help to determine whether two time series have a
# long-term stable relationship.
model_ols = sm.OLS(correlation_data['price_usd_num'], sm.add_constant(correlation_data['ethereum_price_usd_num'])).fit()
residuals = model_ols.resid

plt.plot(df['ethereum_price_usd_num'], residuals, 'o')
plt.axhline(0, color='r', linestyle='--')
plt.xlabel('Ethereum Price (USD)')
plt.ylabel('Residuals')
plt.title('Residuals Plot')
plt.show()

print("Cointegration Analysis:")
print(model_ols.summary())

# The OLS fit alone only measures how well one price explains the other. Two
# series are co-integrated only if the residuals of that regression are
# stationary, which is what the Engle-Granger test below checks.
coint_stat, coint_pvalue, coint_crit = sm.tsa.stattools.coint(
    correlation_data['price_usd_num'], correlation_data['ethereum_price_usd_num'])
print("Engle-Granger cointegration test (H0: no cointegration):")
print("test statistic:", coint_stat)
print("p-value:", coint_pvalue)
print("critical values (1%, 5%, 10%):", coint_crit)

**Correlation analysis:** Correlation coefficient matrix shows that the correlation coefficient between bitcoin price and Ethereum price is 0.937912, indicating that there is a highly positive correlation between them.
This means that there is a strong linear relationship between bitcoin prices and Ethereum prices, which move in very similar trends.

**Causality analysis:** The results of the Granger causality test show that, at lag order 1, the p-values of the F-test and of the ssr-based chi-square test are both very small (0.0000), supporting the hypothesis that the Ethereum price has a statistically significant causal effect on the bitcoin price. This means that changes in the price of Ethereum can have an impact on the price of bitcoin.

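**Co-integration analysis:** The OLS regression of the bitcoin price on the Ethereum price has an R-squared value of 0.880, i.e. the Ethereum price explains about 88% of the variance of the bitcoin price, which points to a strong linear relationship between the two. Note that a high R-squared on its own does not establish co-integration; that additionally requires the residuals of this regression to be stationary.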
The significance test results of regression coefficient show that the coefficient of Ethereum price is 13.9142, which is statistically significant. This means that Ethereum prices have some predictive power over changes in bitcoin prices.

It is concluded that there is a highly positive correlation between the price of bitcoin and the price of Ethereum, and the price of ethereum has a causal effect on the price of bitcoin. This means that fluctuations and price changes in the Ethereum market may have an impact on the Bitcoin market, and there is a certain degree of linkage between the two.

**Conclusion:** Therefore, when analyzing and predicting the price of bitcoin, it is necessary to consider the price of Ethereum as an important factor, and it may provide certain prediction indicators for the price of bitcoin.

# 5 Prediction Model Construction

In [None]:
# case 1: all features, each target predicted from the features of the same row
# (any data_option other than 'lookback' selects this branch)
all_cols = list(df.columns)
nolookback_df = bitcoin_models(all_cols, df, 'no')
# show the scores, mirroring the look-back cell below
nolookback_df

In [None]:
# Same columns as case 1, but with data_option='lookback': each target is
# predicted from a window of preceding rows instead of the same row.
lookback_df = bitcoin_models(all_cols, df, 'lookback')
lookback_df

In [None]:
# Column names of df (the same names that were passed to bitcoin_models as all_cols).
list(df.columns)