In [22]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import plotly.graph_objs as go
import plotly.io as pio


# Satge 01 : Data Understanding

In [23]:
df = pd.read_csv('../data/Bitcoin Historical Data.csv')
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,04/08/2024,71630.1,69358.0,72710.8,69110.5,105.78K,3.27%
1,04/07/2024,69360.4,68897.3,70285.8,68849.4,46.99K,0.68%
2,04/06/2024,68890.6,67830.5,69632.0,67467.2,41.48K,1.56%
3,04/05/2024,67830.6,68498.7,68692.2,66023.3,88.97K,-0.97%
4,04/04/2024,68496.5,65968.4,69238.8,65096.3,100.30K,3.84%


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      5000 non-null   object
 1   Price     5000 non-null   object
 2   Open      5000 non-null   object
 3   High      5000 non-null   object
 4   Low       5000 non-null   object
 5   Vol.      4994 non-null   object
 6   Change %  5000 non-null   object
dtypes: object(7)
memory usage: 273.6+ KB


In [25]:
df.describe()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
count,5000,5000.0,5000.0,5000.0,5000.0,4994,5000
unique,5000,4059.0,4051.0,4047.0,4056.0,4425,1554
top,08/01/2010,0.1,0.1,0.1,0.1,1.05M,0.00%
freq,1,86.0,87.0,83.0,87.0,6,415


In [26]:
df_candlestick = df[['Date', 'Open', 'High', 'Low', 'Price']]

# 7. Plot candlestick chart using Plotly
fig = go.Figure(data=[go.Candlestick(
    x=df_candlestick['Date'],
    open=df_candlestick['Open'],
    high=df_candlestick['High'],
    low=df_candlestick['Low'],
    close=df_candlestick['Price']
)])

fig.update_layout(
    title='Candlestick Chart',
    xaxis_title='Date',
    yaxis_title='Price',
    xaxis_rangeslider_visible=False
)

# Display the plot
pio.show(fig)

### Dataset in Reverse order , We have to sort it from earlist date to latest date

# Stage 02 : Data Pre-processing

### 1. Sort the data from earliest day to last day

In [27]:
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df = df.sort_values('Date').reset_index(drop=True)
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2010-08-01,0.1,0.1,0.1,0.1,2.60K,0.00%
1,2010-08-02,0.1,0.1,0.1,0.1,3.60K,0.00%
2,2010-08-03,0.1,0.1,0.1,0.1,9.82K,0.00%
3,2010-08-04,0.1,0.1,0.1,0.1,3.49K,0.00%
4,2010-08-05,0.1,0.1,0.1,0.1,5.03K,0.00%


### 2. Add data lower bound: Keep data from 2011 onwards

In [28]:
df = df[df['Date'] >= '2011-01-01']
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
153,2011-01-01,0.3,0.3,0.3,0.3,2.82K,0.00%
154,2011-01-02,0.3,0.3,0.3,0.3,5.35K,0.00%
155,2011-01-03,0.3,0.3,0.3,0.3,1.43K,0.00%
156,2011-01-04,0.3,0.3,0.3,0.3,1.88K,0.00%
157,2011-01-05,0.3,0.3,0.3,0.3,0.36K,0.00%


### 3. Remove %, K, and , from the values in all columns

In [29]:
df.replace({',': '', 'K': 'e3', 'M': 'e6', 'B': 'e9', '%': ''}, regex=True, inplace=True)
df['Vol.'] = df['Vol.'].astype(float)  # Convert to float for scaling
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
153,2011-01-01,0.3,0.3,0.3,0.3,2820.0,0.0
154,2011-01-02,0.3,0.3,0.3,0.3,5350.0,0.0
155,2011-01-03,0.3,0.3,0.3,0.3,1430.0,0.0
156,2011-01-04,0.3,0.3,0.3,0.3,1880.0,0.0
157,2011-01-05,0.3,0.3,0.3,0.3,360.0,0.0


###4.Convert columns to numeric

In [30]:
for col in ['Price', 'Open', 'High', 'Low', 'Change %']:
    df[col] = pd.to_numeric(df[col])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4847 entries, 153 to 4999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      4847 non-null   datetime64[ns]
 1   Price     4847 non-null   float64       
 2   Open      4847 non-null   float64       
 3   High      4847 non-null   float64       
 4   Low       4847 non-null   float64       
 5   Vol.      4841 non-null   float64       
 6   Change %  4847 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 302.9 KB



### 5. Remove any duplicate values if exisists


In [31]:
print( "shape before remove duplicates:", df.shape)
df.drop_duplicates(inplace=True)
print( "shape after remove duplicates:", df.shape)

shape before remove duplicates: (4847, 7)
shape after remove duplicates: (4847, 7)


### 6. Add lag 7 columns for each column, and rename them properly


In [32]:
for col in ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']:
    for lag in range(1, 8):
        df[f'{col}_lag{lag}'] = df[col].shift(lag)

# 6.1. Remove any null values
df.dropna(inplace=True)


df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,Price_lag1,Price_lag2,Price_lag3,...,Vol._lag5,Vol._lag6,Vol._lag7,Change %_lag1,Change %_lag2,Change %_lag3,Change %_lag4,Change %_lag5,Change %_lag6,Change %_lag7
160,2011-01-08,0.3,0.3,0.3,0.3,1630.0,0.0,0.3,0.3,0.3,...,1430.0,5350.0,2820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,2011-01-09,0.3,0.3,0.3,0.3,1690.0,0.0,0.3,0.3,0.3,...,1880.0,1430.0,5350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162,2011-01-10,0.3,0.3,0.3,0.3,10360.0,0.0,0.3,0.3,0.3,...,360.0,1880.0,1430.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163,2011-01-11,0.3,0.3,0.3,0.3,4860.0,0.0,0.3,0.3,0.3,...,3460.0,360.0,1880.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
164,2011-01-12,0.3,0.3,0.4,0.3,31360.0,0.0,0.3,0.3,0.3,...,42600.0,3460.0,360.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 8.Shifting the 'Price' column to create 'Tomorrow_Price'

In [33]:
df['Tomorrow_Price'] = df['Price'].shift(-1)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,Price_lag1,Price_lag2,Price_lag3,...,Vol._lag6,Vol._lag7,Change %_lag1,Change %_lag2,Change %_lag3,Change %_lag4,Change %_lag5,Change %_lag6,Change %_lag7,Tomorrow_Price
160,2011-01-08,0.3,0.3,0.3,0.3,1630.0,0.0,0.3,0.3,0.3,...,5350.0,2820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3
161,2011-01-09,0.3,0.3,0.3,0.3,1690.0,0.0,0.3,0.3,0.3,...,1430.0,5350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3
162,2011-01-10,0.3,0.3,0.3,0.3,10360.0,0.0,0.3,0.3,0.3,...,1880.0,1430.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3
163,2011-01-11,0.3,0.3,0.3,0.3,4860.0,0.0,0.3,0.3,0.3,...,360.0,1880.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3
164,2011-01-12,0.3,0.3,0.4,0.3,31360.0,0.0,0.3,0.3,0.3,...,3460.0,360.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3


# Exploratory Data analysis ( EDA )

In [34]:
## I will do it later

# Stage 4: Feature Engineering

In [35]:
# Split the dataset: last 365 rows for testing, rest for training
train_df = df.iloc[:-365]
test_df = df.iloc[-365:]

# Print the shapes of the resulting DataFrames
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)

Training set shape: (4461, 50)
Testing set shape: (365, 50)


# Stage 05 : Model building

In [36]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping


In [37]:
# Model building
X_train = train_df.drop(['Date', 'Tomorrow_Price'], axis=1).values
test_df_features = test_df.drop(['Date', 'Tomorrow_Price'], axis=1).values
y_train = train_df['Tomorrow_Price'].values
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))


In [38]:
# Scale down X_train and y_train separately
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train.reshape(-1, X_train.shape[2])).reshape(X_train.shape)
test_df_features_scaled = X_scaler.transform(test_df_features.reshape(-1, test_df_features.shape[1])).reshape(test_df_features.shape)
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()

In [None]:

# Build the LSTM model
model = Sequential()
model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(5000, return_sequences=True))
model.add(LSTM(500, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(250))
model.add(Dense(50))
model.add(Dense(8))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train_scaled, y_train_scaled, 
                    batch_size=8, 
                    epochs=200, 
                    validation_split=0.2, 
                    callbacks=[early_stopping])


Epoch 1/200
[1m168/446[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m27s[0m 99ms/step - loss: 0.0012

In [None]:
# Display training history
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')
plt.show()

In [None]:
predictions_scaled = model.predict(test_df_features_scaled.reshape(-1, 1, test_df_features_scaled.shape[1]))


In [None]:
# Inverse transform the predictions
predictions = y_scaler.inverse_transform(predictions_scaled)

# Plotting the test data and predictions
plt.figure(figsize=(12, 6))
plt.plot(test_df['Date'], test_df['Tomorrow_Price'], label='Actual Price', color='blue')
plt.plot(test_df['Date'], predictions, label='Predicted Price', color='red')
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Actual vs Predicted Prices')
plt.legend()
plt.show()