# Cryptocurrency Price Data Exploration

This notebook explores cryptocurrency price data and performs initial analysis to understand the dataset characteristics. This exploration is crucial for model development and feature engineering.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Plot appearance: load matplotlib's ggplot style sheet, then apply seaborn's
# whitegrid theme on top of it.
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

# DataFrame rendering: never truncate the column list, let pandas pick the
# output width, and print floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.float_format = '{:.2f}'.format

## 1. Load the Dataset

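We'll be using cryptocurrency data from Kaggle, which contains historical price information for major cryptocurrencies. Until the Kaggle CSV files are available in `../data/raw`, the cell below generates simulated price series as a stand-in, so every result in this notebook describes simulated data.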

In [None]:
# Define the data directory
DATA_DIR = '../data/raw'

# List available CSV files (sorted, because os.listdir returns entries in
# arbitrary order). The directory is optional: the rest of this notebook runs
# on simulated data, so a missing folder must not abort a fresh
# "Restart Kernel -> Run All".
if os.path.isdir(DATA_DIR):
    csv_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.csv'))
else:
    csv_files = []
    print(f"Data directory not found: {DATA_DIR!r} (continuing with simulated data)")
print(f"Available CSV files: {csv_files}")

# Load Bitcoin data as an example
# In a real implementation, this would load the actual Kaggle dataset
# For this notebook, we'll simulate the data

# Create sample data
def generate_sample_data(crypto_name, days=1000, start_price=40000, seed=None, end_date=None):
    """Simulate a daily price history for one asset as a multiplicative random walk.

    Args:
        crypto_name: Asset label. When ``seed`` is None it is mixed into the
            random seed, so different assets get different series while the
            same label always reproduces the same series.
        days: Number of days between the first and the last row; the result
            has ``days + 1`` rows.
        start_price: Price on the first day.
        seed: Optional integer seed that overrides the name-derived default.
        end_date: Optional last date (anything ``pd.Timestamp`` accepts).
            Defaults to today.

    Returns:
        DataFrame with columns Date, Price, Open, High, Low, Volume, one row
        per calendar day.
    """
    # A local generator keeps the simulation reproducible without resetting
    # NumPy's global random state for the rest of the notebook.
    if seed is None:
        entropy = [42, *str(crypto_name).encode('utf-8')]
    else:
        entropy = seed
    rng = np.random.default_rng(entropy)

    # Generate dates, snapped to midnight. Without this every call would carry
    # its own time-of-day, and frames from separate calls could never be
    # joined on 'Date'.
    last_day = pd.Timestamp.now() if end_date is None else pd.Timestamp(end_date)
    last_day = last_day.normalize()
    first_day = last_day - pd.Timedelta(days=days)
    dates = pd.date_range(start=first_day, end=last_day, freq='D')
    n_rows = len(dates)

    # Generate prices with a random walk: daily relative changes are drawn
    # from a normal distribution with mean 0 and standard deviation 2%.
    daily_changes = rng.normal(0.0, 0.02, size=max(n_rows - 1, 0))
    growth = np.concatenate(([1.0], np.cumprod(1.0 + daily_changes)))[:n_rows]
    prices = start_price * growth

    # Open sits at or slightly below the close; High/Low are derived from the
    # larger/smaller of open and close so that Low <= Open, Price <= High
    # holds on every row.
    opens = prices * rng.uniform(0.97, 1.0, size=n_rows)
    highs = np.maximum(opens, prices) * rng.uniform(1.0, 1.05, size=n_rows)
    lows = np.minimum(opens, prices) * rng.uniform(0.95, 1.0, size=n_rows)
    volumes = rng.uniform(1e9, 5e9, size=n_rows)

    # Create DataFrame
    df = pd.DataFrame({
        'Date': dates,
        'Price': prices,
        'Open': opens,
        'High': highs,
        'Low': lows,
        'Volume': volumes
    })

    return df

# Simulated frames for the three assets explored below; only the asset label
# and the starting price differ between the calls.
sample_specs = [('Bitcoin', 40000), ('Ethereum', 2000), ('Ripple', 0.5)]
btc_df, eth_df, xrp_df = [
    generate_sample_data(asset, start_price=first_price)
    for asset, first_price in sample_specs
]

# Preview: first rows of the Bitcoin frame, rendered as the cell output
btc_df.head()

## 2. Data Exploration and Visualization

Let's explore the dataset and visualize the historical prices.

In [None]:
# Check dataset info: info() writes the column names, dtypes and non-null
# counts straight to the cell's text output, right after the heading.
print("Bitcoin Dataset Info:")
btc_df.info()

# Descriptive statistics: describe() is left as the last expression so the
# notebook renders it as a table below the printed heading.
print("\nDescriptive Statistics:")
btc_df.describe()

In [None]:
# Count the missing entries in every column of the Bitcoin frame
print("Missing Values:")
missing_per_column = btc_df.isna().sum()
missing_per_column

In [None]:
# Bitcoin price over the whole period, drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(btc_df['Date'], btc_df['Price'], label='BTC Close Price')
ax.set(
    title='Bitcoin Price History',
    xlabel='Date',
    ylabel='Price (USD)',
)
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Day-over-day percentage change of the price. The column is stored on btc_df
# itself because the feature-engineering section reads it from there.
btc_df['Daily_Return'] = btc_df['Price'].pct_change().mul(100)

# The first row has no previous day (its return is NaN), so plotting starts
# from the second row.
returns_from_day_two = btc_df.iloc[1:]

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(returns_from_day_two['Date'], returns_from_day_two['Daily_Return'], color='blue')
ax.set_title('Bitcoin Daily Returns (%)')
ax.set_xlabel('Date')
ax.set_ylabel('Daily Return (%)')
ax.grid(True)
plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay of the non-missing daily returns
valid_returns = btc_df['Daily_Return'].dropna()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(valid_returns, kde=True, bins=50, ax=ax)
ax.set_title('Distribution of Bitcoin Daily Returns')
ax.set_xlabel('Daily Return (%)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Compare multiple cryptocurrencies
# Rebase every price series to 1.0 on its first day so that assets with very
# different price levels can share one axis. assign() returns a new frame, so
# the source frames stay untouched.
btc_normalized = btc_df.assign(Price=btc_df['Price'] / btc_df['Price'].iloc[0])
eth_normalized = eth_df.assign(Price=eth_df['Price'] / eth_df['Price'].iloc[0])
xrp_normalized = xrp_df.assign(Price=xrp_df['Price'] / xrp_df['Price'].iloc[0])

fig, ax = plt.subplots(figsize=(14, 7))
for rebased, asset_label in (
    (btc_normalized, 'Bitcoin'),
    (eth_normalized, 'Ethereum'),
    (xrp_normalized, 'Ripple'),
):
    ax.plot(rebased['Date'], rebased['Price'], label=asset_label)
ax.set_title('Normalized Price Comparison (Base = First Day)')
ax.set_xlabel('Date')
ax.set_ylabel('Normalized Price')
ax.grid(True)
ax.legend()
plt.show()

## 3. Correlation Analysis

Let's examine the correlation between different cryptocurrencies and between price and volume.

In [None]:
# Merge datasets on Date
# The inner joins below match keys exactly, so every frame's timestamps are
# first snapped to midnight. Frames whose dates carry different time-of-day
# parts (for example, when each one was generated relative to its own "now")
# would otherwise share no keys and the merge would come back empty.
def _daily_prices(frame, price_label):
    """Return the Date column (normalized to midnight) and Price renamed to ``price_label``."""
    return (
        frame[['Date', 'Price']]
        .rename(columns={'Price': price_label})
        .assign(Date=lambda d: pd.to_datetime(d['Date']).dt.normalize())
    )

btc_price = _daily_prices(btc_df, 'BTC_Price')
eth_price = _daily_prices(eth_df, 'ETH_Price')
xrp_price = _daily_prices(xrp_df, 'XRP_Price')

# Merge
merged_df = pd.merge(btc_price, eth_price, on='Date', how='inner')
merged_df = pd.merge(merged_df, xrp_price, on='Date', how='inner')

# An empty merge would silently yield an all-NaN correlation matrix.
assert not merged_df.empty, "No overlapping dates between the BTC, ETH and XRP frames"

# Correlation matrix
corr_matrix = merged_df.drop('Date', axis=1).corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Between Cryptocurrency Prices')
plt.show()

In [None]:
# 2x2 correlation matrix of Bitcoin price against traded volume
price_and_volume = btc_df.loc[:, ['Price', 'Volume']]
btc_price_volume_corr = price_and_volume.corr()

# Heatmap with the colour scale pinned to the full [-1, 1] correlation range
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(btc_price_volume_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Between Bitcoin Price and Volume')
plt.show()

In [None]:
# One point per day: volume on the x-axis, price on the y-axis.
# Semi-transparent markers keep overlapping points visible.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(btc_df['Volume'], btc_df['Price'], alpha=0.5)
ax.set(
    title='Bitcoin Price vs Volume',
    xlabel='Volume',
    ylabel='Price (USD)',
)
ax.grid(True)
plt.show()

## 4. Feature Engineering

Let's create some additional features that might be useful for our model.

In [None]:
# Derive all engineered columns in a single assign() call. assign() works on a
# copy of btc_df and evaluates the keyword arguments in order, so later
# entries may refer to columns created by earlier ones (e.g. MA_5).
btc_features = btc_df.assign(
    # Price momentum: percentage change over 5 and 20 days
    Price_5d_pct=lambda d: d['Price'].pct_change(periods=5) * 100,
    Price_20d_pct=lambda d: d['Price'].pct_change(periods=20) * 100,
    # Moving averages of the price
    MA_5=lambda d: d['Price'].rolling(window=5).mean(),
    MA_20=lambda d: d['Price'].rolling(window=20).mean(),
    MA_50=lambda d: d['Price'].rolling(window=50).mean(),
    # Volatility: rolling standard deviation of the daily returns
    Volatility_5d=lambda d: d['Daily_Return'].rolling(window=5).std(),
    Volatility_20d=lambda d: d['Daily_Return'].rolling(window=20).std(),
    # Price relative to its own moving averages
    Price_Rel_MA5=lambda d: d['Price'] / d['MA_5'],
    Price_Rel_MA20=lambda d: d['Price'] / d['MA_20'],
    # Volume features: 5-day change, 5-day average, and volume relative to it
    Volume_5d_pct=lambda d: d['Volume'].pct_change(periods=5) * 100,
    Volume_MA_5=lambda d: d['Volume'].rolling(window=5).mean(),
    Volume_Rel_MA5=lambda d: d['Volume'] / d['Volume_MA_5'],
)

# Preview the first rows for which every feature is available
btc_features.dropna().head()

In [None]:
# Plot some of the engineered features on a 2x2 grid. Each panel is described
# as (title, y-axis label, [(column, legend label), ...], reference line or
# None), and the loop draws them in row-major order.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
feature_dates = btc_features['Date']

panel_specs = [
    # Panel 1: price with moving averages
    ('Bitcoin Price with Moving Averages', 'Price (USD)',
     [('Price', 'Price'), ('MA_5', '5-day MA'),
      ('MA_20', '20-day MA'), ('MA_50', '50-day MA')],
     None),
    # Panel 2: volatility
    ('Bitcoin Price Volatility', 'Volatility (Std Dev of Returns)',
     [('Volatility_5d', '5-day Volatility'), ('Volatility_20d', '20-day Volatility')],
     None),
    # Panel 3: price momentum
    ('Bitcoin Price Momentum (%)', 'Price Change (%)',
     [('Price_5d_pct', '5-day Momentum'), ('Price_20d_pct', '20-day Momentum')],
     None),
    # Panel 4: price relative to moving averages, with a dashed line at ratio 1
    ('Bitcoin Price Relative to Moving Averages', 'Ratio',
     [('Price_Rel_MA5', 'Price / 5-day MA'), ('Price_Rel_MA20', 'Price / 20-day MA')],
     1),
]

for ax, (panel_title, y_label, series, reference_y) in zip(axes.flat, panel_specs):
    for column, legend_label in series:
        ax.plot(feature_dates, btc_features[column], label=legend_label)
    if reference_y is not None:
        ax.axhline(y=reference_y, color='r', linestyle='--')
    ax.set_title(panel_title)
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

## 5. Data Preparation for Modeling

Let's prepare the data for our prediction models by normalizing features and creating sequences.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Select features to use
features = ['Price', 'Open', 'High', 'Low', 'Volume', 
            'MA_5', 'MA_20', 'Volatility_5d', 
            'Price_Rel_MA5', 'Price_Rel_MA20', 'Volume_Rel_MA5']

# Filter the data to rows that have all features
btc_model_data = btc_features.dropna(subset=features).copy()

# Normalize the features using Min-Max scaling.
# The scaler is fitted on the earliest 70% of the rows only. Fitting on every
# row would let the minima/maxima of the later (validation/test) period leak
# into the training data. Rows are chronological, and the later sequence split
# assigns the first 70% of the sequences to training, so these rows all fall
# inside the training period. Values from later rows may therefore land
# outside [0, 1], which is expected.
SCALER_FIT_FRACTION = 0.7
n_fit_rows = max(1, int(SCALER_FIT_FRACTION * len(btc_model_data)))

scaler = MinMaxScaler()
scaler.fit(btc_model_data[features].iloc[:n_fit_rows])
btc_model_data[features] = scaler.transform(btc_model_data[features])

# Display the normalized data
btc_model_data[features].head()

In [None]:
# Create sequences for time series prediction
def create_sequences(data, seq_length, target_col=0):
    """Slice a 2-D feature matrix into overlapping windows and next-step targets.

    Sample ``i`` uses rows ``i .. i + seq_length - 1`` as input and the value
    of column ``target_col`` in row ``i + seq_length`` as target.

    Args:
        data: 2-D array-like of shape (rows, features), ordered in time.
        seq_length: Number of consecutive rows per input window (>= 1).
        target_col: Index of the column to predict. Defaults to 0, which is
            the Price column in this notebook's feature order.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        (rows - seq_length, seq_length, features) and ``y`` has shape
        (rows - seq_length,). Both are empty, but correctly shaped, when
        ``data`` has no more than ``seq_length`` rows.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be at least 1, got {seq_length}")

    # Accept lists / nested sequences as well as NumPy arrays.
    data = np.asarray(data)
    n_samples = len(data) - seq_length

    if n_samples <= 0:
        # Too little history for a single window: return empty arrays that
        # still carry the window and feature dimensions.
        empty_X = np.empty((0, seq_length, data.shape[1]), dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + seq_length] for start in range(n_samples)])
    # The target of window `start` is the row right after it, so the targets
    # are simply the target column from row `seq_length` onwards.
    y = data[seq_length:, target_col].copy()
    return X, y

# Window size: every sample looks back over the previous 60 rows (days)
sequence_length = 60

# Feature matrix as a plain NumPy array, columns in the order of `features`
data_array = btc_model_data[features].to_numpy()

# Build the input windows and their next-step targets
X, y = create_sequences(data_array, seq_length=sequence_length)

# Report the resulting array shapes
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Chronological 70% / 15% / remainder split into training, validation and
# test sets. The order of the samples is kept (no shuffling).
train_size = int(0.7 * len(X))
val_size = int(0.15 * len(X))
val_end = train_size + val_size

# np.split cuts at the two boundaries and returns the three consecutive parts
X_train, X_val, X_test = np.split(X, [train_size, val_end])
y_train, y_val, y_test = np.split(y, [train_size, val_end])

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

## 6. Conclusion

In this notebook, we explored the cryptocurrency price data, performed feature engineering, and prepared the data for our LGBM and neural network models. The key observations are:

1. **Price Trends**: We observed the historical price trends of Bitcoin, Ethereum, and Ripple.
2. **Volatility**: Cryptocurrency prices show high volatility, as evidenced by the daily returns distribution.
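3. **Correlations**: There are strong correlations between different cryptocurrencies, suggesting common market factors. Because the price series in this notebook are simulated, this observation must be re-validated on the real Kaggle data before it is relied on.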
4. **Feature Engineering**: We created several features like moving averages, momentum, and volatility metrics that can help our models capture market patterns.
5. **Data Preparation**: We normalized the data and created sequences for time series prediction using a 60-day window.

The prepared dataset will be used to train our LGBM and neural network models, as described in the research paper.